Commit ee848f06 authored by Dos Santos David's avatar Dos Santos David Committed by David Dos Santos

re-organize project

parent ab59b47a
#!/bin/bash
# Launcher: run main.py (one directory above this script), forwarding
# all command-line arguments.
# BUG FIX: quote "$0" and "$@" so paths and arguments containing
# whitespace are passed through intact.
BINPATH=$(dirname "$0")
python3 "$BINPATH/../main.py" "$@"
import queue import queue
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class Document: class Document:
def __init__(self, document_id): def __init__(self, document_id):
self._document_id = document_id self._document_id = document_id
self.abstract = ""
@property @property
def document_id(self): def document_id(self):
...@@ -49,36 +52,29 @@ class Document: ...@@ -49,36 +52,29 @@ class Document:
def references(self, value): def references(self, value):
self._references = value.strip() self._references = value.strip()
@property
def abstract(self):
return self._abstract
@abstract.setter
def abstract(self, value):
self._abstract = value.strip()
def __str__(self): def __str__(self):
s = "[id: {doc_id}] {title} (published at {published_at})".format( s = "id: {doc_id}\ntitle :{title}\npublished at {published_at}\n".format(
doc_id=self.document_id, doc_id=self.document_id,
title=self.title.upper(), title=self.title.upper(),
published_at=self.published_at published_at=self.published_at,
) )
if self.abstract != "":
s += "abstract :{abstract}\n".format(abstract=self.abstract)
return s return s
def tokenize(self): def tokenize(self, tokenizer=SimpleTokenizer):
""" """
Tokenize a document Tokenize a document
""" """
items = [ self._title, self._authors, self._references ] return tokenizer.get_tokens(self)
tokens = []
for item in items:
tokens += self._tokenize_element(item)
return tokens
@staticmethod
def _tokenize_element(item):
"""
Tokenize an element of the document
"""
# TODO: make it better
return item.replace("!", " ").replace(".", " ").split()
from gogole.document import Document
class CACMParser:
    """Parser for the CACM collection file format.

    The file is a sequence of documents. Each field starts with a
    two-character marker at the beginning of a line ('.I', '.T', ...)
    and its content runs until the next marker line.
    """

    # field marker -> attribute name set on the Document being built
    MARKERS = {
        '.I': 'document',
        '.T': 'title',
        '.B': 'published_at',
        '.A': 'authors',
        '.N': 'added_at',
        '.X': 'references',
        '.W': 'abstract'
    }

    # '.I <id>' opens a new document
    DOCUMENT_MARKER = '.I'

    def __init__(self, filename):
        # path of the CACM collection file to parse
        self.filename = filename

    def find_marker_in_line(self, line):
        """
        Return the marker of the line if it exists, or None.
        """
        return next((m for m in self.MARKERS if line.startswith(m)), None)

    def parse(self, limit=None):
        """Parse the collection file.

        :param limit: if not None, stop before any document whose id
                      is >= limit
        :return: dict mapping document id -> Document instance
        """
        documents = dict()

        with open(self.filename, "r") as f:
            current_document_id = None
            current_marker = None
            buffer = ""

            for line in f:
                # find the current marker
                marker = self.find_marker_in_line(line)

                if marker == self.DOCUMENT_MARKER:
                    # BUG FIX: flush the previous document's last buffered
                    # field; it used to leak into the next document's
                    # first field instead.
                    if current_marker is not None and current_document_id in documents:
                        setattr(
                            documents[current_document_id],
                            self.MARKERS[current_marker],
                            buffer)
                    buffer = ""
                    current_marker = None

                    # BUG FIX: slice to the end of the line
                    # (len(marker) + 1:), not a single character, so
                    # document ids >= 10 are parsed correctly.
                    current_document_id = int(line[len(marker) + 1:])
                    if limit is not None \
                            and current_document_id >= limit:
                        current_document_id = None
                        break
                    documents[current_document_id] = Document(current_document_id)
                elif marker is not None:
                    # new field marker: store the previous field
                    if current_marker is not None:
                        setattr(
                            documents[current_document_id],
                            self.MARKERS[current_marker],
                            buffer)
                    # reset the buffer
                    buffer = ""
                    current_marker = marker
                else:
                    # continuation line of the current field
                    buffer += line

            # BUG FIX: flush the final field of the last document at EOF;
            # it used to be silently dropped.
            if current_marker is not None and current_document_id in documents:
                setattr(
                    documents[current_document_id],
                    self.MARKERS[current_marker],
                    buffer)

        return documents
class AbstractTokenizer:
    """
    Abstract Tokenizer.

    Subclasses must override get_tokens to split a document into a
    list of token strings.
    """

    @staticmethod
    def get_tokens(document: 'Document'):
        """Return the list of tokens of *document*.

        :raises NotImplementedError: always; subclasses must override.
        """
        # BUG FIX: `Error` is undefined (would raise NameError); the
        # conventional exception for an abstract method is
        # NotImplementedError.
        raise NotImplementedError("Method get_tokens not implemented")
from itertools import chain
from gogole.tokenizer.abstract_tokenizer import AbstractTokenizer
class SimpleTokenizer(AbstractTokenizer):
    """
    Simple tokenizer using any space or punctuation sign as separator.
    """

    # characters treated as token boundaries
    SEPARATORS = [" ", ".", ",", "!", "?", ":", ";"]

    @staticmethod
    def get_tokens(document: 'Document'):
        """Split the document title on every separator, dropping empties."""
        tokens = [
            document.title,
        ]

        # successively re-split every token on each separator
        for sep in SimpleTokenizer.SEPARATORS:
            # BUG FIX: `self` does not exist inside a @staticmethod
            # (use the class name), and the string method is split(),
            # not sep() — the original always raised at runtime.
            tokens = list(chain(*[t.split(sep) for t in tokens]))

        # discard the empty strings produced by adjacent separators
        tokens = [t for t in tokens if t != ""]

        return tokens
import argparse import argparse
from document import Document from gogole.parser.cacm_parser import CACMParser
# Field marker -> Document attribute name for the CACM file format.
# NOTE(review): unlike CACMParser.MARKERS this table has no '.W'
# (abstract) entry — confirm whether abstracts should be parsed here.
MARKERS = {
    '.I': 'document',
    '.T': 'title',
    '.B': 'published_at',
    '.A': 'authors',
    '.N': 'added_at',
    '.X': 'references',
}
# '.I <id>' opens a new document
DOCUMENT_MARKER = '.I'
def extract_documents_from_file(file, limit=100):
    """Parse an open CACM collection file into Document objects.

    :param file: iterable of lines (e.g. an open file object)
    :param limit: stop once a document id greater than this is seen
                  (generalizes the previously hard-coded 100, which is
                  kept as the default for backward compatibility)
    :return: dict mapping document id -> Document instance
    """
    documents = dict()
    current_marker = None
    current_document_id = None
    buffer = ""
    for line in file:
        ms = [m for m in MARKERS.keys() if line.startswith(m)]
        if len(ms) > 0:
            assert len(ms) == 1
            marker = ms[0]
            if marker == DOCUMENT_MARKER:
                # BUG FIX: flush the previous document's last buffered
                # field; it used to leak into the next document instead.
                if current_marker is not None and current_document_id in documents:
                    setattr(documents[current_document_id], MARKERS[current_marker], buffer)
                buffer = ""
                current_marker = None

                # new document
                current_document_id = int(line[len(marker)+1:])
                if current_document_id > limit:
                    current_document_id = None
                    break
                documents[current_document_id] = Document(current_document_id)
            else:
                if current_marker is not None:
                    setattr(documents[current_document_id], MARKERS[current_marker], buffer)
                buffer = ""
                current_marker = marker
        else:
            # continuation line of the current field
            buffer += line

    # BUG FIX: flush the final field at end of input; it used to be
    # silently dropped.
    if current_marker is not None and current_document_id in documents:
        setattr(documents[current_document_id], MARKERS[current_marker], buffer)

    return documents
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
...@@ -51,9 +8,11 @@ def main(): ...@@ -51,9 +8,11 @@ def main():
args = parser.parse_args() args = parser.parse_args()
documents = dict() parser = CACMParser(args.file)
with open(args.file, 'r') as f: documents = parser.parse(limit=10)
documents = extract_documents_from_file(f)
print("\n\n".join(str(d) for d in documents.values()))
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment