Commit 478a4bcc authored by Prot Alexandre

adding stanford campus - wip

parent 848b94a0
.gitignore
@@ -6,6 +6,12 @@ __pycache__/
 # C extensions
 *.so
 
+# Data files
+data/
+
+# VSCode
+.vscode
+
 # Distribution / packaging
 .Python
 build/
...
gogole/commands/analyze_command.py
@@ -7,8 +7,8 @@ from gogole.utils import heap_law
 ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']
 
-def load_documents(collection_file, parser_cls=CACMParser, limit=None):
-    parser = parser_cls(collection_file)
+def load_documents(parser_cls=CACMParser, limit=None):
+    parser = parser_cls()
     return parser.parse_all(limit=limit)
@@ -18,7 +18,7 @@ def run_analyze(args):
     if 'all' in commands:
         commands = ANALYZE_COMMANDS
 
-    documents = load_documents(args.file)
+    documents = load_documents()
     tokenizer = SimpleTokenizer(args.stop_words_file)
...
gogole/commands/build_command.py
@@ -6,7 +6,7 @@ from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
 #
 def run_build(args):
-    parser = CACMParser(args.file)
+    parser = CACMParser()
     tokenizer = SimpleTokenizer(args.stop_words_file)
     indexer = BSBIIndexer(maxsize=128)
...
gogole/config.py
+from gogole.parser import CACMParser, StanfordParser
+
+COLLECTIONS = {"cacm": CACMParser, "stanford": StanfordParser}
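Note: COLLECTIONS is not yet wired into load_documents() above, which still defaults to CACMParser. A minimal sketch of the intended lookup; the get_parser helper is an assumption, not part of this commit:

    from gogole.config import COLLECTIONS
    from gogole.parser import CACMParser

    def get_parser(collection_name):
        # hypothetical helper: map the -c/--collection value to a parser
        # class, falling back to CACM when the name is unknown
        parser_cls = COLLECTIONS.get(collection_name, CACMParser)
        return parser_cls()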
gogole/document/__init__.py
+from gogole.document.abstract_document import AbstractDocument
+from gogole.document.cacm_document import CACMDocument
+from gogole.document.stanford_document import StanfordDocument
gogole/document/abstract_document.py
+class AbstractDocument:
+    """
+    Abstract Document
+    """
+
+    def get_raw_content(self):
+        raise NotImplementedError("Method get_raw_content not implemented. Please reach out to David Dos Santos @ raton.laveur@cloporte.fr")
gogole/document/cacm_document.py
-import queue
 from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
 
-class Document:
+class CACMDocument:
     def __init__(self, document_id):
         self._document_id = document_id
@@ -45,6 +43,11 @@ class Document:
             abstract="abstract : {}\n".format(self.abstract) if self.abstract != "" else ""
         )
 
+    def get_raw_content(self):
+        return " ".join([self.title, self.abstract, self.keywords])
+
     def tokenize(self, tokenizer=SimpleTokenizer()):
         """
         Tokenize a document
...
gogole/document/stanford_document.py
+from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
+
+
+class StanfordDocument:
+    def __init__(self, document_id):
+        self._document_id = document_id
+
+    @property
+    def document_id(self):
+        return self._document_id
+
+    @property
+    def url(self) -> str:
+        return self._url
+
+    @url.setter
+    def url(self, value: str):
+        self._url = value.strip()
+
+    @property
+    def keywords(self):
+        return self._keywords
+
+    @keywords.setter
+    def keywords(self, value):
+        self._keywords = value.strip()
+
+    def __str__(self):
+        return "[ID #{doc_id}] {url}\n{keywords}".format(
+            doc_id=self.document_id,
+            url=self.url,
+            keywords="keywords: {}\n".format(self.keywords) if self.keywords != "" else ""
+        )
+
+    def get_raw_content(self):
+        return self.keywords
+
+    def tokenize(self, tokenizer=SimpleTokenizer()):
+        """
+        Tokenize a document
+        """
+        return tokenizer.get_tokens(self)
gogole/parser/__init__.py
+from gogole.parser.cacm_parser import CACMParser
+from gogole.parser.stanford_parser import StanfordParser
gogole/parser/cacm_parser.py
-from gogole.document import Document
+from gogole.document import CACMDocument
 
 class CACMParser:
+    FILENAME = "data/cacm.all"
+
     MARKERS = {
         '.I': 'document',
         '.T': 'title',
@@ -14,9 +16,6 @@ class CACMParser:
     DOCUMENT_MARKER = '.I'
 
-    def __init__(self, filename):
-        self.filename = filename
-
     def find_marker_in_line(self, line):
         """
         Return the marker of the line if it exists, or None.
@@ -27,7 +26,7 @@ class CACMParser:
     def find_documents(self, limit=None):
         counter = 0  # count documents found
 
-        with open(self.filename, 'r') as f:
+        with open(self.FILENAME, 'r') as f:
             current_document_id = None
             current_document = None
@@ -48,7 +47,7 @@ class CACMParser:
                 if limit is not None and counter >= limit:
                     return
 
-                current_document = Document(current_document_id)
+                current_document = CACMDocument(current_document_id)
                 counter += 1
             elif marker is not None:
...
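With the constructor removed, parsing the CACM collection needs no arguments; a quick usage sketch (the limit value is illustrative):

    from gogole.parser import CACMParser

    # parse the first 100 documents of data/cacm.all into {document_id: document}
    documents = CACMParser().parse_all(limit=100)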
gogole/parser/stanford_parser.py
+from gogole.document import StanfordDocument
+from os import walk
+
+
+class StanfordParser:
+    DIRECTORY = "data/stanford"
+
+    def find_documents(self, limit=None):
+        counter = 0  # count documents found
+        files = []
+
+        # WIP: only lists the collection files for now; document
+        # construction is still commented out below
+        for collection_index in range(10):
+            collection_dir = self.DIRECTORY + "/" + str(collection_index)
+            for (dirpath, dirnames, filenames) in walk(collection_dir):
+                files.extend(filenames)
+                print(filenames)
+
+        """
+        with open(self.FILENAME, 'r') as f:
+            current_document_id = None
+            current_document = None
+            current_marker = None
+
+            buffer = ""
+
+            for line in f:
+                marker = self.find_marker_in_line(line)
+
+                if marker == self.DOCUMENT_MARKER:
+                    if current_document is not None:
+                        yield current_document
+
+                    current_document_id = int(line[3:])
+
+                    if limit is not None and counter >= limit:
+                        return
+
+                    current_document = CACMDocument(current_document_id)
+                    counter += 1
+                elif marker is not None:
+                    if current_marker is not None and current_marker in self.MARKERS.keys():
+                        setattr(
+                            current_document,
+                            self.MARKERS[current_marker],
+                            buffer)
+                        buffer = ""
+                    current_marker = marker
+                else:
+                    buffer += line
+
+            yield current_document
+        """
+
+    """
+    def parse_all(self, limit=None):
+        return {doc.document_id: doc for doc in self.find_documents(limit)}
+    """
gogole/tokenizer/simple_tokenizer.py
@@ -22,11 +22,7 @@ class SimpleTokenizer(AbstractTokenizer):
     def get_tokens(self, document: 'gogole.document.Document'):
-        tokens = [
-            document.title,
-            document.abstract,
-            document.keywords,
-        ]
+        tokens = [document.get_raw_content()]
 
         #
         # 1/ split by any separator
...
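Delegating raw content to the document is what lets one tokenizer serve both collections; a short sketch of the now-uniform call path (the stop-words path is an assumption):

    from gogole.parser import CACMParser
    from gogole.tokenizer.simple_tokenizer import SimpleTokenizer

    tokenizer = SimpleTokenizer("data/stop_words.txt")  # path is an assumption

    # every document now exposes get_raw_content(), so tokenization no
    # longer depends on the concrete document class
    for document in CACMParser().find_documents(limit=10):
        tokens = tokenizer.get_tokens(document)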
 import argparse
 
 from gogole.commands import analyze_command, build_command
+from gogole.config import COLLECTIONS
 
 def main():
@@ -9,7 +9,10 @@ def main():
     # top-level parser
 
     # collection information
-    argsparser.add_argument("-f", "--file", help="input collection file")
+    argsparser.add_argument(
+        "-c", "--collection",
+        help="collection to use")
 
     argsparser.add_argument("--stop-words-file", help="stop words list filename")
     subparsers = argsparser.add_subparsers()
...
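The --file flag is gone in favor of --collection; an invocation would now look like this (the module entry point and subcommand names are assumptions based on the imported commands):

    # hypothetical invocations, assuming the package runs as a module
    python -m gogole --collection cacm analyze all
    python -m gogole -c stanford build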