Commit 71037300 authored by Prot Alexandre

adding stanford parser

parent 478a4bcc
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law
from gogole.config import COLLECTIONS
# Sub-commands known to the analyze command. When 'all' is among the requested
# sub-commands, run_analyze replaces the requested list with this whole list.
ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']
def load_documents(parser_cls=None, limit=None):
    """Parse a collection and return whatever the parser's ``parse_all`` gives.

    Args:
        parser_cls: Parser class to instantiate (called with no arguments).
            When omitted, ``CACMParser`` is used, so callers written against
            the earlier zero-argument form keep working.
        limit: Forwarded unchanged to ``parse_all`` as its ``limit`` keyword.

    Returns:
        The result of ``parser_cls().parse_all(limit=limit)``.
    """
    if parser_cls is None:
        # Resolved lazily so the default is only looked up when actually needed.
        parser_cls = CACMParser

    parser = parser_cls()
    return parser.parse_all(limit=limit)
def run_analyze(args):
parser_cls = COLLECTIONS[args.collection]
commands = args.sub_command
if 'all' in commands:
commands = ANALYZE_COMMANDS
documents = load_documents()
documents = load_documents(parser_cls)
tokenizer = SimpleTokenizer(args.stop_words_file)
......
from gogole.document import StanfordDocument
from os import walk
from os import listdir
class StanfordParser:
    """Parser for the Stanford collection stored under ``DIRECTORY``.

    The collection is read from ten sub-directories named ``0`` to ``9``;
    every file found in one of them becomes one ``StanfordDocument``.
    """

    # Root folder of the collection; a relative path, so it is resolved
    # against the current working directory.
    DIRECTORY = "data/stanford"

    def find_documents(self, limit=None):
        """Lazily yield one ``StanfordDocument`` per file of the collection.

        The document id is the sub-directory index concatenated with the file
        name, and the whole file content is stored on the document's
        ``keywords`` attribute.

        Args:
            limit: Maximum number of documents to yield; ``None`` means no cap.

        Yields:
            StanfordDocument: one document per file, until the collection is
            exhausted or ``limit`` documents have been produced.
        """
        counter = 0  # count documents found

        for collection_index in range(10):
            collection_dir = self.DIRECTORY + "/" + str(collection_index)

            # listdir() order is arbitrary; sorting makes the subset selected
            # by ``limit`` reproducible from one run to the next.
            for filename in sorted(listdir(collection_dir)):
                # Checked per file so that ``limit`` is honoured inside a
                # sub-directory, not only between sub-directories.
                if limit is not None and counter >= limit:
                    return

                with open(collection_dir + "/" + filename, 'r') as f:
                    content = f.read()

                current_document_id = str(collection_index) + filename
                current_document = StanfordDocument(current_document_id)
                current_document.keywords = content
                counter += 1

                yield current_document

    def parse_all(self, limit=None):
        """Return the documents of ``find_documents`` keyed by document id.

        Args:
            limit: Forwarded to ``find_documents``.

        Returns:
            dict: maps each document's ``document_id`` attribute to the
            document itself (assumes ``StanfordDocument`` exposes
            ``document_id`` -- TODO confirm against gogole.document).
        """
        return {doc.document_id: doc for doc in self.find_documents(limit)}
import argparse
from gogole.commands import analyze_command, build_command
from gogole.config import COLLECTIONS
def main():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment