Commit 71037300 authored by Prot Alexandre's avatar Prot Alexandre

adding stanford parser

parent 478a4bcc
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law from gogole.utils import heap_law
from gogole.config import COLLECTIONS
ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law'] ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']
def load_documents(parser_cls=CACMParser, limit=None): def load_documents(parser_cls, limit=None):
parser = parser_cls() parser = parser_cls()
return parser.parse_all(limit=limit) return parser.parse_all(limit=limit)
def run_analyze(args): def run_analyze(args):
parser_cls = COLLECTIONS[args.collection]
commands = args.sub_command commands = args.sub_command
if 'all' in commands: if 'all' in commands:
commands = ANALYZE_COMMANDS commands = ANALYZE_COMMANDS
documents = load_documents() documents = load_documents(parser_cls)
tokenizer = SimpleTokenizer(args.stop_words_file) tokenizer = SimpleTokenizer(args.stop_words_file)
......
from gogole.document import StanfordDocument from gogole.document import StanfordDocument
from os import walk from os import listdir
class StanfordParser: class StanfordParser:
DIRECTORY = "data/standford" DIRECTORY = "data/stanford"
def find_documents(self, limit=None): def find_documents(self, limit=None):
counter = 0 # count documents found counter = 0 # count documents found
for collection_index in range(10): for collection_index in range(10):
collection_dir = DIRECTORY + "/" + str(collection_index) collection_dir = self.DIRECTORY + "/" + str(collection_index)
for (dirpath, dirnames, filenames) in walk(collection_dir): for filename in listdir(collection_dir):
f.extend(filenames) with open(collection_dir + "/" + filename, 'r') as f:
print(filenames) current_document_id = str(collection_index) + filename
""" current_document = StanfordDocument(current_document_id)
with open(self.FILENAME, 'r') as f:
current_document_id = None
current_document = None
current_marker = None
buffer = ""
for line in f:
marker = self.find_marker_in_line(line)
if marker == self.DOCUMENT_MARKER:
if current_document is not None:
yield current_document
current_document_id = int(line[3:])
if limit is not None and counter >= limit:
return
current_document = CACMDocument(current_document_id)
counter += 1 counter += 1
content = ""
elif marker is not None: for line in f:
content += line
if current_marker is not None and current_marker in self.MARKERS.keys():
setattr(
current_document,
self.MARKERS[current_marker],
buffer)
buffer = "" current_document.keywords = content
current_marker = marker yield current_document
else:
buffer += line
yield current_document
"""
"""
def parse_all(self, limit=None): def parse_all(self, limit=None):
return {doc.document_id: doc for doc in self.find_documents(limit)} return {doc.document_id: doc for doc in self.find_documents(limit)}
"""
import argparse import argparse
from gogole.commands import analyze_command, build_command from gogole.commands import analyze_command, build_command
from gogole.config import COLLECTIONS
def main(): def main():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment