stanford_parser.py 1.08 KB
Newer Older
1
from gogole.document import StanfordDocument
Prot Alexandre's avatar
Prot Alexandre committed
2
from os import listdir
Dos Santos David's avatar
Dos Santos David committed
3
import itertools
4 5 6

class StanfordParser:
    """Load documents from the numbered sub-directories of ``DIRECTORY``.

    Files are expected under ``DIRECTORY/0`` ... ``DIRECTORY/<COLLECTION_COUNT - 1>``.
    Each file becomes one ``StanfordDocument`` whose ``keywords`` attribute
    holds the raw text of the file.
    """

    # Root directory that contains the numbered collection sub-directories.
    DIRECTORY = "data/stanford"

    # Number of collection sub-directories to scan (named "0", "1", ...).
    COLLECTION_COUNT = 10

    def find_documents(self, limit=None):
        """Lazily yield one document per file found in the collections.

        Document ids are assigned sequentially starting at 1, in the order
        the files are visited: collections in numeric order, files in
        sorted filename order within each collection.

        Args:
            limit: Maximum number of documents to yield. ``None`` (the
                default) means no limit.

        Yields:
            ``StanfordDocument`` instances with ``keywords`` set to the
            full content of the corresponding file.

        Raises:
            OSError: If a collection directory or a file cannot be read.
        """
        found = 0  # number of documents yielded so far

        for collection_index in range(self.COLLECTION_COUNT):
            collection_dir = self.DIRECTORY + "/" + str(collection_index)

            # listdir() returns entries in arbitrary order; sorting keeps
            # the document ids reproducible from one run to the next.
            for filename in sorted(listdir(collection_dir)):
                # Compare the plain integer count with the limit (an
                # itertools.count object cannot be compared to an int).
                if limit is not None and found >= limit:
                    return

                with open(collection_dir + "/" + filename, 'r') as f:
                    content = f.read()

                # The file is closed before yielding, so no handle stays
                # open while the generator is suspended.
                found += 1
                current_document = StanfordDocument(found)
                current_document.keywords = content
                yield current_document

    def parse_all(self, limit=None):
        """Return a dict mapping ``document_id`` to document.

        Args:
            limit: Maximum number of documents to load; ``None`` loads all.
        """
        return {doc.document_id: doc for doc in self.find_documents(limit)}