Commit 505aab8d authored by Dos Santos David's avatar Dos Santos David

timeit load documents

parent 0731c491
from gogole.utils import timeit
class Collection:
    """Describe a standard collection.

    A collection wires together the three components of the pipeline:
    a parser (produces documents), a tokenizer (turns a document into
    counted tokens) and an indexer (stores the tokens per document).
    """

    def __init__(self):
        # Components are injected after construction by the caller;
        # they stay None until then.
        self._parser = None  # type: gogole.parser.Parser
        self._indexer = None  # type: gogole.indexer.Indexer
        self._tokenizer = None  # type: gogole.tokenizer.Tokenizer

    @property
    def parser(self):
        # type: () -> 'gogole.parser.Parser'
        """Parser in charge of producing the documents of the collection."""
        return self._parser

    @property
    def indexer(self):
        # type: () -> 'gogole.indexer.Indexer'
        """Indexer that stores the tokens of each document."""
        return self._indexer

    @property
    def tokenizer(self):
        # type: () -> 'gogole.tokenizer.Tokenizer'
        """Tokenizer that converts a document into counted tokens."""
        return self._tokenizer

    @timeit
    def load_documents_in_indexer(self, limit=None):
        """Feed every parsed document into the indexer.

        :param limit: optional maximum number of documents to load
                      (forwarded to the parser); None means no limit.
        :returns: number of documents loaded. Because of the ``@timeit``
                  decorator, callers actually receive the tuple
                  ``(nb_documents, elapsed_time_ms)``.
        """
        nb_documents = 0
        for document in self.parser.find_documents(limit=limit):
            nb_documents += 1
            counted_tokens = self.tokenizer.get_counted_tokens(document)
            self.indexer.add_document_tokens(document, counted_tokens)
        return nb_documents
......@@ -9,22 +9,21 @@ def run(collection, args):
if not args.build_no_cache:
# try to load index from the disk
if indexer.load_from_cache():
print('cache loaded from disk')
ok, t = indexer.load_from_cache()
if ok:
print('cache loaded from disk in {elapsed_time:.2f} ms'.format(elapsed_time=t))
build_index = True
if build_index:
# load the documents in the indexer
print('... loading the documents...')
nb_documents, elapsed_time = collection.load_documents_in_indexer()
count_documents = 0
for document in parser.find_documents(limit=None):
count_documents += 1
counted_tokens = tokenizer.get_counted_tokens(document)
indexer.add_document_tokens(document, counted_tokens)
print('... Loaded {nb_documents} documents in {elapsed_time:.2f} ms'.format(nb_documents=nb_documents, elapsed_time=elapsed_time))
print('... {} documents loaded'.format(count_documents))
print('... creating the index')
_,t = indexer.build_index()
print('... index created in {elapsed_time:.2f} ms'.format(elapsed_time=t))
......@@ -280,6 +280,7 @@ class BSBIIndexer(Indexer):
with open(self.TOKENS_MAP_FILE, 'wb') as f:
pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
def load_from_cache(self):
with open(self.TOKENS_MAP_FILE, 'rb') as f:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment