Commit c166296a authored by Dos Santos David

load data from cache

parent c9a0c858
@@ -5,18 +5,28 @@ def run_index_command(collection, args):
     indexer = collection.indexer
 
     if args.index_command == 'build':
-        print('... loading the documents...')
-        count_documents = 0
-        for doc_id, document in enumerate(parser.find_documents(limit=None)):
-            count_documents += 1
-            tokens = tokenizer.get_tokens(document)
-            indexer.add_document_tokens(doc_id, tokens)
-        print('... {} documents loaded'.format(count_documents))
-        indexer.build_index()
+        # --no-cache skips the cache and forces a full rebuild
+        build_index = True
+
+        if not args.build_no_cache:
+            # try to load index from the disk
+            if indexer.load_from_cache():
+                print('cache loaded from disk')
+                build_index = False
+
+        if build_index:
+            print('... loading the documents...')
+            count_documents = 0
+            for doc_id, document in enumerate(parser.find_documents(limit=None)):
+                count_documents += 1
+                tokens = tokenizer.get_tokens(document)
+                indexer.add_document_tokens(doc_id, tokens)
+            print('... {} documents loaded'.format(count_documents))
+            indexer.build_index()
 
     if args.index_command == 'lookup':
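For reference, a self-contained sketch of the control flow introduced above; FakeIndexer and run_build are illustrative stand-ins for this sketch, not code from the repository:

# minimal sketch, assuming an indexer that exposes load_from_cache() and build_index()
class FakeIndexer:
    def __init__(self, has_cache):
        self.has_cache = has_cache

    def load_from_cache(self):
        # report whether a cached tokens map exists on disk
        return self.has_cache

    def build_index(self):
        print('index rebuilt')

def run_build(indexer, no_cache=False):
    # --no-cache forces a rebuild; otherwise a cache hit skips it
    if not no_cache and indexer.load_from_cache():
        print('cache loaded from disk')
        return
    print('... loading the documents...')
    indexer.build_index()

run_build(FakeIndexer(has_cache=True))                  # -> cache loaded from disk
run_build(FakeIndexer(has_cache=True), no_cache=True)   # -> rebuilds despite the cache
run_build(FakeIndexer(has_cache=False))                 # -> no cache yet, rebuilds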
@@ -3,19 +3,24 @@ import itertools
 import heapq
 import os
 import math
+import pickle
 
 from gogole.utils import timeit
 
 class BSBIIndexer:
     BLOCK_SIZE = 8
 
+    INDEX_FILE = '.cache/index'
+    TOKENS_MAP_FILE = '.cache/tokens_map'
+    DOCUMENTS_MAP_FILE = '.cache/documents_map'
+
     def __init__(self, maxsize=None):
         """
         :param maxsize: max size of the buffer (in bytes)
         """
         self.tokens_map = dict()
-        self.token_id_seq = itertools.count() # next token id
+        self.init_token_id_seq() # next token id
 
         self.maxsize = maxsize
         self.buffer = []
@@ -24,6 +29,9 @@ class BSBIIndexer:
         self.tmp_filenames = []
         self.tmp_file_id_seq = itertools.count()
 
+    def init_token_id_seq(self, start=0):
+        self.token_id_seq = itertools.count(start=start)
+
     def find_or_create_token_id(self, token):
         if token not in self.tokens_map:
             token_id = next(self.token_id_seq)
@@ -84,7 +92,7 @@ class BSBIIndexer:
         try:
             merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
 
-            with open('.cache/index', 'wb') as f:
+            with open(self.INDEX_FILE, 'wb') as f:
                 for t in merged_tuples_iterator:
                     # TODO: maybe write by block ?
                     f.write(t)
@@ -92,6 +100,8 @@ class BSBIIndexer:
             for fp in tmp_files:
                 fp.close()
 
+        self.save_to_disk()
+
     def _read_token_id(self, file, pos):
         file.seek(pos*self.BLOCK_SIZE, 0)
         token_id = struct.unpack('i', file.read(4))[0]
@@ -111,7 +121,7 @@ class BSBIIndexer:
         token_id = self.tokens_map[token]
 
-        with open('.cache/index', 'rb') as f:
+        with open(self.INDEX_FILE, 'rb') as f:
             upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
             lower_bound = 0
@@ -144,3 +154,19 @@ class BSBIIndexer:
                     lower_bound = mid+1
                 else:
                     upper_bound = mid-1
+
+    def save_to_disk(self):
+        with open(self.TOKENS_MAP_FILE, 'wb') as f:
+            pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
+
+    def load_from_cache(self):
+        try:
+            with open(self.TOKENS_MAP_FILE, 'rb') as f:
+                self.tokens_map = pickle.load(f)
+        except FileNotFoundError:
+            # no cached tokens map on disk yet
+            return False
+
+        if self.tokens_map:
+            # resume the token id sequence after the highest cached id
+            self.init_token_id_seq(max(self.tokens_map.values()) + 1)
+
+        return True
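A quick round-trip sketch of the caching scheme these two methods implement; the map contents and cache path below are illustrative, not taken from the repository:

import itertools
import os
import pickle
import tempfile

tokens_map = {'hello': 0, 'world': 1}        # token -> token id, as built during indexing

# save: pickle the whole map to a cache file
cache_path = os.path.join(tempfile.gettempdir(), 'tokens_map')
with open(cache_path, 'wb') as f:
    pickle.dump(tokens_map, f, pickle.HIGHEST_PROTOCOL)

# load: unpickle it and resume the id sequence after the largest cached id
with open(cache_path, 'rb') as f:
    restored = pickle.load(f)
token_id_seq = itertools.count(start=max(restored.values()) + 1)

assert restored == tokens_map
assert next(token_id_seq) == 2               # the next new token gets a fresh id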
@@ -9,3 +9,6 @@ class Indexer:
 
     def token_lookup(self, token):
         raise Exception('token_lookup not implemented')
+
+    def load_from_cache(self):
+        raise Exception('load_from_cache not implemented')
@@ -16,3 +16,6 @@ class SimpleIndexer(Indexer):
 
     def token_lookup(self, token):
         return self.tokens_document_map.get(token, [])
+
+    def load_from_cache(self):
+        return False
@@ -48,6 +48,11 @@ def build_cli_index_parser(root_parser):
     lookup_parser.add_argument('token', nargs=1)
 
     build_parser = index_subparser.add_parser('build', help="build the index")
+    build_parser.add_argument('--no-cache',
+            help='do not use the cache when building the index',
+            action='store_true',
+            dest='build_no_cache'
+    )
 
 def build_cli_parser():
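As a sanity check on the new flag, a minimal argparse sketch showing how --no-cache lands in args.build_no_cache via dest; the parser layout here is simplified and not the project's actual CLI wiring:

import argparse

parser = argparse.ArgumentParser(prog='index')
subparsers = parser.add_subparsers(dest='index_command')

build_parser = subparsers.add_parser('build', help='build the index')
build_parser.add_argument('--no-cache',
                          help='do not use the cache when building the index',
                          action='store_true',
                          dest='build_no_cache')

print(parser.parse_args(['build']).build_no_cache)                # False -> try the cache first
print(parser.parse_args(['build', '--no-cache']).build_no_cache)  # True  -> force a rebuild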