Commit 383d7664 authored by Dos Santos David

store document norms

parent afa3163c
@@ -26,14 +26,17 @@ class BSBIIndexer(Indexer):
         self.tmp_filenames = []
         self.tmp_file_id_seq = itertools.count()
-        self.count_documents = 0
+        self.document_norms = dict()

         self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
         self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
         self.DOCUMENTS_MAP_FILE = '.cache/{}_documents_map'.format(collection_name)
+        self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)

         self.status = self.INDEX_STATUS_NOT_CREATED

+    def get_collection_size(self):
+        return len(self.document_norms)
+
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
......@@ -72,14 +75,22 @@ class BSBIIndexer(Indexer):
# reset the buffer
self.buffer = []
def compute_document_vector_norm(self, counted_tokens):
norm = 0
for token, count in counted_tokens.items():
norm += (1 + math.log10(count))**2
return math.sqrt(norm)
def add_document_tokens(self, document, counted_tokens):
doc_id = document.document_id
# convert tokens to token ids
token_ids = set()
self.count_documents += 1
self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
for token, frequency in counted_tokens.items():
token_id = self.find_or_create_token_id(token)
self.buffer += [(token_id, doc_id, frequency)]
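
For context, the stored norm is the Euclidean length of the document's log-weighted term-frequency vector: sqrt of the sum over terms of (1 + log10(tf))^2. A minimal standalone sketch of the same computation; the Counter input and toy text are illustrative, not from the repository:

    import math
    from collections import Counter

    def compute_document_vector_norm(counted_tokens):
        # Sum the squared log-scaled weights (1 + log10(tf)) of every term,
        # then take the square root to get the vector's Euclidean length.
        norm = 0
        for token, count in counted_tokens.items():
            norm += (1 + math.log10(count)) ** 2
        return math.sqrt(norm)

    # Illustrative input: term frequencies for a toy document.
    counted = Counter("to be or not to be".split())  # {'to': 2, 'be': 2, 'or': 1, 'not': 1}
    print(compute_document_vector_norm(counted))     # ~2.32: sqrt(2*(1+log10(2))**2 + 2*1**2)

Computing this once at indexing time means query-time scoring never has to revisit the document's tokens.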
@@ -180,12 +191,18 @@ class BSBIIndexer(Indexer):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
             pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)

+        with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
+            pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
+
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
-                self.status = self.INDEX_STATUS_CREATED
                 self.tokens_map = pickle.load(f)

+            with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
+                self.document_norms = pickle.load(f)
+
+            self.status = self.INDEX_STATUS_CREATED
             return True
         except FileNotFoundError:
......
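
Moving the INDEX_STATUS_CREATED assignment after both reads means the index is only marked ready once every cache file has loaded; a missing norms file falls through to the FileNotFoundError handler instead of leaving a half-loaded index. A quick sanity check of the persistence step is to round-trip a norms dict through pickle the same way save and load do here; the file path and values below are stand-ins, not the repository's cache layout:

    import pickle

    norms = {0: 2.32, 1: 1.0}  # hypothetical doc_id -> norm mapping
    with open('/tmp/document_norms', 'wb') as f:
        pickle.dump(norms, f, pickle.HIGHEST_PROTOCOL)
    with open('/tmp/document_norms', 'rb') as f:
        restored = pickle.load(f)
    assert restored == norms   # the dict survives the round trip unchanged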
@@ -9,7 +9,7 @@ class BooleanQuery(Query):
         # Assume the expression
         # is in the conjunctive normal form
-        last_doc_id = self.collection.indexer.count_documents-1
+        last_doc_id = self.collection.indexer.get_collection_size()-1

         and_queries = query.split(self.OPERATOR_AND)
         doc_ids_by_conjunction = list()
......
@@ -34,7 +34,7 @@ class VectorialQuery(Query):
         tf = defaultdict(lambda: defaultdict(int))

-        N = self.collection.indexer.count_documents
+        N = self.collection.indexer.get_collection_size()

         for token in tokens:
             doc_ids = self.collection.indexer.token_lookup_with_frequency(token)
......
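
The stored norms are what make cosine-style scoring cheap at query time: each document's accumulated dot product only has to be divided by its precomputed length. A sketch of that final step, assuming a postings lookup shaped like token_lookup_with_frequency's (doc_id, frequency) pairs; the rank function and postings dict are illustrative names, not the repository's API:

    import math
    from collections import defaultdict

    def rank(tokens, postings, document_norms):
        # postings: token -> list of (doc_id, frequency) pairs, as a lookup
        # like token_lookup_with_frequency would return.
        N = len(document_norms)              # what get_collection_size() reports
        scores = defaultdict(float)
        for token in tokens:
            hits = postings.get(token, [])
            if not hits:
                continue
            idf = math.log10(N / len(hits))  # rarer tokens weigh more
            for doc_id, frequency in hits:
                scores[doc_id] += (1 + math.log10(frequency)) * idf
        # Dividing by the stored document norm length-normalizes each score
        # without re-reading any document at query time.
        return sorted(((score / document_norms[doc_id], doc_id)
                       for doc_id, score in scores.items()), reverse=True)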