diff --git a/gogole/indexer/bsbi_indexer.py b/gogole/indexer/bsbi_indexer.py
index b0be4da090303e37965019461a0c378ef51c02b2..cc42790fc626ed8b923c41e05849f54afa6239bf 100644
--- a/gogole/indexer/bsbi_indexer.py
+++ b/gogole/indexer/bsbi_indexer.py
@@ -26,14 +26,17 @@ class BSBIIndexer(Indexer):
         self.tmp_filenames = []
         self.tmp_file_id_seq = itertools.count()
 
-        self.count_documents = 0
+        self.document_norms = dict()
 
         self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
         self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
-        self.DOCUMENTS_MAP_FILE = '.cache/{}_documents_map'.format(collection_name)
+        self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)
 
         self.status = self.INDEX_STATUS_NOT_CREATED
 
+    def get_collection_size(self):
+        return len(self.document_norms)
+
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
 
@@ -72,14 +75,22 @@ class BSBIIndexer(Indexer):
         # reset the buffer
         self.buffer = []
 
+    def compute_document_vector_norm(self, counted_tokens):
+        norm = 0
+        for token, count in counted_tokens.items():
+            norm += (1 + math.log10(count))**2
+
+        return math.sqrt(norm)
+
     def add_document_tokens(self, document, counted_tokens):
         doc_id = document.document_id
 
         # convert tokens to token ids
         token_ids = set()
 
-        self.count_documents += 1
+        self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
 
         for token, frequency in counted_tokens.items():
+
             token_id = self.find_or_create_token_id(token)
             self.buffer += [(token_id, doc_id, frequency)]
 
@@ -180,12 +191,18 @@ class BSBIIndexer(Indexer):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
             pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
 
+        with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
+            pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
+
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
-                self.status = self.INDEX_STATUS_CREATED
                 self.tokens_map = pickle.load(f)
 
+            with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
+                self.document_norms = pickle.load(f)
+
+            self.status = self.INDEX_STATUS_CREATED
             return True
 
         except FileNotFoundError:
diff --git a/gogole/query/boolean_query.py b/gogole/query/boolean_query.py
index 12703161d33e135b5bcdfca6293795ff710712a2..b28f7f8341dc20379e52b438ca7e8e821cdd3f43 100644
--- a/gogole/query/boolean_query.py
+++ b/gogole/query/boolean_query.py
@@ -9,7 +9,7 @@ class BooleanQuery(Query):
 
         # Assume the expression
         # is in the conjunctive normal form
-        last_doc_id = self.collection.indexer.count_documents-1
+        last_doc_id = self.collection.indexer.get_collection_size()-1
 
         and_queries = query.split(self.OPERATOR_AND)
         doc_ids_by_conjunction = list()
diff --git a/gogole/query/vectorial_query.py b/gogole/query/vectorial_query.py
index 54da2eb694f0004121a288d3d5e3f6b830704e3b..71954875c4d455b2b234acc35f7b8496106639ee 100644
--- a/gogole/query/vectorial_query.py
+++ b/gogole/query/vectorial_query.py
@@ -34,7 +34,7 @@ class VectorialQuery(Query):
 
         tf = defaultdict(lambda: defaultdict(int))
 
-        N = self.collection.indexer.count_documents
+        N = self.collection.indexer.get_collection_size()
 
         for token in tokens:
             doc_ids = self.collection.indexer.token_lookup_with_frequency(token)
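Note: the hunks above cache a per-document vector norm at indexing time, using the log-tf weighting sum of (1 + log10(tf))**2 over a document's terms, square-rooted. The standalone sketch below (not part of the patch) restates that weighting and shows one hypothetical way a cached norm could normalize a log-tf dot product at query time; apart from the weighting itself, the helper names and the scoring shown here are illustrative assumptions, not the repository's VectorialQuery implementation.

    import math
    from collections import Counter

    def document_vector_norm(counted_tokens):
        # Same weighting as compute_document_vector_norm in the patch:
        # each term contributes (1 + log10(tf))**2; the norm is the square root of the sum.
        return math.sqrt(sum((1 + math.log10(count)) ** 2
                             for count in counted_tokens.values()))

    def cosine_score(query_tokens, doc_counted_tokens):
        # Hypothetical helper: log-tf dot product between query and document,
        # divided by the document norm that the indexer now stores per doc_id.
        query_tf = Counter(query_tokens)
        dot = 0.0
        for token, q_count in query_tf.items():
            d_count = doc_counted_tokens.get(token, 0)
            if d_count:
                dot += (1 + math.log10(q_count)) * (1 + math.log10(d_count))
        norm = document_vector_norm(doc_counted_tokens)
        return dot / norm if norm else 0.0

    doc = Counter({'search': 3, 'engine': 1})
    # A query matching both terms scores higher than one matching a single term.
    assert cosine_score(['search', 'engine'], doc) > cosine_score(['search'], doc)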