From 28f574093e330993476442ae4e4e4bb86343c79f Mon Sep 17 00:00:00 2001
From: David Dos Santos
Date: Sun, 4 Feb 2018 10:49:08 +0100
Subject: [PATCH] store right doc norms

---
 gogole/indexer/bsbi_indexer.py   | 70 ++++++++++++++++++++++----------
 gogole/parser/stanford_parser.py |  2 +-
 gogole/query/vectorial_query.py  | 55 ++++++++++++++-----------
 3 files changed, 81 insertions(+), 46 deletions(-)
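
Notes on the new on-disk layouts (a standalone sketch with made-up values, not code from the project): a posting now stores only (token_id, doc_id, frequency) as three 4-byte ints, which is why BLOCK_SIZE drops from 16 to 12, and each document gets its own 6-byte metadata record holding its max term frequency and its norm. The byte arithmetic, using the same struct format codes as the hunks below:

    import struct

    # Posting record after this patch: three 4-byte signed ints (12 bytes).
    posting = struct.pack('i', 42) + struct.pack('i', 7) + struct.pack('i', 3)
    assert len(posting) == 12   # matches BLOCK_SIZE = 12

    # Per-document metadata record: unsigned short + float, packed separately
    # as in save_document_metadata, so there is no alignment padding (2 + 4 bytes).
    metadata = struct.pack('H', 5) + struct.pack('f', 1.73)
    assert len(metadata) == 6   # matches DOCUMENT_METATADATA_BLOCK_SIZE = 6

The token_id, doc_id, frequency values (42, 7, 3) and the metadata values (5, 1.73) above are illustrative only.
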
diff --git a/gogole/indexer/bsbi_indexer.py b/gogole/indexer/bsbi_indexer.py
index f7472b6..3068f3e 100644
--- a/gogole/indexer/bsbi_indexer.py
+++ b/gogole/indexer/bsbi_indexer.py
@@ -10,7 +10,8 @@ from gogole.utils import timeit
 from gogole.indexer import Indexer
 
 class BSBIIndexer(Indexer):
-    BLOCK_SIZE = 16
+    BLOCK_SIZE = 12
+    DOCUMENT_METATADATA_BLOCK_SIZE = 6
 
     def __init__(self, collection_name, maxsize=None):
         """ BSBIIndexer constructs a inverted index on disk
@@ -59,9 +60,11 @@ class BSBIIndexer(Indexer):
         # dict: token_id -> number of documents having this token
         self.token_id_to_df = defaultdict(int)
 
+        self.index = dict()
+        self.nb_documents = 0
+
     def get_collection_size(self):
-        # TODO: use the size of the documents_norms file
-        return len(self.document_norms)
+        return os.stat(self.DOCUMENT_METADATA_FILE).st_size // self.DOCUMENT_METATADATA_BLOCK_SIZE
 
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
@@ -92,9 +95,12 @@ class BSBIIndexer(Indexer):
         """Cleanup temporary files
         """
 
+        self.index = dict()
+
         for filename in self.tmp_filenames:
             os.remove(filename)
 
+
     def flush_buffer(self):
         sorted_tuples = sorted(self.buffer)
 
@@ -103,23 +109,25 @@ class BSBIIndexer(Indexer):
         with open(filename, 'wb') as f:
             self.tmp_filenames.append(f.name)
 
-            for token_id, doc_id, frequency, doc_max_frequency in sorted_tuples:
+            for token_id, doc_id, frequency in sorted_tuples:
                 # assume we already are at the end of the file
                 b = bytearray()
                 b += struct.pack('i', token_id)
                 b += struct.pack('i', doc_id)
                 b += struct.pack('i', frequency)
-                b += struct.pack('i', doc_max_frequency)
 
                 f.write(b)
 
         # reset the buffer
         self.buffer = []
 
-    def compute_document_vector_norm(self, counted_tokens):
+    def compute_document_norm(self, counted_tokens):
         norm = 0
 
-        for token, count in counted_tokens.items():
-            norm += (1 + math.log10(count))**2
+        N = self.nb_documents
+
+        for token, frequency in counted_tokens.items():
+            token_id = self.token_to_token_id[token]
+            norm += (1+math.log10(frequency))**2 * (math.log10(N/self.token_id_to_df[token_id]))**2
 
         return math.sqrt(norm)
@@ -135,23 +143,22 @@ class BSBIIndexer(Indexer):
         # convert tokens to token ids
         token_ids = set()
 
-        # get max frequency among tokens
-        _, max_frequency = counted_tokens.most_common(1)[0]
-
-        self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
+        self.index[doc_id] = counted_tokens
 
         for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
 
             self.token_id_to_df[token_id] += 1
 
-            self.buffer += [(token_id, doc_id, frequency, max_frequency)]
+            self.buffer += [(token_id, doc_id, frequency)]
 
         if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
             self.flush_buffer()
 
+        self.nb_documents += 1
+
 
-    def _read_in_chunks(self, f, blocksize=8):
+    def _read_in_chunks(self, f, blocksize=12):
         while True:
             data = f.read(blocksize)
             if not data:
@@ -178,9 +185,35 @@
             fp.close()
 
         self.save_to_disk()
+
+        self.save_document_metadata()
         self.cleanup() # cleanup temporary files
         self.status = self.INDEX_STATUS_CREATED
 
+
+    def save_document_metadata(self):
+        with open(self.DOCUMENT_METADATA_FILE, 'wb') as f:
+            for _, counted_tokens in self.index.items():
+                norm = self.compute_document_norm(counted_tokens)
+                _, max_frequency = counted_tokens.most_common(1)[0]
+                b = bytearray()
+                b += struct.pack('H', max_frequency) # H: unsigned short integer (2 bytes)
+                b += struct.pack('f', norm) # f: float (4 bytes)
+                f.write(b)
+
+
+    def get_documents_metadata(self, doc_ids):
+        result = dict()
+        with open(self.DOCUMENT_METADATA_FILE, 'rb') as f:
+            for doc_id in doc_ids:
+                f.seek(self.DOCUMENT_METATADATA_BLOCK_SIZE * (doc_id-1))
+                max_frequency = struct.unpack('H', f.read(2))[0]
+                norm = struct.unpack('f', f.read(4))[0]
+
+                result[doc_id] = (max_frequency, norm)
+
+        return result
+
     def _read_token_id(self, file, pos):
         file.seek(pos*self.BLOCK_SIZE, 0)
         token_id = struct.unpack('i', file.read(4))[0]
@@ -220,9 +253,8 @@ class BSBIIndexer(Indexer):
                 if t_id == token_id:
                     doc_id = struct.unpack('i', f.read(4))[0]
                     frequency = struct.unpack('i', f.read(4))[0]
-                    max_frequency = struct.unpack('i', f.read(4))[0]
 
-                    document_ids[doc_id] = frequency, max_frequency
+                    document_ids[doc_id] = frequency
 
                 for p in [pos+1, pos-1]:
                     if p not in visited and lower_bound <= p and upper_bound >= p:
@@ -245,17 +277,11 @@ class BSBIIndexer(Indexer):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
             pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
 
-        with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
-            pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
-
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
                 self.token_to_token_id = pickle.load(f)
 
-            with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
-                self.document_norms = pickle.load(f)
-
             self.status = self.INDEX_STATUS_CREATED
 
             return True
diff --git a/gogole/parser/stanford_parser.py b/gogole/parser/stanford_parser.py
index e32b699..668e788 100644
--- a/gogole/parser/stanford_parser.py
+++ b/gogole/parser/stanford_parser.py
@@ -8,7 +8,7 @@ class StanfordParser:
 
 
     def find_documents(self, limit=None):
-        counter = itertools.count() # count documents found
+        counter = itertools.count(start=1) # count documents found
 
         for collection_index in range(10):
             collection_dir = self.DIRECTORY + "/" + str(collection_index)
diff --git a/gogole/query/vectorial_query.py b/gogole/query/vectorial_query.py
index 99ed1ac..2599e8c 100644
--- a/gogole/query/vectorial_query.py
+++ b/gogole/query/vectorial_query.py
@@ -8,8 +8,7 @@ WEIGHTING_TYPE_TF_IDF = "tf-idf"
 WEIGHTING_TYPE_NORM_TF_IDF = "norm-tf-idf"
 WEIGHTING_TYPE_NORM_FREQ = "norm-freq"
 
-# TODO: Implement norm-tf-idf weight type and add it to weighting types
-WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF]
+WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF, WEIGHTING_TYPE_NORM_TF_IDF]
 
 
 class VectorialQuery(Query):
@@ -25,7 +24,7 @@ class VectorialQuery(Query):
         ))
 
         for position, doc_id in enumerate(self.find_n_first_elements(results, n=10), start=1):
-            print('{}: doc id {}\n'.format(position, doc_id))
+            print('{} [cos: {}]: doc id {}\n'.format(position, results[doc_id], doc_id))
 
 
 
@@ -43,45 +42,55 @@ class VectorialQuery(Query):
 
         N = self.collection.indexer.get_collection_size()
 
+
+
+        # compute tf(document, token) and df for all tokens in the query
+
         for token in tokens:
-            doc_ids = self.collection.indexer.token_lookup_with_frequency(token)
+            doc_id_to_frequency = self.collection.indexer.token_lookup_with_frequency(token)
+
+            if len(doc_id_to_frequency) > 0:
+                df[token] += len(doc_id_to_frequency)
 
-            if len(doc_ids) > 0:
-                df[token] += len(doc_ids)
 
-            for doc_id, freq in doc_ids.items():
-                token_freq, max_freq = freq
-                if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
-                    tf[doc_id][token] += (token_freq / max_freq)
-                else:
-                    tf[doc_id][token] += token_freq
+            for doc_id, freq in doc_id_to_frequency.items():
+                tf[doc_id][token] = freq
 
         similarities = defaultdict(int)
 
-        for doc_id, tokens_frequency in tf.items():
+        doc_ids_to_metadata = self.collection.indexer.get_documents_metadata(tf.keys())
+
+        for doc_id, token_to_frequency in tf.items():
             dot_product = 0
             squared_norm_query = 0
 
             for token, token_df in df.items():
                 doc_weight = 0
 
-                if token in tokens_frequency:
-                    if tokens_frequency[token] == 0:
-                        raise Exception("frequency of {} is 0".format(token))
+                if token in token_to_frequency:
+                    # if the token is present in the query
+                    # compute coordinates for the document vector and query vector
+                    # and make the product of both
+
+                    if self.weight_type in [WEIGHTING_TYPE_TF_IDF, WEIGHTING_TYPE_NORM_TF_IDF]:
+                        doc_weight = (1 + math.log10(token_to_frequency[token])) * math.log10(N/token_df)
+
+                    elif self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
+                        max_freq, _ = doc_ids_to_metadata[doc_id]
+                        doc_weight = (1 + math.log10(token_to_frequency[token]/max_freq))
 
-                    if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
-                        doc_weight = 1 + math.log10(tokens_frequency[token])
-                    else:
-                        doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
 
-                query_weight = (1 + math.log10(tf_query[token]))
+                query_weight = (1 + math.log10(tf_query[token])) * math.log10(N/token_df)
 
                 squared_norm_query += (query_weight**2)
                 dot_product += doc_weight * query_weight
 
-            norm_doc = self.collection.indexer.document_norms[doc_id]
-            similarities[doc_id] = dot_product / (norm_doc + math.sqrt(squared_norm_query))
+            if self.weight_type == WEIGHTING_TYPE_NORM_TF_IDF:
+                _, norm_doc = doc_ids_to_metadata[doc_id]
+                similarities[doc_id] = dot_product / (norm_doc * math.sqrt(squared_norm_query))
+            else:
+                similarities[doc_id] = dot_product
 
         return similarities
 
 
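Note on the scoring the vectorial query now implements (a toy recomputation with made-up numbers; N, df, the token counts and the weight helper below are illustrative, not project code): with norm-tf-idf both sides weight a token as (1 + log10(tf)) * log10(N/df), the stored document norm is the square root of the summed squared weights (compute_document_norm), and the similarity divides the dot product by the product of the two norms, which is the '+' changed to '*' in the last hunk.

    import math
    from collections import Counter

    # All numbers here are made up; in the project N, df and the document norm
    # come from the index (get_collection_size, token_id_to_df,
    # compute_document_norm), not from literals.
    N = 1000                                  # documents in the collection
    df = {'search': 50, 'engine': 20}         # document frequency of each token
    doc_tf = Counter({'search': 3, 'engine': 1})
    query_tf = Counter({'search': 1, 'engine': 1})

    def weight(tf, df_t):
        # tf-idf weight used on both the document and the query side
        return (1 + math.log10(tf)) * math.log10(N / df_t)

    doc_norm = math.sqrt(sum(weight(tf, df[t]) ** 2 for t, tf in doc_tf.items()))
    query_norm = math.sqrt(sum(weight(tf, df[t]) ** 2 for t, tf in query_tf.items()))
    dot = sum(weight(doc_tf[t], df[t]) * weight(query_tf[t], df[t]) for t in query_tf)

    # the fix: normalise by the product of the norms, not their sum
    print(dot / (doc_norm * query_norm))      # cosine similarity, ~0.98 here
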
-- 
2.22.0
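
A round-trip sketch of the per-document metadata file introduced above (illustrative only: the temporary file, doc ids and values are made up; in the project the file is self.DOCUMENT_METADATA_FILE, written by save_document_metadata and read back by get_documents_metadata and get_collection_size):

    import os
    import struct
    import tempfile

    # Fixed 6-byte records: max_frequency as 'H', norm as 'f', one per document,
    # written in doc id order with ids starting at 1 (see the stanford_parser change).
    records = {1: (4, 2.5), 2: (7, 3.25), 3: (2, 1.0)}

    with tempfile.NamedTemporaryFile(delete=False) as f:
        path = f.name
        for doc_id in sorted(records):
            max_frequency, norm = records[doc_id]
            f.write(struct.pack('H', max_frequency) + struct.pack('f', norm))

    # get_collection_size() equivalent: file size divided by the 6-byte block size.
    assert os.stat(path).st_size // 6 == 3

    # get_documents_metadata() equivalent for a single document: seek to
    # 6 * (doc_id - 1), then read the two fields back.
    with open(path, 'rb') as f:
        doc_id = 2
        f.seek(6 * (doc_id - 1))
        max_frequency = struct.unpack('H', f.read(2))[0]
        norm = struct.unpack('f', f.read(4))[0]

    assert (max_frequency, norm) == (7, 3.25)
    os.remove(path)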