Commit 28f57409 authored by Dos Santos David

store right doc norms

parent 6815b51c
@@ -10,7 +10,8 @@ from gogole.utils import timeit
 from gogole.indexer import Indexer

 class BSBIIndexer(Indexer):
-    BLOCK_SIZE = 16
+    BLOCK_SIZE = 12
+    DOCUMENT_METATADATA_BLOCK_SIZE = 6

     def __init__(self, collection_name, maxsize=None):
         """ BSBIIndexer constructs a inverted index on disk
@@ -59,9 +60,11 @@ class BSBIIndexer(Indexer):
         # dict: token_id -> number of documents having this token
         self.token_id_to_df = defaultdict(int)

+        self.index = dict()
+        self.nb_documents = 0
+
     def get_collection_size(self):
-        # TODO: use the size of the documents_norms file
-        return len(self.document_norms)
+        return os.stat(self.DOCUMENT_METADATA_FILE).st_size // self.DOCUMENT_METATADATA_BLOCK_SIZE

     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
@@ -92,9 +95,12 @@ class BSBIIndexer(Indexer):
         """Cleanup temporary files
         """
+        self.index = dict()
+
         for filename in self.tmp_filenames:
             os.remove(filename)

     def flush_buffer(self):
         sorted_tuples = sorted(self.buffer)
@@ -103,23 +109,25 @@ class BSBIIndexer(Indexer):
         with open(filename, 'wb') as f:
             self.tmp_filenames.append(f.name)

-            for token_id, doc_id, frequency, doc_max_frequency in sorted_tuples:
+            for token_id, doc_id, frequency in sorted_tuples:
                 # assume we already are at the end of the file
                 b = bytearray()

                 b += struct.pack('i', token_id)
                 b += struct.pack('i', doc_id)
                 b += struct.pack('i', frequency)
-                b += struct.pack('i', doc_max_frequency)

                 f.write(b)

         # reset the buffer
         self.buffer = []
-    def compute_document_vector_norm(self, counted_tokens):
+    def compute_document_norm(self, counted_tokens):
         norm = 0
-        for token, count in counted_tokens.items():
-            norm += (1 + math.log10(count))**2
+        N = self.nb_documents
+
+        for token, frequency in counted_tokens.items():
+            token_id = self.token_to_token_id[token]
+            norm += (1+math.log10(frequency))**2 * (math.log10(N/self.token_id_to_df[token_id]))**2

         return math.sqrt(norm)
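(Note, not part of the commit: the per-document norm computed above is the Euclidean length of the document's tf-idf vector, with the same log-weighting the query side uses further down:

    norm(d) = sqrt( sum over tokens t in d of (1 + log10(tf(t, d)))^2 * (log10(N / df(t)))^2 )

where N is self.nb_documents and df(t) is the document frequency kept in token_id_to_df.)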
@@ -135,23 +143,22 @@ class BSBIIndexer(Indexer):
         # convert tokens to token ids
         token_ids = set()

-        # get max frequency among tokens
-        _, max_frequency = counted_tokens.most_common(1)[0]
-
-        self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
+        self.index[doc_id] = counted_tokens

         for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
             self.token_id_to_df[token_id] += 1

-            self.buffer += [(token_id, doc_id, frequency, max_frequency)]
+            self.buffer += [(token_id, doc_id, frequency)]

             if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
                 self.flush_buffer()

+        self.nb_documents += 1

-    def _read_in_chunks(self, f, blocksize=8):
+    def _read_in_chunks(self, f, blocksize=12):
         while True:
             data = f.read(blocksize)
             if not data:
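(Side note, not part of the commit: with the 4-tuple gone, each posting written by flush_buffer packs three 'i' fields — token_id, doc_id, frequency — which is why BLOCK_SIZE drops from 16 to 12 and _read_in_chunks now reads 12-byte chunks. A quick sanity check, assuming the usual 4-byte C int:

    import struct

    # three native ints: token_id, doc_id, frequency -> 12 bytes per posting
    assert struct.calcsize('iii') == 12
)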
@@ -178,9 +185,35 @@ class BSBIIndexer(Indexer):
             fp.close()

         self.save_to_disk()
+        self.save_document_metadata()

         self.cleanup() # cleanup temporary files
         self.status = self.INDEX_STATUS_CREATED

+    def save_document_metadata(self):
+        with open(self.DOCUMENT_METADATA_FILE, 'wb') as f:
+            for _, counted_tokens in self.index.items():
+                norm = self.compute_document_norm(counted_tokens)
+                _, max_frequency = counted_tokens.most_common(1)[0]
+
+                b = bytearray()
+                b += struct.pack('H', max_frequency) # H: unsigned short integer (2 bytes)
+                b += struct.pack('f', norm)          # f: float (4 bytes)
+
+                f.write(b)
+
+    def get_documents_metadata(self, doc_ids):
+        result = dict()
+
+        with open(self.DOCUMENT_METADATA_FILE, 'rb') as f:
+            for doc_id in doc_ids:
+                f.seek(self.DOCUMENT_METATADATA_BLOCK_SIZE * (doc_id-1))
+
+                max_frequency = struct.unpack('H', f.read(2))[0]
+                norm = struct.unpack('f', f.read(4))[0]
+
+                result[doc_id] = (max_frequency, norm)
+
+        return result
+
     def _read_token_id(self, file, pos):
         file.seek(pos*self.BLOCK_SIZE, 0)
         token_id = struct.unpack('i', file.read(4))[0]
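(As a rough sketch, not part of the commit and with made-up values, one 6-byte metadata record written by save_document_metadata and read back by get_documents_metadata round-trips like this:

    import struct

    max_frequency, norm = 7, 3.25                        # illustrative values only
    block = struct.pack('H', max_frequency) + struct.pack('f', norm)
    assert len(block) == 6                               # DOCUMENT_METATADATA_BLOCK_SIZE

    # records sit back to back, so doc_id's record starts at offset 6 * (doc_id - 1);
    # unpack the fields in the same order they were written
    assert struct.unpack('H', block[:2])[0] == 7
    assert abs(struct.unpack('f', block[2:])[0] - 3.25) < 1e-6
)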
@@ -220,9 +253,8 @@ class BSBIIndexer(Indexer):
             if t_id == token_id:
                 doc_id = struct.unpack('i', f.read(4))[0]
                 frequency = struct.unpack('i', f.read(4))[0]
-                max_frequency = struct.unpack('i', f.read(4))[0]

-                document_ids[doc_id] = frequency, max_frequency
+                document_ids[doc_id] = frequency

             for p in [pos+1, pos-1]:
                 if p not in visited and lower_bound <= p and upper_bound >= p:
@@ -245,17 +277,11 @@ class BSBIIndexer(Indexer):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
             pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)

-        with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
-            pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
-
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
                 self.token_to_token_id = pickle.load(f)

-            with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
-                self.document_norms = pickle.load(f)
-
             self.status = self.INDEX_STATUS_CREATED
             return True
...
@@ -8,7 +8,7 @@ class StanfordParser:
     def find_documents(self, limit=None):
-        counter = itertools.count() # count documents found
+        counter = itertools.count(start=1) # count documents found

         for collection_index in range(10):
             collection_dir = self.DIRECTORY + "/" + str(collection_index)
...
@@ -8,8 +8,7 @@ WEIGHTING_TYPE_TF_IDF = "tf-idf"
 WEIGHTING_TYPE_NORM_TF_IDF = "norm-tf-idf"
 WEIGHTING_TYPE_NORM_FREQ = "norm-freq"

-# TODO: Implement norm-tf-idf weight type and add it to weighting types
-WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF]
+WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF, WEIGHTING_TYPE_NORM_TF_IDF]

 class VectorialQuery(Query):
@@ -25,7 +24,7 @@ class VectorialQuery(Query):
         ))

         for position, doc_id in enumerate(self.find_n_first_elements(results, n=10), start=1):
-            print('{}: doc id {}\n'.format(position, doc_id))
+            print('{} [cos: {}]: doc id {}\n'.format(position, results[doc_id], doc_id))
@@ -43,45 +42,55 @@ class VectorialQuery(Query):
         N = self.collection.indexer.get_collection_size()

+        # compute tf(document, token) and df for all tokens in the query
         for token in tokens:
-            doc_ids = self.collection.indexer.token_lookup_with_frequency(token)
+            doc_id_to_frequency = self.collection.indexer.token_lookup_with_frequency(token)

-            if len(doc_ids) > 0:
-                df[token] += len(doc_ids)
+            if len(doc_id_to_frequency) > 0:
+                df[token] += len(doc_id_to_frequency)

-            for doc_id, freq in doc_ids.items():
-                token_freq, max_freq = freq
-
-                if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
-                    tf[doc_id][token] += (token_freq / max_freq)
-                else:
-                    tf[doc_id][token] += token_freq
+            for doc_id, freq in doc_id_to_frequency.items():
+                tf[doc_id][token] = freq

         similarities = defaultdict(int)

-        for doc_id, tokens_frequency in tf.items():
+        doc_ids_to_metadata = self.collection.indexer.get_documents_metadata(tf.keys())
+
+        for doc_id, token_to_frequency in tf.items():
             dot_product = 0
             squared_norm_query = 0

             for token, token_df in df.items():
                 doc_weight = 0

-                if token in tokens_frequency:
-                    if tokens_frequency[token] == 0:
-                        raise Exception("frequency of {} is 0".format(token))
-
-                    if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
-                        doc_weight = 1 + math.log10(tokens_frequency[token])
-                    else:
-                        doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
+                if token in token_to_frequency:
+                    # if the token is present in the query
+                    # compute coordinates for the document vector and query vector
+                    # and make the product of both
+                    if self.weight_type in [WEIGHTING_TYPE_TF_IDF, WEIGHTING_TYPE_NORM_TF_IDF]:
+                        doc_weight = (1 + math.log10(token_to_frequency[token])) * math.log10(N/token_df)
+                    elif self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
+                        max_freq, _ = doc_ids_to_metadata[doc_id]
+                        doc_weight = (1 + math.log10(token_to_frequency[token]/max_freq))

-                query_weight = (1 + math.log10(tf_query[token]))
+                query_weight = (1 + math.log10(tf_query[token])) * math.log10(N/token_df)
                 squared_norm_query += (query_weight**2)

                 dot_product += doc_weight * query_weight

-            norm_doc = self.collection.indexer.document_norms[doc_id]
-            similarities[doc_id] = dot_product / (norm_doc + math.sqrt(squared_norm_query))
+            if self.weight_type == WEIGHTING_TYPE_NORM_TF_IDF:
+                _, norm_doc = doc_ids_to_metadata[doc_id]
+                similarities[doc_id] = dot_product / (norm_doc * math.sqrt(squared_norm_query))
+            else:
+                similarities[doc_id] = dot_product

         return similarities
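(For context, a minimal sketch rather than the project's own API: the norm-tf-idf branch above is plain cosine similarity between the query and document tf-idf vectors, i.e. roughly:

    import math

    def cosine_similarity(query_weights, doc_weights, doc_norm):
        # dot product over query terms; terms missing from the document contribute 0
        dot_product = sum(w * doc_weights.get(token, 0) for token, w in query_weights.items())
        query_norm = math.sqrt(sum(w ** 2 for w in query_weights.values()))
        if doc_norm == 0 or query_norm == 0:
            return 0.0
        return dot_product / (doc_norm * query_norm)
)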