Commit cc3cd962 authored by Dos Santos David

Store the per-document frequency of each token in the inverted index

parent 38ffc77d
@@ -20,9 +20,9 @@ def run_index_command(collection, args):
     count_documents = 0
     for document in parser.find_documents(limit=None):
         count_documents += 1
-        tokens = tokenizer.get_tokens(document)
-        indexer.add_document_tokens(document.document_id, tokens)
+        counted_tokens = tokenizer.get_counted_tokens(document)
+        indexer.add_document_tokens(document, counted_tokens)
     print('... {} documents loaded'.format(count_documents))
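For context, get_counted_tokens presumably returns a token-to-count mapping rather than the flat token list get_tokens produced, since the indexer below iterates over counted_tokens.items(). A minimal sketch of such a method, assuming a whitespace tokenizer and a document.text attribute (both assumptions; the real tokenizer is not part of this diff):

from collections import Counter

def get_counted_tokens(self, document):
    # Counter yields the token -> frequency mapping that
    # add_document_tokens consumes via .items().
    # document.text and the lower()/split() normalization are assumptions.
    return Counter(document.text.lower().split())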
@@ -31,11 +31,13 @@ def run_index_command(collection, args):
     if args.index_command == 'lookup':
         token = args.token[0]
-        doc_ids = indexer.token_lookup(token)
+        doc_ids = indexer.token_lookup_with_frequency(token)
         if doc_ids:
-            print("{token} present in documents {doc_ids}".format(
+            print("{token} is present in {nb_docs} documents\n".format(
                 token=token,
-                doc_ids=", ".join(map(str, doc_ids))
+                nb_docs=len(doc_ids)
             ))
+            for doc_id, frequency in doc_ids.items():
+                print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))
         else:
             print('no result :(')
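With invented document ids and frequencies, the reworked lookup branch would print something like:

foo is present in 2 documents

doc : 4 (frequency: 3)
doc : 17 (frequency: 1)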
@@ -8,7 +8,7 @@ import pickle
 from gogole.utils import timeit

 class BSBIIndexer:
-    BLOCK_SIZE = 8
+    BLOCK_SIZE = 12

     def __init__(self, collection_name, maxsize=None):
         """
@@ -51,24 +51,27 @@ class BSBIIndexer:
         with open(filename, 'wb') as f:
             self.tmp_filenames.append(f.name)

-            for token_id, doc_id in sorted_tuples:
+            for token_id, doc_id, frequency in sorted_tuples:
                 # assume we already are at the end of the file
                 b = bytearray()
-                b += struct.pack('i', token_id) # H stands for unsigned short integer ( 2 bytes - up to 65535)
+                b += struct.pack('i', token_id)
                 b += struct.pack('i', doc_id)
+                b += struct.pack('i', frequency)
                 f.write(b)

         # reset the buffer
         self.buffer = []

-    def add_document_tokens(self, doc_id, tokens):
+    def add_document_tokens(self, document, counted_tokens):
+        doc_id = document.document_id
+
         # convert tokens to token ids
         token_ids = set()
-        for token in tokens:
+        for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
-            self.buffer += [(token_id, doc_id)]
+            self.buffer += [(token_id, doc_id, frequency)]

         if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
             self.flush_buffer()
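Since every posting is written as three consecutive struct.pack('i', ...) fields, one record can be read back with a single struct.unpack. A round-trip sketch with made-up values:

import struct

record = struct.pack('i', 42) + struct.pack('i', 7) + struct.pack('i', 3)
token_id, doc_id, frequency = struct.unpack('iii', record)
assert (token_id, doc_id, frequency) == (42, 7, 3)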
@@ -108,16 +111,17 @@ class BSBIIndexer:
         return token_id

     @timeit("lookup done in")
-    def token_lookup(self, token):
+    def token_lookup_with_frequency(self, token):
         """
         Returns a list of documents
         where a given token is present

         :param token: token to search in documents
         """
+        document_ids = dict()
+
         if token not in self.tokens_map:
-            return set()
+            return document_ids

         token_id = self.tokens_map[token]
@@ -132,7 +136,6 @@ class BSBIIndexer:
             to_visit = {mid}
             visited = set()
             t_id = None
-            document_ids = set()

             while to_visit:
                 pos = to_visit.pop()
@@ -141,7 +144,10 @@ class BSBIIndexer:
                 if t_id == token_id:
                     doc_id = struct.unpack('i', f.read(4))[0]
-                    document_ids.add(doc_id)
+                    frequency = struct.unpack('i', f.read(4))[0]
+
+                    document_ids[doc_id] = frequency

                 for p in [pos+1, pos-1]:
                     if p not in visited and lower_bound <= p and upper_bound >= p:
                         to_visit.add(p)
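The lookup treats the merged index file as an array of fixed-width records: a binary search over lower_bound/upper_bound/mid locates one posting for the token, and the pos+1/pos-1 expansion then visits neighbouring records, which hold the remaining postings for that token because the tuples were sorted by token_id before flushing. Under that assumption, addressing record pos is plain offset arithmetic, roughly:

import struct

def read_record(f, pos, block_size=12):
    # Sketch only: seek to the pos-th fixed-width record and decode it
    # as (token_id, doc_id, frequency). The real method reads the three
    # fields separately but follows the same layout.
    f.seek(pos * block_size)
    return struct.unpack('iii', f.read(block_size))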
@@ -155,6 +161,10 @@ class BSBIIndexer:
             else:
                 upper_bound = mid-1

+    def token_lookup(self, token):
+        return set(self.token_lookup_with_frequency(token).keys())
+
     def save_to_disk(self):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
             pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
 class Indexer:
-    def add_documents_token(self, doc_id, tokens):
+    def add_documents_token(self, document, tokens):
         raise Exception('add_documents_token not implemented')

     def build_index(self):
@@ -7,15 +7,15 @@ class SimpleIndexer(Indexer):
     def __init__(self):
         self.tokens_document_map = defaultdict(list)

-    def add_documents_token(self, doc_id, tokens):
-        for token in tokens:
-            self.tokens_document_map[token] += doc_id
+    def add_documents_token(self, document, counted_tokens):
+        for token, count in counted_tokens.items():
+            self.tokens_document_map[token] += [(document.document_id, count)]

     def build_index(self):
         pass

     def token_lookup(self, token):
-        return self.tokens_document_map.get(token, [])
+        return dict(self.tokens_document_map.get(token, []))

     def load_from_cache(self):
         return False
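Incidentally, the old body (self.tokens_document_map[token] += doc_id) extended a list with a bare doc_id, which raises a TypeError for integer ids, so the pair-based version is also a bug fix; token_lookup now returns a doc_id -> count dict, matching the BSBI indexer. A usage sketch with a stand-in Document stub (the real class comes from the project's parser):

from collections import namedtuple

Document = namedtuple('Document', 'document_id')

indexer = SimpleIndexer()
indexer.add_documents_token(Document(1), {'search': 2, 'engine': 1})
indexer.add_documents_token(Document(2), {'search': 5})
print(indexer.token_lookup('search'))  # {1: 2, 2: 5}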