Commit 7bfb2b56 authored by Dos Santos David's avatar Dos Santos David

Finish BSBI indexer

parent 091334e3
......@@ -7,9 +7,7 @@ import math
class BSBIIndexer:
BLOCK_SIZE = 4
def __init__(self,
maxsize=None
):
def __init__(self, maxsize=None):
"""
:param maxsize: max size of the buffer (in bytes)
"""
......@@ -62,7 +60,7 @@ class BSBIIndexer:
token_id = self.find_or_create_token_id(token)
self.buffer += [(token_id, doc_id)]
if self.maxsize is not None and 2*2*len(self.buffer) >= self.maxsize:
if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
self.flush_buffer()
......@@ -92,6 +90,11 @@ class BSBIIndexer:
for fp in tmp_files:
fp.close()
def _read_token_id(self, file, pos):
    """
    Read the token id stored in record number *pos* of the index file.

    Each record is BLOCK_SIZE (4) bytes: an unsigned short token id
    followed by an unsigned short document id. Seeks to the start of
    the record and unpacks the first 2 bytes; the file position is left
    just before the document id, which callers rely on to read it next.

    :param file: index file opened in binary mode
    :param pos: 0-based record index
    :return: token id (int)
    """
    file.seek(pos*self.BLOCK_SIZE, 0)
    token_id = struct.unpack('H', file.read(2))[0]
    return token_id
def token_lookup(self, token):
    """
    Return the set of document ids whose documents contain *token*.

    Binary-searches the on-disk index ('.cache/index'), which holds
    fixed-size BLOCK_SIZE (4-byte) records — an unsigned-short token id
    followed by an unsigned-short document id — sorted by token id.
    Once a matching record is found, neighbouring records are scanned
    in both directions to collect every document for that token, since
    equal token ids are stored contiguously.

    :param token: token to search in documents
    :return: set of doc ids, or an empty list when the token is unknown
        or not present in the flushed index
    """
    if token not in self.tokens_map:
        return []

    token_id = self.tokens_map[token]

    with open('.cache/index', 'rb') as f:
        # Records are numbered 0 .. count-1. Using the raw count as the
        # upper bound (as before) let the neighbour scan seek one record
        # past EOF, making struct.unpack crash on empty bytes.
        upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE - 1
        lower_bound = 0

        # Stop when the bounds cross: the token id is absent from the
        # flushed index (e.g. buffered but never flushed). The previous
        # `while True` looped forever in that case.
        while lower_bound <= upper_bound:
            mid = (lower_bound + upper_bound) // 2
            t_id = self._read_token_id(f, mid)

            if t_id == token_id:
                # Flood outwards from `mid` to gather every contiguous
                # record holding this token id.
                document_ids = set()
                to_visit = {mid}
                visited = set()
                while to_visit:
                    pos = to_visit.pop()
                    visited.add(pos)
                    if self._read_token_id(f, pos) == token_id:
                        # The doc id immediately follows the token id
                        # inside the record, so read it in place.
                        doc_id = struct.unpack('H', f.read(2))[0]
                        document_ids.add(doc_id)
                        for p in (pos - 1, pos + 1):
                            if p not in visited and lower_bound <= p <= upper_bound:
                                to_visit.add(p)
                return document_ids

            if t_id < token_id:
                lower_bound = mid + 1
            else:
                upper_bound = mid - 1

    # Token id known but not found in the on-disk index.
    return []
......@@ -9,15 +9,15 @@ class SimpleTokenizer(AbstractTokenizer):
SEPARATORS = [" ", ".", ",", "!", "?", ":", ";", "\n", "(", ")"]
def __init__(self, stop_words_filename=None):
self.stop_words_filename = stop_words_filename
STOP_WORDS_LOCATION = 'data/common_words'
def __init__(self):
self._stop_words = set()
if stop_words_filename is not None:
self.load_stop_words()
def load_stop_words(self):
    """
    Populate self._stop_words from the file at STOP_WORDS_LOCATION,
    one word per line, with surrounding whitespace stripped.
    """
    # NOTE(review): the diff left both the old open(self.stop_words_filename)
    # and the new open(self.STOP_WORDS_LOCATION) calls in this span; keep
    # the new one, consistent with the parameterless __init__ introduced
    # in the same change.
    with open(self.STOP_WORDS_LOCATION, 'r') as f:
        self._stop_words = set([word.strip() for word in f.readlines()])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment