Commit 7bfb2b56 authored by Dos Santos David

finish bsbi indexer

parent 091334e3
@@ -7,9 +7,7 @@ import math
 class BSBIIndexer:
     BLOCK_SIZE = 4

-    def __init__(self,
-                 maxsize=None
-                 ):
+    def __init__(self, maxsize=None):
         """
         :param maxsize: max size of the buffer (in bytes)
         """
@@ -62,7 +60,7 @@ class BSBIIndexer:
         token_id = self.find_or_create_token_id(token)
         self.buffer += [(token_id, doc_id)]

-        if self.maxsize is not None and 2*2*len(self.buffer) >= self.maxsize:
+        if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
            self.flush_buffer()
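The updated flush check estimates the buffer's serialized size: each buffered (token_id, doc_id) pair is written as two unsigned shorts, i.e. BLOCK_SIZE (4) bytes, so the magic 2*2 becomes the named constant. A quick illustration with made-up numbers (not from the commit):

    # Illustrative only: how many postings fit before a flush, assuming 4-byte records.
    maxsize = 1_000_000           # hypothetical 1 MB buffer budget
    BLOCK_SIZE = 4                # 2 bytes token_id + 2 bytes doc_id
    print(maxsize // BLOCK_SIZE)  # 250000 postings trigger a flush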
@@ -92,6 +90,11 @@ class BSBIIndexer:
         for fp in tmp_files:
             fp.close()

+    def _read_token_id(self, file, pos):
+        file.seek(pos*self.BLOCK_SIZE, 0)
+        token_id = struct.unpack('H', file.read(2))[0]
+
+        return token_id
+
     def token_lookup(self, token):
         """
         Returns a list of documents
@@ -99,5 +102,42 @@ class BSBIIndexer:

         :param token: token to search in documents
         """
-        pass
+        if token not in self.tokens_map:
+            return []
+
+        token_id = self.tokens_map[token]
+
+        with open('.cache/index', 'rb') as f:
+            upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
+            lower_bound = 0
+
+            while True:
+                mid = math.floor((upper_bound + lower_bound) / 2)
+
+                to_visit = {mid}
+                visited = set()
+
+                t_id = None
+                document_ids = set()
+
+                while to_visit:
+                    pos = to_visit.pop()
+                    visited.add(pos)
+
+                    t_id = self._read_token_id(f, pos)
+
+                    if t_id == token_id:
+                        doc_id = struct.unpack('H', f.read(2))[0]
+                        document_ids.add(doc_id)
+
+                        for p in [pos+1, pos-1]:
+                            if p not in visited and lower_bound <= p and upper_bound >= p:
+                                to_visit.add(p)
+
+                if document_ids:
+                    return document_ids
+
+                if t_id < token_id:
+                    lower_bound = mid+1
+                else:
+                    upper_bound = mid-1
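For reference, the lookup above relies on the flushed index being a flat binary file of fixed-size 4-byte records, each a (token_id, doc_id) pair packed as two unsigned shorts and sorted by token_id. A minimal, self-contained sketch of that layout and an equivalent binary-search lookup; the file path, function names, and the explicit termination check are illustrative, not part of the commit:

    import os
    import struct

    RECORD_SIZE = 4  # 2-byte token_id + 2-byte doc_id, mirroring BLOCK_SIZE above

    def write_index(path, postings):
        # postings: iterable of (token_id, doc_id) pairs, stored sorted by token_id
        with open(path, 'wb') as f:
            for token_id, doc_id in sorted(postings):
                f.write(struct.pack('HH', token_id, doc_id))

    def read_record(f, pos):
        # read the pos-th fixed-size record
        f.seek(pos * RECORD_SIZE, 0)
        return struct.unpack('HH', f.read(RECORD_SIZE))

    def lookup(path, token_id):
        # binary search for one matching record, then scan its contiguous neighbours
        doc_ids = set()
        with open(path, 'rb') as f:
            lo, hi = 0, os.fstat(f.fileno()).st_size // RECORD_SIZE - 1
            while lo <= hi:  # terminates even when token_id is absent
                mid = (lo + hi) // 2
                t_id, d_id = read_record(f, mid)
                if t_id == token_id:
                    doc_ids.add(d_id)
                    p = mid - 1
                    while p >= 0 and read_record(f, p)[0] == token_id:
                        doc_ids.add(read_record(f, p)[1])
                        p -= 1
                    p = mid + 1
                    while p <= hi and read_record(f, p)[0] == token_id:
                        doc_ids.add(read_record(f, p)[1])
                        p += 1
                    break
                elif t_id < token_id:
                    lo = mid + 1
                else:
                    hi = mid - 1
        return doc_ids

    write_index('/tmp/demo_index', [(3, 10), (1, 7), (3, 11), (2, 9)])
    print(lookup('/tmp/demo_index', 3))  # {10, 11}

The while lo <= hi guard and the neighbour scan play the same role as the commit's flood fill over to_visit, but stop cleanly when the token has no postings in the file.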
@@ -9,15 +9,15 @@ class SimpleTokenizer(AbstractTokenizer):
     SEPARATORS = [" ", ".", ",", "!", "?", ":", ";", "\n", "(", ")"]
+    STOP_WORDS_LOCATION = 'data/common_words'

-    def __init__(self, stop_words_filename=None):
-        self.stop_words_filename = stop_words_filename
+    def __init__(self):
         self._stop_words = set()
-        if stop_words_filename is not None:
-            self.load_stop_words()
+        self.load_stop_words()

     def load_stop_words(self):
-        with open(self.stop_words_filename, 'r') as f:
+        with open(self.STOP_WORDS_LOCATION, 'r') as f:
             self._stop_words = set([word.strip() for word in f.readlines()])
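The tokenizer now always loads stop words from the hard-coded data/common_words path instead of taking a filename argument. Judging from load_stop_words, that file is read as one word per line; a small sketch of what the loader produces (the sample words are illustrative):

    # Illustrative: what load_stop_words() yields for a one-word-per-line file.
    sample = "the\na\nan\nof\n"
    stop_words = set(word.strip() for word in sample.splitlines())
    print(stop_words)  # {'the', 'a', 'an', 'of'}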