Commit d8871cb7 authored by Dos Santos David

switch to 8-bytes blocks

parent 4829dd23
......@@ -4,8 +4,10 @@ import heapq
import os
import math
from gogole.utils import timeit
class BSBIIndexer:
BLOCK_SIZE = 4
BLOCK_SIZE = 8
def __init__(self, maxsize=None):
"""
......@@ -44,8 +46,8 @@ class BSBIIndexer:
for token_id, doc_id in sorted_tuples:
# assume we already are at the end of the file
b = bytearray()
b += struct.pack('H', token_id) # H stands for unsigned short integer ( 2 bytes - up to 65535)
b += struct.pack('H', doc_id)
b += struct.pack('i', token_id) # 'i' stands for signed C int (typically 4 bytes - up to 2147483647)
b += struct.pack('i', doc_id)
f.write(b)
......@@ -64,7 +66,7 @@ class BSBIIndexer:
self.flush_buffer()
def _read_in_chunks(self, f, blocksize=4):
def _read_in_chunks(self, f, blocksize=8):
while True:
data = f.read(blocksize)
if not data:
......@@ -72,7 +74,7 @@ class BSBIIndexer:
yield data
@timeit("index built in")
def build_index(self):
# 1/ flush the buffer
......@@ -92,9 +94,10 @@ class BSBIIndexer:
def _read_token_id(self, file, pos):
file.seek(pos*self.BLOCK_SIZE, 0)
token_id = struct.unpack('H', file.read(2))[0]
token_id = struct.unpack('i', file.read(4))[0]
return token_id
@timeit("lookup done in")
def token_lookup(self, token):
"""
Returns a list of documents
......@@ -127,7 +130,7 @@ class BSBIIndexer:
t_id = self._read_token_id(f, pos)
if t_id == token_id:
doc_id = struct.unpack('H', f.read(2))[0]
doc_id = struct.unpack('i', f.read(4))[0]
document_ids.add(doc_id)
for p in [pos+1, pos-1]:
if p not in visited and lower_bound <= p and upper_bound >= p:
......
from gogole.utils.timeit import timeit
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment