......@@ -4,6 +4,7 @@ import heapq
import os
import math
import pickle
from collections import defaultdict
from gogole.utils import timeit
from gogole.indexer import Indexer
......@@ -12,46 +13,85 @@ class BSBIIndexer(Indexer):
def __init__(self, collection_name, maxsize=None):
""" BSBIIndexer constructs a inverted index on disk
collection_name {string} -- the identifier of the collection
it will be useful to retrieve the correct index on the disk
Keyword Arguments:
maxsize {integer} -- [Maxsize of the buffer] (default: {None})
:param maxsize: max size of the buffer (in bytes)
self.tokens_map = dict()
self.init_token_id_seq() # next token id
# map a token to its token id
self.token_to_token_id = dict()
# initialize the token id sequence.
# it basically initializes an iterator producing the integers 0,1,2...
# max size of the buffer
# if the buffer exceeds this limit, then the buffer will be written to the disk
self.maxsize = maxsize
self.buffer = []
# temporary files for the inverted index
self.tmp_filenames = []
self.tmp_file_id_seq = itertools.count()
self.document_norms = dict()
# name of the index files written to the disk
# we write several files to the disk
# * the inverted index is a map token_id -> postings (described as (doc_id, frequency))
self.INVERTED_INDEX_FILE = '.cache/{}_index'.format(collection_name)
self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
# * the tokens map: it maps a token to its token id
self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)
# * it stores for each document:
# - its norm
# - the max frequency of its tokens
self.DOCUMENT_METADATA_FILE = '.cache/{}_documents_metadata'.format(collection_name)
# by default, the status of an index is NOT_CREATED
self.status = self.INDEX_STATUS_NOT_CREATED
# dict: token_id -> number of documents having this token
self.token_id_to_df = defaultdict(int)
def get_collection_size(self):
    """Return the size of the collection

    The size is measured as the number of entries currently held in
    self.document_norms (presumably one entry per indexed document --
    confirm against the code that fills document_norms).

    Returns:
        [int] -- number of entries in self.document_norms
    """
    # TODO: use the size of the documents_norms file
    norms = self.document_norms
    return len(norms)
def init_token_id_seq(self, start=0):
    """(Re)start the token id sequence

    Binds self.token_id_seq to a fresh itertools.count iterator, so the
    next id handed out is `start`, then start + 1, and so on.

    Keyword Arguments:
        start {integer} -- first id produced by the sequence (default: {0})
    """
    self.token_id_seq = itertools.count(start)
def find_or_create_token_id(self, token):
    """Return the id of a token, allocating a new one on first sight

    If the token was already analyzed, it returns its id.
    Otherwise it draws an id never used before from self.token_id_seq
    and records it in self.token_to_token_id.

    Arguments:
        token {string} -- the token to look up

    Returns:
        [int] -- the token_id corresponding to the token
    """
    token_id = self.token_to_token_id.get(token)

    # compare against None explicitly: 0 is a valid token id but is falsy
    if token_id is None:
        token_id = next(self.token_id_seq)
        self.token_to_token_id[token] = token_id

    return token_id
def cleanup(self):
"""Cleanup temporary files
for filename in self.tmp_filenames:
......@@ -84,6 +124,13 @@ class BSBIIndexer(Indexer):
return math.sqrt(norm)
def add_document_tokens(self, document, counted_tokens):
"""Add a document and its token to the buffer
document {Document} -- document
counted_tokens {Counter} -- counted tokens
doc_id = document.document_id
# convert tokens to token ids
token_ids = set()
......@@ -96,6 +143,8 @@ class BSBIIndexer(Indexer):
for token, frequency in counted_tokens.items():
token_id = self.find_or_create_token_id(token)
self.token_id_to_df[token_id] += 1
self.buffer += [(token_id, doc_id, frequency, max_frequency)]
if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
......@@ -120,7 +169,7 @@ class BSBIIndexer(Indexer):
merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
with open(self.INDEX_FILE, 'wb') as f:
with open(self.INVERTED_INDEX_FILE, 'wb') as f:
for t in merged_tuples_iterator:
# TODO: maybe write by block ?
......@@ -146,12 +195,12 @@ class BSBIIndexer(Indexer):
document_ids = dict()
if token not in self.tokens_map:
if token not in self.token_to_token_id:
return document_ids
token_id = self.tokens_map[token]
token_id = self.token_to_token_id[token]
with open(self.INDEX_FILE, 'rb') as f:
with open(self.INVERTED_INDEX_FILE, 'rb') as f:
upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
lower_bound = 0
......@@ -194,7 +243,7 @@ class BSBIIndexer(Indexer):
def save_to_disk(self):
with open(self.TOKENS_MAP_FILE, 'wb') as f:
pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
......@@ -202,7 +251,7 @@ class BSBIIndexer(Indexer):
def load_from_cache(self):
with open(self.TOKENS_MAP_FILE, 'rb') as f:
self.tokens_map = pickle.load(f)
self.token_to_token_id = pickle.load(f)
with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
self.document_norms = pickle.load(f)
......@@ -213,10 +262,10 @@ class BSBIIndexer(Indexer):
except FileNotFoundError:
return False
def get_index_size(self):
    """Return the size of the inverted index file on the disk

    Returns:
        [int] -- size in bytes of the file at self.INVERTED_INDEX_FILE

    Raises:
        FileNotFoundError -- if the inverted index file does not exist
    """
    return os.stat(self.INVERTED_INDEX_FILE).st_size
def get_tokens_map_size(self):
def get_token_to_token_id_size(self):
return os.stat(self.TOKENS_MAP_FILE).st_size
