Commit 6815b51c authored by Dos Santos David

add docs and renaming

parent 5ddcd259
@@ -4,6 +4,7 @@ import heapq
 import os
 import math
 import pickle
+from collections import defaultdict
 
 from gogole.utils import timeit
 from gogole.indexer import Indexer
@@ -12,46 +13,85 @@ class BSBIIndexer(Indexer):
     BLOCK_SIZE = 16
 
     def __init__(self, collection_name, maxsize=None):
-        """
-        :param maxsize: max size of the buffer (in bytes)
-        """
-        self.tokens_map = dict()
-        self.init_token_id_seq() # next token id
+        """BSBIIndexer constructs an inverted index on disk
+
+        Arguments:
+            collection_name {string} -- the identifier of the collection,
+                used to retrieve the correct index on the disk
+
+        Keyword Arguments:
+            maxsize {integer} -- max size of the buffer in bytes (default: {None})
+        """
+        # map a token to its token id
+        self.token_to_token_id = dict()
+
+        # initialize the token id sequence:
+        # an iterator producing the integers 0, 1, 2, ...
+        self.init_token_id_seq()
+
+        # max size of the buffer;
+        # if the buffer exceeds this limit, it is written to the disk
         self.maxsize = maxsize
+
         self.buffer = []
+
+        # temporary files for the inverted index
         self.tmp_filename_format = '.cache/{}_tmp_index_{{}}'.format(collection_name)
         self.tmp_filenames = []
         self.tmp_file_id_seq = itertools.count()
-        self.document_norms = dict()
-        self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
+
+        # names of the index files written to the disk:
+        # * the inverted index is a map token_id -> postings (described as (doc_id, frequency))
+        self.INVERTED_INDEX_FILE = '.cache/{}_index'.format(collection_name)
+
+        # * the tokens map maps a token to its token id
         self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
-        self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)
+
+        # * the document metadata file stores, for each document:
+        #   - its norm
+        #   - the max frequency of its tokens
+        self.DOCUMENT_METADATA_FILE = '.cache/{}_documents_metadata'.format(collection_name)
+
+        # by default, the status of an index is NOT_CREATED
         self.status = self.INDEX_STATUS_NOT_CREATED
 
+        # dict: token_id -> number of documents containing this token
+        self.token_id_to_df = defaultdict(int)
+
     def get_collection_size(self):
+        # TODO: use the size of the documents_norms file
         return len(self.document_norms)
 
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
 
     def find_or_create_token_id(self, token):
-        if token not in self.tokens_map:
+        """Return the id of a token.
+
+        If the token has already been analyzed, its existing id is returned;
+        otherwise a new, never-used id is assigned to it.
+
+        Arguments:
+            token {string} -- the token to look up
+
+        Returns:
+            [int] -- the token_id corresponding to the token
+        """
+        if token not in self.token_to_token_id:
             token_id = next(self.token_id_seq)
-            self.tokens_map[token] = token_id
+            self.token_to_token_id[token] = token_id
             return token_id
         else:
-            return self.tokens_map[token]
+            return self.token_to_token_id[token]
 
     def cleanup(self):
+        """Clean up the temporary files"""
         for filename in self.tmp_filenames:
             os.remove(filename)
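A note on the record size: BLOCK_SIZE = 16, together with the (token_id, doc_id, frequency, max_frequency) tuples buffered further down, suggests that each posting is serialized as four 4-byte integers. The exact packing is not part of this commit, so the following is only a sketch of one layout that would match the 16-byte blocks; the '>IIII' struct format and the helper names are assumptions, not project code.

# Sketch (not from this commit) of a 16-byte record layout consistent with
# BLOCK_SIZE = 16 and the (token_id, doc_id, frequency, max_frequency) tuples.
import struct

# '>IIII': four big-endian unsigned 32-bit integers = 16 bytes.
# Big-endian packing makes the lexicographic order of the raw bytes match the
# numeric order of (token_id, doc_id, ...), which a byte-wise merge relies on.
RECORD_FORMAT = '>IIII'
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)  # 16

def pack_record(token_id, doc_id, frequency, max_frequency):
    return struct.pack(RECORD_FORMAT, token_id, doc_id, frequency, max_frequency)

def unpack_record(block):
    return struct.unpack(RECORD_FORMAT, block)

assert RECORD_SIZE == 16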
@@ -84,6 +124,13 @@ class BSBIIndexer(Indexer):
         return math.sqrt(norm)
 
     def add_document_tokens(self, document, counted_tokens):
+        """Add a document and its tokens to the buffer
+
+        Arguments:
+            document {Document} -- the document to index
+            counted_tokens {Counter} -- the document's tokens with their counts
+        """
         doc_id = document.document_id
 
         # convert tokens to token ids
         token_ids = set()
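For context, counted_tokens is a collections.Counter of a document's tokens. A hypothetical call might look like this; the tokenization and the document object are placeholders, not the project's analyzer:

from collections import Counter

# Placeholder tokenization; the real analyzer lives elsewhere in the project.
tokens = "the quick brown fox jumps over the lazy fox".split()
counted_tokens = Counter(tokens)  # e.g. Counter({'the': 2, 'fox': 2, 'quick': 1, ...})

# indexer.add_document_tokens(document, counted_tokens)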
@@ -96,6 +143,8 @@ class BSBIIndexer(Indexer):
         for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
+            self.token_id_to_df[token_id] += 1
+
             self.buffer += [(token_id, doc_id, frequency, max_frequency)]
 
             if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
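The flush that runs when BLOCK_SIZE * len(buffer) reaches maxsize is not shown in this commit. Purely as an illustration, a flush consistent with the attributes initialized above could look roughly like this; the function name and the 16-byte packing are the same assumptions as in the earlier sketch:

import struct

def flush_buffer_sketch(indexer):
    """Hypothetical flush: write the sorted in-memory postings as one sorted run."""
    if not indexer.buffer:
        return
    tmp_filename = indexer.tmp_filename_format.format(next(indexer.tmp_file_id_seq))
    with open(tmp_filename, 'wb') as f:
        # sorting the tuples orders them by token_id first, then doc_id
        for record in sorted(indexer.buffer):
            f.write(struct.pack('>IIII', *record))  # assumed 16-byte layout
    indexer.tmp_filenames.append(tmp_filename)
    indexer.buffer = []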
@@ -120,7 +169,7 @@ class BSBIIndexer(Indexer):
         try:
             merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
 
-            with open(self.INDEX_FILE, 'wb') as f:
+            with open(self.INVERTED_INDEX_FILE, 'wb') as f:
                 for t in merged_tuples_iterator:
                     # TODO: maybe write by block?
                     f.write(t)
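heapq.merge performs the k-way merge of the already-sorted temporary runs lazily. _read_in_chunks is not shown in this diff; assuming it yields consecutive fixed-size records, a compatible reader could look like this sketch:

import heapq

def read_fixed_size_records(f, record_size=16):
    """Yield consecutive fixed-size records from an open binary file."""
    while True:
        block = f.read(record_size)
        if len(block) < record_size:
            return
        yield block

# merged = heapq.merge(*(read_fixed_size_records(f) for f in tmp_files))
# Raw 16-byte blocks compare lexicographically, which matches the numeric
# order of the packed fields if they are stored big-endian.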
@@ -146,12 +195,12 @@ class BSBIIndexer(Indexer):
         """
         document_ids = dict()
 
-        if token not in self.tokens_map:
+        if token not in self.token_to_token_id:
             return document_ids
 
-        token_id = self.tokens_map[token]
+        token_id = self.token_to_token_id[token]
 
-        with open(self.INDEX_FILE, 'rb') as f:
+        with open(self.INVERTED_INDEX_FILE, 'rb') as f:
             upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
             lower_bound = 0
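upper_bound here is the number of BLOCK_SIZE-byte records in the index file, so the postings for a token can be located by binary search over the file as if it were a sorted array. The code below is an illustrative lower-bound search, not the project's implementation; the unpacking format is the same assumption as above:

import os
import struct

def find_first_posting(f, token_id, block_size=16):
    """Return the index of the first record whose token_id is >= the target."""
    lower = 0
    upper = os.fstat(f.fileno()).st_size // block_size
    while lower < upper:
        mid = (lower + upper) // 2
        f.seek(mid * block_size)
        record_token_id = struct.unpack('>IIII', f.read(block_size))[0]
        if record_token_id < token_id:
            lower = mid + 1
        else:
            upper = mid
    return lower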
@@ -194,7 +243,7 @@ class BSBIIndexer(Indexer):
     def save_to_disk(self):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
-            pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
+            pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
 
         with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
             pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
@@ -202,7 +251,7 @@ class BSBIIndexer(Indexer):
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
-                self.tokens_map = pickle.load(f)
+                self.token_to_token_id = pickle.load(f)
 
             with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
                 self.document_norms = pickle.load(f)
@@ -213,10 +262,10 @@ class BSBIIndexer(Indexer):
         except FileNotFoundError:
             return False
 
-        self.init_token_id_seq(max(self.tokens_map.keys()))
+        self.init_token_id_seq(max(self.token_to_token_id.keys()))
 
     def get_index_size(self):
-        return os.stat(self.INDEX_FILE).st_size
+        return os.stat(self.INVERTED_INDEX_FILE).st_size
 
-    def get_tokens_map_size(self):
+    def get_token_to_token_id_size(self):
         return os.stat(self.TOKENS_MAP_FILE).st_size
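The save_to_disk / load_from_cache pair above relies on a plain pickle round trip of the token map and the document norms. A minimal, self-contained illustration with an arbitrary file path:

import os
import pickle
import tempfile

token_to_token_id = {'hello': 0, 'world': 1}
path = os.path.join(tempfile.gettempdir(), 'demo_tokens_map')

with open(path, 'wb') as f:
    pickle.dump(token_to_token_id, f, pickle.HIGHEST_PROTOCOL)

with open(path, 'rb') as f:
    restored = pickle.load(f)

assert restored == token_to_token_id
os.remove(path)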