Commit c14592de authored by Dos Santos David

add index stats command

parent fa5cbd52
@@ -42,3 +42,13 @@ def run_index_command(collection, args):
             print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))
         else:
             print('no result :(')
+
+    if args.index_command == 'stats':
+        if collection.indexer.status == collection.indexer.INDEX_STATUS_NOT_CREATED:
+            print('index not created')
+            return
+
+        print('index created\n')
+
+        print('Size of the index :\t\t\t{size:,} bytes'.format(size=collection.indexer.get_index_size()))
+        print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_tokens_map_size()))
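The {size:,} placeholders above rely on Python's ',' format option, which groups digits by thousands so raw byte counts stay readable. A quick standalone check:

    # The ',' option in a format spec inserts thousands separators:
    print('Size of the index :\t{size:,} bytes'.format(size=13371337))
    # -> Size of the index :	13,371,337 bytes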
@@ -6,8 +6,9 @@ import math
 import pickle
 
 from gogole.utils import timeit
+from gogole.indexer import Indexer
 
-class BSBIIndexer:
+class BSBIIndexer(Indexer):
     BLOCK_SIZE = 12
 
     def __init__(self, collection_name, maxsize=None):
@@ -31,6 +32,8 @@ class BSBIIndexer:
         self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
         self.DOCUMENTS_MAP_FILE = '.cache/{}_documents_map'.format(collection_name)
+
+        self.status = self.INDEX_STATUS_NOT_CREATED
 
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
@@ -45,6 +48,10 @@ class BSBIIndexer:
         return self.tokens_map[token]
 
+    def cleanup(self):
+        for filename in self.tmp_filenames:
+            os.remove(filename)
+
     def flush_buffer(self):
         sorted_tuples = sorted(self.buffer)
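Note that cleanup() assumes every temporary run file still exists and will raise if one is missing. A slightly more defensive variant (my suggestion, not what the commit does) tolerates already-removed files:

    def cleanup(self):
        # Remove the intermediate BSBI run files; ignoring missing
        # files keeps cleanup idempotent (defensive variant, not
        # part of this commit).
        for filename in self.tmp_filenames:
            try:
                os.remove(filename)
            except FileNotFoundError:
                pass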
@@ -107,6 +114,8 @@ class BSBIIndexer:
             fp.close()
 
         self.save_to_disk()
+        self.cleanup() # cleanup temporary files
+        self.status = self.INDEX_STATUS_CREATED
 
     def _read_token_id(self, file, pos):
         file.seek(pos*self.BLOCK_SIZE, 0)
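The status flag flips to INDEX_STATUS_CREATED only after save_to_disk() and cleanup() have run, so the stats command never reports a half-built index. The seek in _read_token_id works because the index file is laid out in fixed 12-byte blocks; assuming each block packs three 4-byte integers (token id, document id, frequency -- a guess from BLOCK_SIZE, the actual layout is not shown in this diff), a block read looks roughly like:

    import struct

    def read_block(f, pos, block_size=12):
        # Fixed-width records give O(1) random access: jump straight
        # to the pos-th block instead of scanning the file.
        f.seek(pos * block_size, 0)
        data = f.read(block_size)
        # '>III' assumes three big-endian 4-byte unsigned ints; the
        # real on-disk layout is an assumption, not shown here.
        return struct.unpack('>III', data)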
@@ -174,6 +183,7 @@ class BSBIIndexer:
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
+                self.status = self.INDEX_STATUS_CREATED
                 self.tokens_map = pickle.load(f)
                 return True
@@ -182,3 +192,9 @@
             return False
 
         self.init_token_id_seq(max(self.tokens_map.keys()))
+
+    def get_index_size(self):
+        return os.stat(self.INDEX_FILE).st_size
+
+    def get_tokens_map_size(self):
+        return os.stat(self.TOKENS_MAP_FILE).st_size
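Both new size helpers are thin wrappers around os.stat: they report the on-disk footprint of the serialized index and tokens map in bytes, not the in-memory size of those structures. For instance (the path below is illustrative):

    import os

    # st_size is the file size in bytes.
    size = os.stat('.cache/example_index').st_size
    print('{:,} bytes'.format(size))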
 class Indexer:
+    INDEX_STATUS_NOT_CREATED = 0
+    INDEX_STATUS_CREATED = 1
+
     def add_documents_token(self, document, tokens):
         raise Exception('add_documents_token not implemented')
@@ -12,3 +14,9 @@ class Indexer:
     def load_from_cache(self):
         return Exception("load_from_cache not implemented")
+
+    def get_index_size(self):
+        return 0
+
+    def get_tokens_map_size(self):
+        return 0
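Two side notes on the base class: load_from_cache returns an Exception instead of raising it (a pre-existing quirk this commit leaves untouched), and the not-implemented pattern could also be expressed with Python's abc module. A sketch of that alternative (a design option, not what the repo does):

    from abc import ABC, abstractmethod

    class Indexer(ABC):
        INDEX_STATUS_NOT_CREATED = 0
        INDEX_STATUS_CREATED = 1

        @abstractmethod
        def add_documents_token(self, document, tokens): ...

        @abstractmethod
        def load_from_cache(self): ...

        # Concrete defaults remain possible, as with the size helpers:
        def get_index_size(self):
            return 0

        def get_tokens_map_size(self):
            return 0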
@@ -17,6 +17,7 @@ You don't know where to start ? Let me help you !
 * analyze all : I'll do some analyzes on the collection you gave me
+* index stats : I'll show you some statistics on the index
 * index build : I'll build the reversed index for you
 * index lookup <token> : I'll tell you in where documents your token is
@@ -24,7 +25,7 @@ You don't know where to start ? Let me help you !
     search -b,--boolean <query>    Your query must be in the Conjunctive normal form
                                    Like "a OR b OR c AND d" is "(a OR b OR c) AND d"
+* search -v,--vectorial <query> : Search using the vectorial model
 """
@@ -58,6 +59,8 @@ def build_cli_index_parser(root_parser):
         dest='build_no_cache'
     )
 
+    stats_parser = index_subparser.add_parser('stats', help='show stats about the index')
+
 def build_cli_search_parser(root_parser):
     search_parser = root_parser.add_parser('search', description='search for documents')
     search_parser.add_argument('-b', '--boolean', action='store_const', const='boolean', dest='search_query_type', help="use the booolean model")
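The new stats subparser is what makes the args.index_command == 'stats' branch in run_index_command reachable. A standalone sketch of the wiring (parser names mirror the diff; dest='index_command' is inferred from that check, not shown in this hunk):

    import argparse

    root = argparse.ArgumentParser(prog='gogole')
    subparsers = root.add_subparsers(dest='command')

    # 'index' gets its own subcommands, one of which is the new 'stats'.
    index_parser = subparsers.add_parser('index')
    index_subparser = index_parser.add_subparsers(dest='index_command')
    index_subparser.add_parser('stats', help='show stats about the index')

    args = root.parse_args(['index', 'stats'])
    assert args.index_command == 'stats'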