Commit c14592de authored by Dos Santos David

add index stats command

parent fa5cbd52
@@ -42,3 +42,13 @@ def run_index_command(collection, args):
print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))
else:
print('no result :(')
if args.index_command == 'stats':
if collection.indexer.status == collection.indexer.INDEX_STATUS_NOT_CREATED:
print('index not created')
return
print('index created\n')
print('Size of the index :\t\t\t{size:,} bytes'.format(size=collection.indexer.get_index_size()))
print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_tokens_map_size()))
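For reference, the {size:,} format spec only inserts thousands separators into the byte counts, e.g.:

>>> '{size:,} bytes'.format(size=1048576)
'1,048,576 bytes'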
@@ -6,8 +6,9 @@ import math
import pickle
from gogole.utils import timeit
from gogole.indexer import Indexer
-class BSBIIndexer:
+class BSBIIndexer(Indexer):
BLOCK_SIZE = 12
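# presumably the size of one postings entry on disk, e.g. three 4-byte integers (token_id, doc_id, frequency); _read_token_id seeks in multiples of it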
def __init__(self, collection_name, maxsize=None):
@@ -31,6 +32,8 @@ class BSBIIndexer:
self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
self.DOCUMENTS_MAP_FILE = '.cache/{}_documents_map'.format(collection_name)
self.status = self.INDEX_STATUS_NOT_CREATED
def init_token_id_seq(self, start=0):
self.token_id_seq = itertools.count(start=start)
@@ -45,6 +48,10 @@ class BSBIIndexer:
return self.tokens_map[token]
def cleanup(self):
for filename in self.tmp_filenames:
os.remove(filename)
def flush_buffer(self):
sorted_tuples = sorted(self.buffer)
@@ -107,6 +114,8 @@ class BSBIIndexer:
fp.close()
self.save_to_disk()
self.cleanup() # cleanup temporary files
self.status = self.INDEX_STATUS_CREATED
def _read_token_id(self, file, pos):
file.seek(pos*self.BLOCK_SIZE, 0)
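# fixed-size entries make the index file randomly addressable: entry pos starts at byte pos * BLOCK_SIZE, which is what a binary search over the sorted postings would rely on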
@@ -174,6 +183,7 @@ class BSBIIndexer:
def load_from_cache(self):
try:
with open(self.TOKENS_MAP_FILE, 'rb') as f:
self.tokens_map = pickle.load(f)
self.status = self.INDEX_STATUS_CREATED
return True
@@ -182,3 +192,9 @@ class BSBIIndexer:
return False
self.init_token_id_seq(max(self.tokens_map.values()) + 1)
def get_index_size(self):
return os.stat(self.INDEX_FILE).st_size
def get_tokens_map_size(self):
return os.stat(self.TOKENS_MAP_FILE).st_size
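Both helpers read on-disk sizes via os.stat(...).st_size, so they are only meaningful once the index files exist. A minimal usage sketch (the collection name is a placeholder, not taken from this commit):

indexer = BSBIIndexer('my_collection')
if indexer.load_from_cache():             # flips status to INDEX_STATUS_CREATED
    print(indexer.get_index_size())       # bytes in the binary postings file
    print(indexer.get_tokens_map_size())  # bytes in the pickled token->token_id map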
class Indexer:
INDEX_STATUS_NOT_CREATED = 0
INDEX_STATUS_CREATED = 1
def add_documents_token(self, document, tokens):
raise Exception('add_documents_token not implemented')
@@ -12,3 +14,9 @@ class Indexer:
def load_from_cache(self):
raise Exception("load_from_cache not implemented")
def get_index_size(self):
return 0
def get_tokens_map_size(self):
return 0
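Returning 0 by default lets the stats command degrade gracefully for indexers that keep nothing on disk. A hypothetical subclass (illustration only, not part of this commit) overrides just what it can measure:

class InMemoryIndexer(Indexer):
    def __init__(self):
        self.postings = {}  # token -> list of doc_ids
        self.status = self.INDEX_STATUS_NOT_CREATED

    def add_documents_token(self, document, tokens):
        for token in tokens:
            self.postings.setdefault(token, []).append(document)
        self.status = self.INDEX_STATUS_CREATED

    # get_index_size / get_tokens_map_size fall back to the base class's 0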
@@ -17,6 +17,7 @@ You don't know where to start? Let me help you!
* analyze all : I'll do some analyses on the collection you gave me
* index stats : I'll show you some statistics on the index
* index build : I'll build the reversed index for you
* index lookup <token> : I'll tell you in which documents your token appears
@@ -58,6 +59,8 @@ def build_cli_index_parser(root_parser):
dest='build_no_cache'
)
stats_parser = index_subparser.add_parser('stats', help='show stats about the index')
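stats takes no options of its own, so registering the sub-parser is enough for argparse to route "index stats" into run_index_command. A self-contained sketch of the nested-subparser pattern (parser names here are assumptions):

import argparse

root = argparse.ArgumentParser(prog='gogole')
commands = root.add_subparsers(dest='command')

index_parser = commands.add_parser('index', help='index commands')
index_subparser = index_parser.add_subparsers(dest='index_command')
index_subparser.add_parser('stats', help='show stats about the index')

args = root.parse_args(['index', 'stats'])
print(args.index_command)  # prints: stats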
def build_cli_search_parser(root_parser):
search_parser = root_parser.add_parser('search', description='search for documents')
search_parser.add_argument('-b', '--boolean', action='store_const', const='boolean', dest='search_query_type', help="use the boolean model")