Commit 99ff28e6 authored by Prot Alexandre's avatar Prot Alexandre
parents 78e8f3b4 6f878c84
FROM python:3.6-onbuild
......@@ -2,10 +2,12 @@ version: "2"
services:
app:
image: "python:3.6"
build: "./"
volumes:
- "./:/app"
working_dir: "/app"
command: "python3 main.py -f dumps/cacm.all"
- "data:/usr/src/app/data"
- "./:/usr/src/app"
working_dir: "/usr/src/app"
command: "./bin/gogole -c cacm"
volumes:
data:
from gogole.utils import timeit
class Collection:
    """
    Describe a standard collection of documents.

    Ties together the three components needed to index a corpus:
    a parser (reads raw documents), a tokenizer (splits each document
    into counted tokens) and an indexer (stores the postings).
    The components are injected after construction by the caller.
    """

    def __init__(self):
        self._parser = None     # type: gogole.parser.Parser
        self._indexer = None    # type: gogole.indexer.Indexer
        self._tokenizer = None  # type: gogole.tokenizer.Tokenizer

    @property
    def parser(self) -> 'gogole.parser.Parser':
        return self._parser

    @property
    def indexer(self) -> 'gogole.indexer.Indexer':
        return self._indexer

    @property
    def tokenizer(self) -> 'gogole.tokenizer.Tokenizer':
        return self._tokenizer

    @timeit
    def load_documents_in_indexer(self, limit=None):
        """
        Feed every parsed document into the indexer.

        :param limit: optional maximum number of documents to load
            (forwarded to the parser); None means no limit
        :returns: the number of documents loaded; note that @timeit
            wraps this into a (result, elapsed_ms) tuple for callers
        """
        nb_documents = 0

        for document in self.parser.find_documents(limit=limit):
            nb_documents += 1
            counted_tokens = self.tokenizer.get_counted_tokens(document)
            self.indexer.add_document_tokens(document, counted_tokens)

        return nb_documents
from gogole.commands import analyze_command, eval_command, index_command, search_command
from gogole.commands import analyze_command, eval_command, index_command, search_command, quit_command
# Dispatch table: CLI main-command name -> handler callable run(collection, args).
# The REPL looks up args.main_command here and invokes the matching handler.
MAIN_COMMANDS_MAP = {
    'analyze': analyze_command.run,
    'eval': eval_command.run,
    'index': index_command.run,
    'search': search_command.run,
    'quit': quit_command.run,
}
......@@ -9,22 +9,21 @@ def run(collection, args):
if not args.build_no_cache:
# try to load index from the disk
if indexer.load_from_cache():
print('cache loaded from disk')
ok, t = indexer.load_from_cache()
if ok:
print('cache loaded from disk in {elapsed_time:.2f} ms'.format(elapsed_time=t))
else:
build_index = True
if build_index:
# load the documents in the indexer
print('... loading the documents...')
nb_documents, elapsed_time = collection.load_documents_in_indexer()
count_documents = 0
for document in parser.find_documents(limit=None):
count_documents += 1
counted_tokens = tokenizer.get_counted_tokens(document)
indexer.add_document_tokens(document, counted_tokens)
print('... Loaded {nb_documents} documents in {elapsed_time:.2f} ms'.format(nb_documents=nb_documents, elapsed_time=elapsed_time))
print('... {} documents loaded'.format(count_documents))
print('... creating the index')
_,t = indexer.build_index()
print('... index created in {elapsed_time:.2f} ms'.format(elapsed_time=t))
......@@ -51,4 +50,5 @@ def run(collection, args):
print('index created\n')
print('Size of the index :\t\t\t{size:,} bytes'.format(size=collection.indexer.get_index_size()))
print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_tokens_map_size()))
print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_token_to_token_id_size()))
print('Size of document metadata file :\t{size:,} bytes'.format(size=collection.indexer.get_document_metadata_size()))
def run(collection, args):
    """
    Handler for the 'quit' command.

    Both arguments are ignored; quitting is implemented by raising
    KeyboardInterrupt, which the interactive loop catches in order to
    print a goodbye message and exit cleanly.
    """
    raise KeyboardInterrupt
......@@ -280,13 +280,14 @@ class BSBIIndexer(Indexer):
with open(self.TOKENS_MAP_FILE, 'wb') as f:
pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
@timeit
def load_from_cache(self):
try:
with open(self.TOKENS_MAP_FILE, 'rb') as f:
self.token_to_token_id = pickle.load(f)
self.status = self.INDEX_STATUS_CREATED
self.init_token_id_seq(max(self.token_to_token_id.keys()))
self.init_token_id_seq(start=max(self.token_to_token_id.values()))
return True
......@@ -299,3 +300,6 @@ class BSBIIndexer(Indexer):
def get_token_to_token_id_size(self):
    """Return the on-disk size, in bytes, of the token->token_id map file."""
    stat_result = os.stat(self.TOKENS_MAP_FILE)
    return stat_result.st_size
def get_document_metadata_size(self):
    """Return the on-disk size, in bytes, of the document-metadata file."""
    return os.path.getsize(self.DOCUMENT_METADATA_FILE)
import argparse
import sys
import traceback
from gogole import commands
from gogole.config import COLLECTIONS
......@@ -24,8 +26,11 @@ You don't know where to start ? Let me help you !
* search <query> : I'll show you the documents you need
search -b,--boolean <query> Your query must be in the Conjunctive normal form
Like "a OR b OR c AND d" is "(a OR b OR c) AND d"
* search -v,--vectorial <query> : Search using the vectorial model
* eval <n> : evaluate performances and limit evaluation to the n first requests
* quit
"""
......@@ -84,6 +89,9 @@ def build_cli_search_parser(root_parser):
search_parser.set_defaults(weight_type='tf-idf')
search_parser.set_defaults(search_query_type='boolean')
def build_cli_quit_parser(root_parser):
    """
    Register the 'quit' subcommand on the given subparsers object.

    :param root_parser: the argparse subparsers action returned by
        add_subparsers(); gains a bare 'quit' command with no options
    """
    # The returned sub-parser needs no arguments, so it is not kept.
    root_parser.add_parser('quit', description='quit')
def build_cli_parser():
# cli parser
cli_parser = argparse.ArgumentParser(prog="", add_help=False)
......@@ -93,6 +101,7 @@ def build_cli_parser():
build_cli_eval_parser(cli_subparser)
build_cli_index_parser(cli_subparser)
build_cli_search_parser(cli_subparser)
build_cli_quit_parser(cli_subparser)
return cli_parser
......@@ -116,11 +125,21 @@ if __name__ == "__main__":
cli_parser = build_cli_parser()
while True:
print('')
raw_input = input("gogole > ")
try:
print('')
raw_input = input("gogole > ")
args = cli_parser.parse_args(raw_input.split(' '))
commands.MAIN_COMMANDS_MAP[args.main_command](collection, args)
except SystemExit:
pass
except (KeyboardInterrupt, EOFError):
print('\nBye !\n\n\n')
sys.exit(0) # exit successfuly
except:
print('\nOuuups. Something is broken :/')
print('-'*60)
print(traceback.print_exc(file=sys.stdout))
print('-'*60)
......@@ -9,22 +9,22 @@ Voici l'analyse obtenue pour la collection CACM:
```
****************** Count tokens ******************
Total count of tokens : 108,447
Vocabulary size: 11,627
Total count of tokens : 110,398
Vocabulary size: 9,497
****** Count tokens for half the collection ******
Total count of tokens : 30,052
Vocabulary size: 6,049
Total count of tokens : 30,672
Vocabulary size: 5,299
******** Heap's law parameters estimation ********
b: 0.509
k: 31.7
b: 0.456
k: 47.9
estimation of vocabulary size for 1M tokens : 36034
estimation of vocabulary size for 1M tokens : 25917
```
Graphes pour la loi de Zipf :
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment