# index_command.py (1.9 KB)
# Extracted from a repository blame view; commits by Dos Santos David.
def run(collection, args):
    """Execute the index sub-command selected by ``args.index_command``.

    Recognised values are ``'build'``, ``'lookup'`` and ``'stats'``; any
    other value does nothing. All results are reported with ``print``.

    Args:
        collection: object exposing ``parser``, ``tokenizer`` and ``indexer``
            attributes.
        args: parsed command-line namespace. ``index_command`` is always read;
            ``build_no_cache`` is read for ``'build'`` and ``token`` (an
            indexable whose first item is the token) for ``'lookup'``.

    Returns:
        None.
    """
    # All three collaborators are read up front, whatever the command is.
    parser = collection.parser
    tokenizer = collection.tokenizer
    indexer = collection.indexer

    command = args.index_command
    if command == 'build':
        _build_index(parser, tokenizer, indexer, args.build_no_cache)
    elif command == 'lookup':
        _lookup_token(indexer, args.token[0])
    elif command == 'stats':
        _print_index_stats(indexer)


def _build_index(parser, tokenizer, indexer, no_cache):
    """Load the index from the cache when allowed, otherwise build it from the documents."""
    # The cache is only consulted when the caller did not ask to bypass it;
    # a successful load means there is nothing left to build.
    if not no_cache and indexer.load_from_cache():
        print('cache loaded from disk')
        return

    print('...   loading the documents...')

    count_documents = 0
    # enumerate(start=1) leaves the running total in count_documents; the
    # initial 0 above covers the case where no document is found.
    for count_documents, document in enumerate(parser.find_documents(limit=None), start=1):
        counted_tokens = tokenizer.get_counted_tokens(document)
        indexer.add_document_tokens(document, counted_tokens)

    print('...   {} documents loaded'.format(count_documents))

    # build_index() returns a pair; only the second item (the elapsed time)
    # is used. NOTE(review): it is printed as milliseconds — confirm the unit
    # against indexer.build_index.
    _, elapsed = indexer.build_index()
    print('...   index created in {elapsed_time:.2f} ms'.format(elapsed_time=elapsed))


def _lookup_token(indexer, token):
    """Print every document containing ``token`` together with its frequency."""
    doc_ids = indexer.token_lookup_with_frequency(token)
    if not doc_ids:
        print('no result :(')
        return

    print("{token} is present in {nb_docs} documents\n".format(
        token=token,
        nb_docs=len(doc_ids),
    ))
    for doc_id, frequency in doc_ids.items():
        print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))


def _print_index_stats(indexer):
    """Print whether the index exists and, if so, the sizes it reports."""
    if indexer.status == indexer.INDEX_STATUS_NOT_CREATED:
        print('index not created')
        return

    print('index created\n')

    print('Size of the index :\t\t\t{size:,} bytes'.format(size=indexer.get_index_size()))
    print('Size of the map token->token_id :\t{size:,} bytes'.format(size=indexer.get_tokens_map_size()))