# analyze.py
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law


class AnalyzeCommand:

    ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']


    @staticmethod
    def load_documents(collection_file, parser_cls=CACMParser, limit=None):
        # Parse the collection file and return a dict mapping doc_id -> document.
        parser = parser_cls(collection_file)
        return parser.parse(limit=limit)


    @classmethod
    def run_analyze(cls, args):
        commands = args.sub_command

        if 'all' in commands:
            commands = cls.ANALYZE_COMMANDS

        documents = cls.load_documents(args.file)

        tokenizer = SimpleTokenizer(args.stop_words_file)

        # Tokenize each document: doc_id -> list of tokens.
        tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items()}

        # Flatten the per-document token lists into a single list.
        all_tokens = [token for tokens in tokens_by_document.values() for token in tokens]

        if 'count_tokens' in commands or 'heap_law' in commands:
            print("{:*^50}\n".format(" Count tokens "))
            count_tokens = len(all_tokens)
            print("Total count of tokens : \t{:,}".format(count_tokens))

            vocabulary_size = len(set(all_tokens))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size))

            if 'heap_law' in commands:
                print("\n\n{:*^50}\n".format(" Count tokens for half the collection "))

                # Heaps' law has two unknown parameters, so a second measurement
                # point is taken on half the collection: keep every document whose
                # id is at most the median document id.
                median_doc_id = sorted(documents.keys())[len(documents) // 2]
                tokens_by_document_2 = {doc_id: tokens for doc_id, tokens in tokens_by_document.items() if doc_id <= median_doc_id}

                all_tokens_2 = [token for tokens in tokens_by_document_2.values() for token in tokens]

                count_tokens_2 = len(all_tokens_2)
                print("Total count of tokens : \t{:,}".format(count_tokens_2))

                vocabulary_size_2 = len(set(all_tokens_2))
                print("Vocabulary size: \t\t{:,}".format(vocabulary_size_2))

                b, k = heap_law.compute_parameters(count_tokens, vocabulary_size, count_tokens_2, vocabulary_size_2)

                print("\n\n{:*^50}\n".format(" Heaps' law parameter estimation "))
                print("b: \t{0:.3g}".format(b))
                print("k: \t{0:.3g}".format(k))

                print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))