from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law


class AnalyzeCommand:

    ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']

    @staticmethod
    def load_documents(collection_file, parser_cls=CACMParser, limit=None):
        parser = parser_cls(collection_file)
        return parser.parse(limit=limit)

    @classmethod
    def run_analyze(cls, args):
        commands = args.sub_command

        if 'all' in commands:
            commands = cls.ANALYZE_COMMANDS

        documents = cls.load_documents(args.file)
        tokenizer = SimpleTokenizer(args.stop_words_file)

        # tokenize every document of the collection
        tokens_by_document = {
            doc_id: tokenizer.get_tokens(doc)
            for doc_id, doc in documents.items()
        }
        all_tokens = [token for tokens in tokens_by_document.values() for token in tokens]

        # token / vocabulary counts are needed both on their own and for Heaps' law
        if 'count_tokens' in commands or 'heap_law' in commands:
            print("{:*^50}\n".format(" Count tokens "))

            count_tokens = len(all_tokens)
            print("Total count of tokens: \t{:,}".format(count_tokens))

            vocabulary_size = len(set(all_tokens))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size))

        if 'heap_law' in commands:
            print("\n\n{:*^50}\n".format(" Count tokens for half the collection "))

            # keep the lower half of the documents (by document id)
            median_doc_id = sorted(documents.keys())[len(documents.keys()) // 2]
            tokens_by_document_2 = {
                doc_id: tokens
                for doc_id, tokens in tokens_by_document.items()
                if doc_id <= median_doc_id
            }
            all_tokens_2 = [token for tokens in tokens_by_document_2.values() for token in tokens]

            count_tokens_2 = len(all_tokens_2)
            print("Total count of tokens: \t{:,}".format(count_tokens_2))

            vocabulary_size_2 = len(set(all_tokens_2))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size_2))

            # estimate the Heaps' law parameters from the two (tokens, vocabulary) measurements
            b, k = heap_law.compute_parameters(
                count_tokens, vocabulary_size,
                count_tokens_2, vocabulary_size_2,
            )

            print("\n\n{:*^50}\n".format(" Heaps' law parameters estimation "))
            print("b: \t{0:.3g}".format(b))
            print("k: \t{0:.3g}".format(k))

            print("\nEstimated vocabulary size for 1M tokens: {}".format(
                heap_law.estimate_vocabulary_size(b, k, 1000 * 1000)))
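
# The gogole.utils.heap_law helpers used above are not shown in this file. Below is a
# minimal sketch of what they might look like, assuming the usual Heaps' law model
# V = k * n^b (V = vocabulary size, n = number of tokens). This is an illustration of
# the estimation from two measurements, not the actual gogole.utils.heap_law code.
#
#     import math
#
#     def compute_parameters(tokens_1, vocab_1, tokens_2, vocab_2):
#         # Two measurements give two equations V_i = k * n_i^b; solve for b then k.
#         b = math.log(vocab_1 / vocab_2) / math.log(tokens_1 / tokens_2)
#         k = vocab_1 / (tokens_1 ** b)
#         return b, k
#
#     def estimate_vocabulary_size(b, k, token_count):
#         # Extrapolate V = k * n^b to a collection of token_count tokens.
#         return int(k * token_count ** b)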