Commit 6a2ab440 authored by Dos Santos David

build bsbi index

parent 941d7c2a
gogole/commands/analyze.py (old class-based version, removed in this commit):

from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law


class AnalyzeCommand:

    ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']

    @staticmethod
    def load_documents(collection_file, parser_cls=CACMParser, limit=None):
        parser = parser_cls(collection_file)
        return parser.parse(limit=limit)

    @classmethod
    def run_analyze(cls, args):
        commands = args.sub_command

        if 'all' in commands:
            commands = cls.ANALYZE_COMMANDS

        documents = cls.load_documents(args.file)
        tokenizer = SimpleTokenizer(args.stop_words_file)

        tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items()}
        all_tokens = [token for tokens in tokens_by_document.values() for token in tokens]

        if 'count_tokens' in commands or 'heap_law' in commands:
            print("{:*^50}\n".format(" Count tokens "))

            count_tokens = len(all_tokens)
            print("Total count of tokens : \t{:,}".format(count_tokens))

            vocabulary_size = len(set(all_tokens))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size))

        if 'heap_law' in commands:
            print("\n\n{:*^50}\n".format(" Count tokens for half the collection "))

            # get half the documents
            median_doc_id = sorted(documents.keys())[len(documents.keys())//2]
            tokens_by_document_2 = {doc_id: tokens for doc_id, tokens in tokens_by_document.items() if doc_id <= median_doc_id}
            all_tokens_2 = [token for tokens in tokens_by_document_2.values() for token in tokens]

            count_tokens_2 = len(all_tokens_2)
            print("Total count of tokens : \t{:,}".format(count_tokens_2))

            vocabulary_size_2 = len(set(all_tokens_2))
            print("Vocabulary size: \t\t{:,}".format(vocabulary_size_2))

            b, k = heap_law.compute_parameters(count_tokens, vocabulary_size, count_tokens_2, vocabulary_size_2)

            print("\n\n{:*^50}\n".format(" Heap's law parameters estimation "))
            print("b: \t{0:.3g}".format(b))
            print("k: \t{0:.3g}".format(k))
            print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))
gogole/commands/analyze_command.py (new module-level version):

from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law

ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']


def load_documents(collection_file, parser_cls=CACMParser, limit=None):
    parser = parser_cls(collection_file)
    return parser.parse_all(limit=limit)


def run_analyze(args):
    commands = args.sub_command

    if 'all' in commands:
        commands = ANALYZE_COMMANDS

    documents = load_documents(args.file)
    tokenizer = SimpleTokenizer(args.stop_words_file)

    tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items()}
    all_tokens = [token for tokens in tokens_by_document.values() for token in tokens]

    if 'count_tokens' in commands or 'heap_law' in commands:
        print("{:*^50}\n".format(" Count tokens "))

        count_tokens = len(all_tokens)
        print("Total count of tokens : \t{:,}".format(count_tokens))

        vocabulary_size = len(set(all_tokens))
        print("Vocabulary size: \t\t{:,}".format(vocabulary_size))

    if 'heap_law' in commands:
        print("\n\n{:*^50}\n".format(" Count tokens for half the collection "))

        # get half the documents (those whose id is below the median id)
        median_doc_id = sorted(documents.keys())[len(documents.keys())//2]
        tokens_by_document_2 = {doc_id: tokens for doc_id, tokens in tokens_by_document.items() if doc_id <= median_doc_id}
        all_tokens_2 = [token for tokens in tokens_by_document_2.values() for token in tokens]

        count_tokens_2 = len(all_tokens_2)
        print("Total count of tokens : \t{:,}".format(count_tokens_2))

        vocabulary_size_2 = len(set(all_tokens_2))
        print("Vocabulary size: \t\t{:,}".format(vocabulary_size_2))

        b, k = heap_law.compute_parameters(count_tokens, vocabulary_size, count_tokens_2, vocabulary_size_2)

        print("\n\n{:*^50}\n".format(" Heap's law parameters estimation "))
        print("b: \t{0:.3g}".format(b))
        print("k: \t{0:.3g}".format(k))
        print("\nestimation of vocabulary size for 1M tokens : {}".format(heap_law.estimate_vocabulary_size(b, k, 1000*1000)))
gogole/commands/build_command.py:

from gogole.indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer


#
# Build the inverted index
#
def run_build(args):
    parser = CACMParser(args.file)
    tokenizer = SimpleTokenizer(args.stop_words_file)
    indexer = BSBIIndexer(maxsize=128)

    for document in parser.find_documents(limit=1000):
        tokens = tokenizer.get_tokens(document)
        indexer.add_document_tokens(document.document_id, tokens)

    indexer.build_index()

    print('Inverted index has been built')
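
run_build needs the CACM collection and a stop-word file on disk. For a quick check of the same indexer calls, a self-contained toy run might look like the following; the document ids and tokens are made up, and the .cache/ directory that BSBIIndexer writes into is created up front because the class assumes it exists:

import os

from gogole.indexer import BSBIIndexer

os.makedirs('.cache', exist_ok=True)   # BSBIIndexer writes .cache/tmp_index_* and .cache/index

indexer = BSBIIndexer(maxsize=128)
indexer.add_document_tokens(1, ['information', 'retrieval', 'system'])
indexer.add_document_tokens(2, ['retrieval', 'evaluation'])
indexer.build_index()                  # merges the sorted runs into .cache/index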
gogole/indexer/__init__.py:

from gogole.indexer.indexer import Indexer
from gogole.indexer.simple_indexer import SimpleIndexer
from gogole.indexer.bsbi_indexer import BSBIIndexer
gogole/indexer/bsbi_indexer.py:

import struct
import itertools
import heapq
import os
import math


class BSBIIndexer:
    BLOCK_SIZE = 4  # bytes per posting: 2 for the token id + 2 for the document id

    def __init__(self, maxsize=None):
        """
        :param maxsize: max size of the buffer (in bytes)
        """
        self.tokens_map = dict()
        self.token_id_seq = itertools.count()  # next token id
        self.maxsize = maxsize
        self.buffer = []

        self.tmp_filename_format = '.cache/tmp_index_{}'
        self.tmp_filenames = []
        self.tmp_file_id_seq = itertools.count()

    def find_or_create_token_id(self, token):
        if token not in self.tokens_map:
            token_id = next(self.token_id_seq)
            self.tokens_map[token] = token_id
            return token_id
        else:
            return self.tokens_map[token]

    def flush_buffer(self):
        sorted_tuples = sorted(self.buffer)

        filename = self.tmp_filename_format.format(next(self.tmp_file_id_seq))

        with open(filename, 'wb') as f:
            self.tmp_filenames.append(f.name)
            for token_id, doc_id in sorted_tuples:
                # assume we already are at the end of the file
                b = bytearray()
                # big-endian unsigned shorts (2 bytes each, up to 65,535) so that the
                # lexicographic byte order used during the merge matches the numeric
                # (token_id, doc_id) order of the sorted runs
                b += struct.pack('>H', token_id)
                b += struct.pack('>H', doc_id)
                f.write(b)

        # reset the buffer
        self.buffer = []

    def add_document_tokens(self, doc_id, tokens):
        # convert tokens to token ids; the set keeps one posting per distinct token
        token_ids = set()
        for token in tokens:
            token_ids.add(self.find_or_create_token_id(token))

        for token_id in token_ids:
            self.buffer.append((token_id, doc_id))

        # each buffered posting takes 2 + 2 bytes once packed
        if self.maxsize is not None and 2*2*len(self.buffer) >= self.maxsize:
            self.flush_buffer()

    def _read_in_chunks(self, f, blocksize=4):
        # yield the file one posting (BLOCK_SIZE bytes) at a time
        while True:
            data = f.read(blocksize)
            if not data:
                break
            yield data

    def build_index(self):
        # 1/ flush what is left in the buffer
        self.flush_buffer()

        tmp_files = [open(f, 'rb') for f in self.tmp_filenames]
        try:
            # k-way merge of the sorted runs, one 4-byte posting at a time
            merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
            with open('.cache/index', 'wb') as f:
                for t in merged_tuples_iterator:
                    # TODO: maybe write by block ?
                    f.write(t)
        finally:
            for fp in tmp_files:
                fp.close()

    def token_lookup(self, token):
        """
        Returns a list of documents
        where a given token is present

        :param token: token to search in documents
        """
        pass
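
token_lookup is left as a stub in this commit. One possible implementation, shown here purely as an illustration, binary-searches the fixed-width .cache/index file for the first posting of the token and then scans the contiguous postings that follow. The function name token_lookup_sketch and the index_path default are hypothetical; the sketch assumes the big-endian '>HH' layout written by flush_buffer above and that tokens_map is still in memory:

import os
import struct


def token_lookup_sketch(indexer, token, index_path='.cache/index'):
    if token not in indexer.tokens_map:
        return []
    token_id = indexer.tokens_map[token]
    block = indexer.BLOCK_SIZE
    doc_ids = []
    with open(index_path, 'rb') as f:
        n_postings = os.path.getsize(index_path) // block
        # binary search for the first posting whose token id is >= token_id
        lo, hi = 0, n_postings
        while lo < hi:
            mid = (lo + hi) // 2
            f.seek(mid * block)
            tid, _ = struct.unpack('>HH', f.read(block))
            if tid < token_id:
                lo = mid + 1
            else:
                hi = mid
        # collect the contiguous postings for this token id
        f.seek(lo * block)
        while True:
            data = f.read(block)
            if len(data) < block:
                break
            tid, doc_id = struct.unpack('>HH', data)
            if tid != token_id:
                break
            doc_ids.append(doc_id)
    return doc_ids

With the toy run shown after build_command above, token_lookup_sketch(indexer, 'retrieval') should return [1, 2].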
gogole/indexer/indexer.py:

class Indexer:

    def add_document_tokens(self, doc_id, tokens):
        raise NotImplementedError('add_document_tokens not implemented')

    def build_index(self):
        raise NotImplementedError('build_index not implemented')

    def token_lookup(self, token):
        raise NotImplementedError('token_lookup not implemented')
gogole/indexer/simple_indexer.py:

from collections import defaultdict

from gogole.indexer.indexer import Indexer


class SimpleIndexer(Indexer):

    def __init__(self):
        self.tokens_document_map = defaultdict(list)

    def add_document_tokens(self, doc_id, tokens):
        for token in tokens:
            # append the document id to this token's posting list
            self.tokens_document_map[token].append(doc_id)

    def build_index(self):
        pass

    def token_lookup(self, token):
        return self.tokens_document_map.get(token, [])
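
For comparison with the disk-based BSBI version, the in-memory indexer can be exercised directly; the document ids and tokens below are made up:

from gogole.indexer import SimpleIndexer

indexer = SimpleIndexer()
indexer.add_document_tokens(1, ['information', 'retrieval'])
indexer.add_document_tokens(2, ['retrieval', 'evaluation'])
print(indexer.token_lookup('retrieval'))   # [1, 2]
print(indexer.token_lookup('missing'))     # []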
gogole/parser/cacm_parser.py:

@@ -23,42 +23,51 @@ class CACMParser:
         """
         return next((m for m in self.ALL_MARKERS if line.startswith(m)), None)

-    def parse(self, limit=None):
-        documents = dict()
-        with open(self.filename, "r") as f:
+    def find_documents(self, limit=None):
+        counter = 0  # count documents found
+        with open(self.filename, 'r') as f:
             current_document_id = None
+            current_document = None
             current_marker = None
             buffer = ""

             for line in f:
                 # find the current marker
                 marker = self.find_marker_in_line(line)

                 if marker == self.DOCUMENT_MARKER:
                     # this is a new document
+                    if current_document is not None:
+                        yield current_document
+
                     current_document_id = int(line[3:])
-                    if limit is not None \
-                            and current_document_id >= limit:
-                        break
+                    if limit is not None and counter >= limit:
+                        return

-                    documents[current_document_id] = Document(current_document_id)
+                    current_document = Document(current_document_id)
+                    counter += 1

                 elif marker is not None:
                     # new marker
                     if current_marker is not None and current_marker in self.MARKERS.keys():
                         setattr(
-                            documents[current_document_id],
+                            current_document,
                             self.MARKERS[current_marker],
                             buffer)

                     # reset the buffer
                     buffer = ""
                     current_marker = marker
                 else:
                     buffer += line

-        return documents
+        yield current_document
+
+    def parse_all(self, limit=None):
+        return {doc.document_id: doc for doc in self.find_documents(limit)}
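
After this change the parser can either stream documents or materialize them all at once; a small illustration, with a made-up collection path:

from gogole.parser.cacm_parser import CACMParser

parser = CACMParser('data/cacm.all')          # hypothetical path to the CACM collection

# streaming: Document objects are yielded one at a time, as used by run_build
for document in parser.find_documents(limit=10):
    print(document.document_id)

# materialized: a {document_id: Document} dict, as used by run_analyze via parse_all
documents = parser.parse_all(limit=10)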
CLI entry point:

 import argparse

-from gogole.commands.analyze import AnalyzeCommand
+from gogole.commands import analyze_command, build_command

@@ -21,11 +21,17 @@ def main():
     argsparser_analyze.add_argument(
         "sub_command",
         nargs='*',
-        choices=AnalyzeCommand.ANALYZE_COMMANDS,
+        choices=analyze_command.ANALYZE_COMMANDS,
         metavar="analyze_type",
-        help="can be any of {}".format(", ".join(AnalyzeCommand.ANALYZE_COMMANDS))
+        help="can be any of {}".format(", ".join(analyze_command.ANALYZE_COMMANDS))
     )

-    argsparser_analyze.set_defaults(func=AnalyzeCommand.run_analyze)
+    argsparser_build = subparsers.add_parser(
+        'build'
+    )
+
+    argsparser_analyze.set_defaults(func=analyze_command.run_analyze)
+    argsparser_build.set_defaults(func=build_command.run_build)

     args = argsparser.parse_args()
     args.func(args)