Commit 7fc98aa9 authored by Dos Santos David

provide collection name to the indexer

The indexer uses the collection name to build the names of the files
it stores on disk.
parent d4a521b5
......@@ -7,7 +7,7 @@ from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class CACMCollection(Collection):
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer(maxsize=None)
self._indexer = BSBIIndexer('cacm', maxsize=None)
self._parser = CACMParser()
......
......@@ -7,7 +7,7 @@ from gogole.tokenizer.no_tokenizer import NoTokenizer
class StanfordCollection(Collection):
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer(maxsize=16*1024*1024)
self._indexer = BSBIIndexer('stanford', maxsize=16*1024*1024)
self._parser = StanfordCollection()
......
......@@ -10,11 +10,7 @@ from gogole.utils import timeit
class BSBIIndexer:
BLOCK_SIZE = 8
INDEX_FILE = '.cache/index'
TOKENS_MAP_FILE = '.cache/tokens_map'
DOCUMENTS_MAP_FILE = '.cache/documents_map'
def __init__(self, maxsize=None):
def __init__(self, collection_name, maxsize=None):
"""
:param maxsize: max size of the buffer (in bytes)
"""
......@@ -25,10 +21,14 @@ class BSBIIndexer:
self.maxsize = maxsize
self.buffer = []
self.tmp_filename_format='.cache/tmp_index_{}'
self.tmp_filename_format='.cache/{}_tmp_index_{{}}'.format(collection_name)
self.tmp_filenames = []
self.tmp_file_id_seq = itertools.count()
self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
self.DOCUMENTS_MAP_FILE = '.cache/{}_documents_map'.format(collection_name)
def init_token_id_seq(self, start=0):
self.token_id_seq = itertools.count(start=start)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment