Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

Commit c9a0c858 authored by Dos Santos David

refactor code using collection class

parent d8871cb7
from gogole.collection.collection import Collection
from gogole.collection.cacm_collection import CACMCollection
from gogole.collection.stanford_collection import StanfordCollection
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class CACMCollection(Collection):
    """Collection bundling the components used for the CACM corpus.

    On construction it assigns a ``BSBIIndexer`` created with
    ``maxsize=None``, a ``CACMParser`` and a ``SimpleTokenizer`` to the
    private attributes exposed by the ``Collection`` base class.
    """

    def __init__(self):
        # Run the base initialiser first so that any state it sets up exists
        # before the CACM-specific components overwrite the placeholders.
        super().__init__()

        # BSBI indexer with a single block (no size limit is passed).
        self._indexer = BSBIIndexer(maxsize=None)
        self._parser = CACMParser()
        self._tokenizer = SimpleTokenizer()
class Collection:
    """Describe a standard collection.

    A collection groups three components: a parser, an indexer and a
    tokenizer. The base class initialises all three to ``None``; subclasses
    are expected to assign concrete instances to ``_parser``, ``_indexer``
    and ``_tokenizer`` in their own ``__init__``.

    The components are exposed as read-only properties so that callers can
    write ``collection.parser`` rather than ``collection.parser()``.
    """

    def __init__(self):
        self._parser = None
        self._indexer = None
        self._tokenizer = None

    @property
    def parser(self):
        """Return the parser assigned to this collection (``None`` if unset)."""
        return self._parser

    @property
    def indexer(self):
        """Return the indexer assigned to this collection (``None`` if unset)."""
        return self._indexer

    @property
    def tokenizer(self):
        """Return the tokenizer assigned to this collection (``None`` if unset)."""
        return self._tokenizer
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.stanford_parser import StanfordParser
from gogole.tokenizer.no_tokenizer import NoTokenizer
class StanfordCollection(Collection):
    """Collection bundling the components used for the Stanford corpus.

    On construction it assigns a ``BSBIIndexer`` created with
    ``maxsize=16 * 1024 * 1024``, a ``StanfordParser`` and a ``NoTokenizer``
    to the private attributes exposed by the ``Collection`` base class.
    """

    def __init__(self):
        # Run the base initialiser first so that any state it sets up exists
        # before the Stanford-specific components overwrite the placeholders.
        super().__init__()

        # BSBI indexer with a bounded block size.
        # NOTE(review): the unit of ``maxsize`` is defined by BSBIIndexer,
        # which is not visible here -- confirm before changing the value.
        self._indexer = BSBIIndexer(maxsize=16 * 1024 * 1024)
        # Must be the parser, not this class: instantiating
        # StanfordCollection here would recurse without end.
        self._parser = StanfordParser()
        self._tokenizer = NoTokenizer()
...@@ -7,7 +7,11 @@ from gogole.utils import heap_law ...@@ -7,7 +7,11 @@ from gogole.utils import heap_law
COMMANDS = ['all', 'count_tokens', 'heap_law'] COMMANDS = ['all', 'count_tokens', 'heap_law']
def run_analyze_command(parser, tokenizer, index, args): def run_analyze_command(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
commands = args.analyze_command commands = args.analyze_command
if 'all' in commands: if 'all' in commands:
def run_index_command(parser, tokenizer, indexer, args): def run_index_command(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
indexer = collection.indexer
if args.index_command == 'build': if args.index_command == 'build':
print('... loading the documents...') print('... loading the documents...')
from gogole.parser import CACMParser, StanfordParser from gogole import collection
COLLECTIONS = {"cacm": CACMParser, "stanford": StanfordParser} COLLECTIONS = {
"cacm": collection.CACMCollection,
"stanford": collection.StanfordCollection
...@@ -74,23 +74,18 @@ if __name__ == "__main__": ...@@ -74,23 +74,18 @@ if __name__ == "__main__":
help="collection to use") help="collection to use")
main_args = main_parser.parse_args() main_args = main_parser.parse_args()
parser_cls = COLLECTIONS[main_args.collection] collection = COLLECTIONS[main_args.collection]()
cli_parser = build_cli_parser() cli_parser = build_cli_parser()
parser = parser_cls()
tokenizer = NoTokenizer()
indexer = BSBIIndexer()
while True: while True:
print('') print('')
raw_input = input("gogole > ") raw_input = input("gogole > ")
try: try:
args = cli_parser.parse_args(raw_input.split(' ')) args = cli_parser.parse_args(raw_input.split(' '))
commands.MAIN_COMMANDS_MAP[args.main_command](parser, tokenizer, indexer, args) commands.MAIN_COMMANDS_MAP[args.main_command](collection, args)
except SystemExit: except SystemExit:
pass pass
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment