Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

Commit c9a0c858 authored by Dos Santos David

refactor code using collection class

parent d8871cb7
from gogole.collection.collection import Collection
from gogole.collection.cacm_collection import CACMCollection
from gogole.collection.stanford_collection import StanfordCollection
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class CACMCollection(Collection):
    """Collection that uses the CACM parser, a simple tokenizer and a BSBI indexer."""

    def __init__(self):
        # Run the base initialiser first so every attribute it defines exists
        # before the concrete components are assigned below.
        super().__init__()
        # BSBI indexer with single block (no maxsize limit is passed).
        self._indexer = BSBIIndexer(maxsize=None)
        self._parser = CACMParser()
        self._tokenizer = SimpleTokenizer()
class Collection:
    """Describe a standard collection.

    A collection bundles a parser, an indexer and a tokenizer. This base
    class leaves all three set to ``None``; subclasses are expected to
    assign concrete instances to the private attributes in ``__init__``.
    """

    def __init__(self):
        self._parser = None
        self._indexer = None
        self._tokenizer = None

    # The accessors are read-only properties because callers read them as
    # plain attributes (``collection.parser``), not as method calls.
    @property
    def parser(self):
        """Return the parser attached to this collection (``None`` if unset)."""
        return self._parser

    @property
    def indexer(self):
        """Return the indexer attached to this collection (``None`` if unset)."""
        return self._indexer

    @property
    def tokenizer(self):
        """Return the tokenizer attached to this collection (``None`` if unset)."""
        return self._tokenizer
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.stanford_parser import StanfordParser
from gogole.tokenizer.no_tokenizer import NoTokenizer
class StanfordCollection(Collection):
    """Collection that uses the Stanford parser, no tokenizer and a BSBI indexer."""

    def __init__(self):
        # Run the base initialiser first so every attribute it defines exists
        # before the concrete components are assigned below.
        super().__init__()
        # BSBI indexer with an explicit maxsize of 16 * 1024 * 1024
        # (presumably a block size in bytes, i.e. 16 MiB; confirm against BSBIIndexer).
        self._indexer = BSBIIndexer(maxsize=16 * 1024 * 1024)
        # Must be the parser: instantiating StanfordCollection here would
        # re-enter this __init__ and recurse until RecursionError.
        self._parser = StanfordParser()
        self._tokenizer = NoTokenizer()
......@@ -7,7 +7,11 @@ from gogole.utils import heap_law
COMMANDS = ['all', 'count_tokens', 'heap_law']
def run_analyze_command(parser, tokenizer, index, args):
def run_analyze_command(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
commands = args.analyze_command
if 'all' in commands:
def run_index_command(parser, tokenizer, indexer, args):
def run_index_command(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
indexer = collection.indexer
if args.index_command == 'build':
print('... loading the documents...')
from gogole.parser import CACMParser, StanfordParser
from gogole import collection
COLLECTIONS = {"cacm": CACMParser, "stanford": StanfordParser}
"cacm": collection.CACMCollection,
"stanford": collection.StanfordCollection
......@@ -74,23 +74,18 @@ if __name__ == "__main__":
help="collection to use")
main_args = main_parser.parse_args()
parser_cls = COLLECTIONS[main_args.collection]
collection = COLLECTIONS[main_args.collection]()
cli_parser = build_cli_parser()
parser = parser_cls()
tokenizer = NoTokenizer()
indexer = BSBIIndexer()
while True:
raw_input = input("gogole > ")
args = cli_parser.parse_args(raw_input.split(' '))
commands.MAIN_COMMANDS_MAP[args.main_command](parser, tokenizer, indexer, args)
commands.MAIN_COMMANDS_MAP[args.main_command](collection, args)
except SystemExit:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment