Commit 091334e3 authored by Dos Santos David

build cli

parent f5b729ed
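In short: this commit turns the one-shot argparse CLI into an interactive prompt. Argument parsing moves into build_cli_analyze_parser / build_cli_index_parser helpers, build_command.py gives way to an index command with build and lookup sub-commands, and parsed commands are dispatched through the new commands.MAIN_COMMANDS_MAP with a shared parser, tokenizer, and indexer.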
gogole/commands/__init__.py:

+from gogole.commands import analyze_command
+from gogole.commands import index_command
+
+MAIN_COMMANDS_MAP = {
+    'analyze': analyze_command.run_analyze_command,
+    'index': index_command.run_index_command
+}
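Every handler in the map takes the same (parser, tokenizer, indexer, args) signature; the REPL at the bottom of this diff builds those three objects once, resolving the parser class through COLLECTIONS from gogole.config. That module is not part of this commit; a minimal sketch consistent with how it is used might be:

# gogole/config.py -- hypothetical sketch, NOT part of this commit.
# The entry point does COLLECTIONS[main_args.collection]() to get a parser
# instance, so the dict has to map a collection name to a parser class.
from gogole.parser.cacm_parser import CACMParser

COLLECTIONS = {
    'cacm': CACMParser,
}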
gogole/commands/analyze_command.py:

 from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
 from gogole.parser.cacm_parser import CACMParser
 from gogole.utils import heap_law
-from gogole.config import COLLECTIONS

-ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']
+COMMANDS = ['all', 'count_tokens', 'heap_law']

-def load_documents(parser_cls, limit=None):
-    parser = parser_cls()
-    return parser.parse_all(limit=limit)
-
-def run_analyze(args):
-    parser_cls = COLLECTIONS[args.collection]
-    commands = args.sub_command
+def run_analyze_command(parser, tokenizer, index, args):
+    commands = args.analyze_command

     if 'all' in commands:
-        commands = ANALYZE_COMMANDS
+        commands = COMMANDS

-    documents = load_documents(parser_cls)
-    tokenizer = SimpleTokenizer(args.stop_words_file)
+    documents = parser.parse_all(limit=None)
     tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items()}
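The heap_law util imported here is not shown in this commit. Heaps' law models vocabulary growth as V = k * n**b (V distinct tokens after n total tokens), so with the per-document token counts computed above, the two parameters can be estimated from two measurement points. A rough sketch of such a helper, assuming that interface:

# Hypothetical helper in the spirit of gogole.utils.heap_law -- the real
# implementation is not part of this diff.
import math

def estimate_heap_law(n1, v1, n2, v2):
    """Fit V = k * n**b through two (total tokens, vocabulary) measurements."""
    # Two points give two equations; solve for b on a log-log scale,
    # then back out k from the first point.
    b = (math.log(v2) - math.log(v1)) / (math.log(n2) - math.log(n1))
    k = v1 / n1 ** b
    return k, b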
gogole/commands/build_command.py (dropped in favour of index_command.py below):

-from gogole.indexer import BSBIIndexer
-from gogole.parser.cacm_parser import CACMParser
-from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
-
-#
-# Build the reversed index
-#
-def run_build(args):
-    parser = CACMParser()
-    tokenizer = SimpleTokenizer(args.stop_words_file)
-    indexer = BSBIIndexer(maxsize=128)
-
-    for document in parser.find_documents(limit=1000):
-        tokens = tokenizer.get_tokens(document)
-        indexer.add_document_tokens(document.document_id, tokens)
-
-    indexer.build_index()
-
-    print('Reversed index has been built')
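The SimpleTokenizer used by both the removed command and its replacement is outside this diff too. The old code passed args.stop_words_file while the new REPL constructs SimpleTokenizer() with no arguments, so the stop-words file must be optional; a sketch consistent with both call sites:

# Hypothetical sketch of SimpleTokenizer -- the real
# gogole/tokenizer/simple_tokenizer.py is not in this diff.
import re

class SimpleTokenizer:
    def __init__(self, stop_words_file=None):
        self.stop_words = set()
        if stop_words_file is not None:
            with open(stop_words_file) as f:
                self.stop_words = {line.strip().lower() for line in f if line.strip()}

    def get_tokens(self, document):
        # Assumes the parsed document exposes some raw text attribute;
        # the attribute name here is a guess.
        words = re.findall(r'[a-z0-9]+', document.text.lower())
        return [w for w in words if w not in self.stop_words]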
gogole/commands/index_command.py:

+def run_index_command(parser, tokenizer, indexer, args):
+    if args.index_command == 'build':
+        print('loading the documents...')
+
+        count_documents = 0
+        for document in parser.find_documents(limit=10):
+            count_documents += 1
+            tokens = tokenizer.get_tokens(document)
+            indexer.add_document_tokens(document.document_id, tokens)
+
+        print('{} documents loaded'.format(count_documents))
+
+        indexer.build_index()
+        print('index built')
+
+    if args.index_command == 'lookup':
+        doc_ids = indexer.token_lookup(args.token[0])
+
+        if doc_ids:
+            print(doc_ids)
+        else:
+            print('no result :(')
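The BSBIIndexer itself is also not shown; the name suggests blocked sort-based indexing, where postings are accumulated in fixed-size blocks (hence maxsize=128 in the removed build command) and merged on disk. For illustration only, an in-memory stand-in with the same three-method interface the commands rely on:

# Minimal in-memory stand-in for the BSBIIndexer interface used above;
# the real gogole/indexer.py is not part of this diff and presumably
# sorts and merges postings blocks on disk.
from collections import defaultdict

class InMemoryIndexer:
    def __init__(self):
        self._postings = defaultdict(set)   # token -> set of document ids

    def add_document_tokens(self, doc_id, tokens):
        for token in tokens:
            self._postings[token].add(doc_id)

    def build_index(self):
        # Nothing to merge in memory; a real BSBI implementation merges
        # its on-disk blocks here.
        pass

    def token_lookup(self, token):
        return sorted(self._postings.get(token, set()))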
CLI entry point:

 import argparse

-from gogole.commands import analyze_command, build_command
+from gogole import commands
+from gogole.config import COLLECTIONS
+from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
+from gogole.indexer import BSBIIndexer

-def main():
-    argsparser = argparse.ArgumentParser(prog="gogole")
-
-    # top-level parser
-    # collection information
-    argsparser.add_argument(
-        "-c", "--collection",
-        help="collection to use")
-
-    argsparser.add_argument("--stop-words-file", help="stop words list filename")
-
-    subparsers = argsparser.add_subparsers()
-
-    argsparser_analyze = subparsers.add_parser(
+def build_cli_analyze_parser(root_parser):
+    args_parser_analyze = root_parser.add_parser(
         'analyze',
         description="Run an analyze like tokens count or find heap's law parameters",
         help="additional help for analyze",
     )

-    argsparser_analyze.add_argument(
-        "sub_command",
+    args_parser_analyze.add_argument("analyze_command",
         nargs='*',
-        choices=analyze_command.ANALYZE_COMMANDS,
-        metavar="analyze_type",
-        help="can be any of {}".format(", ".join(analyze_command.ANALYZE_COMMANDS))
+        choices=commands.analyze_command.COMMANDS,
+        metavar="command",
+        help="can be any of {}".format(", ".join(commands.analyze_command.COMMANDS))
     )

-    argsparser_build = subparsers.add_parser(
-        'build'
-    )
-
-    argsparser_analyze.set_defaults(func=analyze_command.run_analyze)
-    argsparser_build.set_defaults(func=build_command.run_build)
-
-    args = argsparser.parse_args()
-    args.func(args)
+def build_cli_index_parser(root_parser):
+    index_parser = root_parser.add_parser('index', description="run commands on the index", aliases=['i'])
+    index_subparser = index_parser.add_subparsers(dest="index_command")
+
+    lookup_parser = index_subparser.add_parser('lookup', help="find the documents where a token is")
+    lookup_parser.add_argument('token', nargs=1)
+
+    build_parser = index_subparser.add_parser('build', help="build the index")
+
+def build_cli_parser():
+    # cli parser
+    cli_parser = argparse.ArgumentParser(prog="", add_help=False)
+    cli_subparser = cli_parser.add_subparsers(dest="main_command")
+
+    build_cli_analyze_parser(cli_subparser)
+    build_cli_index_parser(cli_subparser)
+
+    return cli_parser
+
+def main():
+    print('not supported yet')

 if __name__ == "__main__":
-    main()
+    # top-level parser
+    # mainly collection information and stop-words
+    main_parser = argparse.ArgumentParser(prog="gogole")
+    main_parser.add_argument(
+        "-c", "--collection",
+        help="collection to use")
+
+    main_args = main_parser.parse_args()
+    parser_cls = COLLECTIONS[main_args.collection]
+
+    print("Welcome to Gogole !")
+
+    cli_parser = build_cli_parser()
+
+    parser = parser_cls()
+    tokenizer = SimpleTokenizer()
+    indexer = BSBIIndexer()
+
+    while True:
+        raw_input = input("google > ")
+
+        try:
+            args = cli_parser.parse_args(raw_input.split(' '))
+            commands.MAIN_COMMANDS_MAP[args.main_command](parser, tokenizer, indexer, args)
+        except SystemExit:
+            pass
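Since argparse calls sys.exit() on bad input and on -h, the except SystemExit: pass is what keeps the prompt alive after an error. Assuming a cacm entry in COLLECTIONS, a session could look like this (output values illustrative; note the prompt literal in the code reads 'google > ' even though the program is gogole):

Welcome to Gogole !
google > index build
loading the documents...
10 documents loaded
index built
google > index lookup algorithm
{1, 7, 42}
google > index lookup zzz
no result :(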