Commit 091334e3 authored by Dos Santos David

build cli

parent f5b729ed
from gogole.commands import analyze_command
from gogole.commands import index_command
# Dispatch table for the interactive CLI: maps the top-level command name
# (argparse dest `main_command`) to its handler. Each handler is invoked as
# handler(parser, tokenizer, indexer, args) by the REPL loop.
MAIN_COMMANDS_MAP = {
'analyze': analyze_command.run_analyze_command,
'index': index_command.run_index_command
}
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.parser.cacm_parser import CACMParser
from gogole.utils import heap_law
from gogole.config import COLLECTIONS
# Sub-commands accepted by the `analyze` command; 'all' expands to every
# analysis type.
ANALYZE_COMMANDS = ['all', 'count_tokens', 'heap_law']
# COMMANDS is the post-rename name for the same list (this commit renames it).
# Alias instead of duplicating the literal so the two names can never drift.
COMMANDS = ANALYZE_COMMANDS
def load_documents(parser_cls, limit=None):
    """Instantiate *parser_cls* and return its parsed documents.

    :param parser_cls: zero-argument collection-parser class exposing
                       ``parse_all(limit=...)``
    :param limit: maximum number of documents to parse, or None for all
    """
    return parser_cls().parse_all(limit=limit)
# NOTE(review): diff residue — this span interleaves the REMOVED
# run_analyze(args) with the NEW run_analyze_command(parser, tokenizer,
# index, args). Indentation was stripped by the diff viewer and the new
# function is truncated (the trailing "......" is the viewer's collapse
# marker), so the lines below are not valid Python as shown and should not
# be edited without consulting the real file.
def run_analyze(args):
parser_cls = COLLECTIONS[args.collection]
commands = args.sub_command
def run_analyze_command(parser, tokenizer, index, args):
commands = args.analyze_command
# 'all' expands to the full command list (old name ANALYZE_COMMANDS,
# new name COMMANDS — both appear because this is a diff).
if 'all' in commands:
commands = ANALYZE_COMMANDS
documents = load_documents(parser_cls)
commands = COMMANDS
tokenizer = SimpleTokenizer(args.stop_words_file)
documents = parser.parse_all(limit=None)
# Tokenize every parsed document: doc_id -> list of tokens.
tokens_by_document = {doc_id: tokenizer.get_tokens(doc) for doc_id, doc in documents.items() }
......
from gogole.indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
#
# Build the reversed index (i.e. the inverted index)
#
def run_build(args):
    """Build the reversed (inverted) index for the CACM collection.

    Parses up to 1000 documents, tokenizes each one honouring
    ``args.stop_words_file``, and feeds the tokens to a BSBI indexer.
    """
    parser = CACMParser()
    tokenizer = SimpleTokenizer(args.stop_words_file)
    indexer = BSBIIndexer(maxsize=128)
    for document in parser.find_documents(limit=1000):
        indexer.add_document_tokens(document.document_id,
                                    tokenizer.get_tokens(document))
    indexer.build_index()
    print('Reversed index has been built')
def run_index_command(parser, tokenizer, indexer, args):
    """Handle the `index` command group.

    * ``build``  — stream documents from *parser* through *tokenizer* into
      *indexer*, then build the index.
    * ``lookup`` — print the doc ids matching ``args.token[0]``, if any.
    """
    command = args.index_command
    if command == 'build':
        print('loading the documents...')
        loaded = 0
        # NOTE(review): limit=10 caps the build to the first 10 documents —
        # presumably a development shortcut; confirm before relying on it.
        for document in parser.find_documents(limit=10):
            loaded += 1
            indexer.add_document_tokens(document.document_id,
                                        tokenizer.get_tokens(document))
        print('{} documents loaded'.format(loaded))
        indexer.build_index()
        print('index built')
    elif command == 'lookup':
        matching_ids = indexer.token_lookup(args.token[0])
        if matching_ids:
            print(matching_ids)
        else:
            print('no result :(')
import argparse
from gogole.commands import analyze_command, build_command
from gogole import commands
from gogole.config import COLLECTIONS
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
from gogole.indexer import BSBIIndexer
# NOTE(review): diff residue — the lines below are the REMOVED pre-commit
# version of main(). Indentation was stripped by the diff viewer, and the
# final add_parser( call is cut off mid-expression (its arguments appear
# further down, interleaved with the new code).
def main():
argsparser = argparse.ArgumentParser(prog="gogole")
# top-level parser
# collection information
argsparser.add_argument(
"-c", "--collection",
help="collection to use")
argsparser.add_argument("--stop-words-file", help="stop words list filename")
subparsers = argsparser.add_subparsers()
argsparser_analyze = subparsers.add_parser(
def build_cli_analyze_parser(root_parser):
    """Register the `analyze` sub-command on *root_parser*.

    :param root_parser: the subparsers action returned by
                        ``ArgumentParser.add_subparsers()``

    NOTE(review): the source span interleaved the removed version of this
    registration (an ``argsparser_analyze.add_argument("sub_command", ...)``
    call with duplicate choices/metavar/help lines) with the new one; only
    the new, coherent version is kept here.
    """
    args_parser_analyze = root_parser.add_parser(
        'analyze',
        description="Run an analyze like tokens count or find heap's law parameters",
        help="additional help for analyze",
    )
    # Zero or more analysis names; validated against the module's command list.
    args_parser_analyze.add_argument("analyze_command",
        nargs='*',
        choices=commands.analyze_command.COMMANDS,
        metavar="command",
        help="can be any of {}".format(", ".join(commands.analyze_command.COMMANDS))
    )
# NOTE(review): diff residue — remainder of the REMOVED main(): it registered
# a bare `build` sub-command and wired argparse `func` defaults to the old
# run_analyze / run_build handlers. Indentation was stripped by the diff
# viewer; these lines are not valid at top level as shown.
argsparser_build = subparsers.add_parser(
'build'
)
argsparser_analyze.set_defaults(func=analyze_command.run_analyze)
argsparser_build.set_defaults(func=build_command.run_build)
def build_cli_index_parser(root_parser):
    """Attach the `index` command group (alias `i`) to *root_parser*.

    Registers two sub-commands under dest ``index_command``:
    ``lookup <token>`` and ``build``.
    """
    index_parser = root_parser.add_parser(
        'index', description="run commands on the index", aliases=['i'])
    index_subparser = index_parser.add_subparsers(dest="index_command")
    lookup_parser = index_subparser.add_parser(
        'lookup', help="find the documents where a token is")
    # nargs=1 keeps args.token a one-element list.
    lookup_parser.add_argument('token', nargs=1)
    index_subparser.add_parser('build', help="build the index")
def build_cli_parser():
    """Build the argparse parser applied to each line typed into the REPL.

    ``prog=""`` keeps usage messages from showing a bogus program name and
    ``add_help=False`` disables the automatic -h/--help option.

    :returns: an ``ArgumentParser`` with the analyze and index command groups
              registered under dest ``main_command``.

    NOTE(review): two leftover lines of the removed main()
    (``args = argsparser.parse_args()`` / ``args.func(args)``) were spliced
    into this span by the diff viewer; inside this function they would raise
    NameError on ``argsparser``, so they are dropped here.
    """
    cli_parser = argparse.ArgumentParser(prog="", add_help=False)
    cli_subparser = cli_parser.add_subparsers(dest="main_command")
    build_cli_analyze_parser(cli_subparser)
    build_cli_index_parser(cli_subparser)
    return cli_parser
def main():
    """Stub entry point; the interactive CLI is not wired up here yet."""
    print('not supported yet')


if __name__ == "__main__":
    main()
# NOTE(review): diff residue — the lines below are the body of the NEW
# interactive entry point added by this commit; its enclosing `def` line is
# not visible here and indentation was stripped, so they read as top-level
# statements. Do not execute as shown.
# top-level parser
# mainly collection information and stop-words
main_parser = argparse.ArgumentParser(prog="gogole")
main_parser.add_argument(
"-c", "--collection",
help="collection to use")
main_args = main_parser.parse_args()
# COLLECTIONS (gogole.config) maps the collection name to its parser class.
parser_cls = COLLECTIONS[main_args.collection]
print("Welcome to Gogole !")
cli_parser = build_cli_parser()
parser = parser_cls()
tokenizer = SimpleTokenizer()
indexer = BSBIIndexer()
# Read-eval-print loop: each input line is parsed with the CLI parser and
# dispatched through MAIN_COMMANDS_MAP. argparse calls sys.exit() on bad
# input, so SystemExit is swallowed to keep the REPL alive.
while True:
raw_input = input("google > ")
try:
args = cli_parser.parse_args(raw_input.split(' '))
commands.MAIN_COMMANDS_MAP[args.main_command](parser, tokenizer, indexer, args)
except SystemExit:
pass
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment