Commit 1bbc82e8 authored by Prot Alexandre
parents 6843891c 38991e0b
## Web information retrieval

### Running the program

To run the program in interactive mode:
```shell
$ # run the program for the stanford or cacm collection
$ bin/gogole --collection <stanford|cacm>
```
#### Collection statistics

You can then:

* get statistics about the collections: number of tokens, vocabulary size, Heaps' and Zipf's laws (a brief reminder of Heaps' law follows the command below):
```shell
gogole > analyze all
```
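As a reminder, Heaps' law models vocabulary growth as `M = k * T^b` (`M` vocabulary size, `T` number of tokens), while Zipf's law relates a term's frequency to its rank. A minimal sketch, independent of the project's own `heap_law` helper, of estimating `k` and `b` from two measurements (the numbers are illustrative):

```python
import math

# two (tokens, vocabulary) measurements, e.g. read off the `analyze all` output (illustrative values)
t1, m1 = 100000, 8000
t2, m2 = 200000, 11500

b = (math.log(m2) - math.log(m1)) / (math.log(t2) - math.log(t1))
k = m1 / t1 ** b
print(b, k)  # parameters of M = k * T^b
```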
#### Index operations

On the index, you can:

* build the index, which is usually the first operation you need:
```shell
gogole > index build
```

You can force the index to be rebuilt without using the cache:
```shell
gogole > index build --no-cache
```

You can also look up the document ids in which a word appears:
```shell
gogole > index lookup <token>
```
#### Search

##### Boolean search

The query must be in conjunctive normal form. For example, `a OR b OR c AND d OR e` is treated as `(a OR b OR c) AND (d OR e)`; a short sketch of this decomposition is shown after the command below.

To run a boolean query (note: the index must have been built beforehand with `index build`):
```shell
gogole > search -b <query>
gogole > search --boolean <query>
```
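A sketch of that decomposition in Python (operators are lowercased by the search code; this mirrors the `BooleanQuery` class added later in this commit):

```python
# split a CNF query into its clauses: AND-separated groups of OR-separated terms
query = 'a or b or c and d or e'
clauses = [clause.split(' or ') for clause in query.split(' and ')]
print(clauses)  # [['a', 'b', 'c'], ['d', 'e']]
```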
##### Vector search
```shell
gogole > search -v <query>
gogole > search --vectorial <query>
```
from gogole.collection.collection import Collection
from gogole.collection.cacm_collection import CACMCollection
from gogole.collection.stanford_collection import StanfordCollection

from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer

class CACMCollection(Collection):
    def __init__(self):
        # BSBI indexer with single block
        self._indexer = BSBIIndexer('cacm', maxsize=None)
        self._parser = CACMParser()
        self._tokenizer = SimpleTokenizer()

class Collection:
    """
    Describe a standard collection
    """
    def __init__(self):
        self._parser = None
        self._indexer = None
        self._tokenizer = None

    @property
    def parser(self):
        return self._parser

    @property
    def indexer(self):
        return self._indexer

    @property
    def tokenizer(self):
        return self._tokenizer

from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.stanford_parser import StanfordParser
from gogole.tokenizer.no_tokenizer import NoTokenizer

class StanfordCollection(Collection):
    def __init__(self):
        # BSBI indexer flushed in 16 MB blocks
        self._indexer = BSBIIndexer('stanford', maxsize=16*1024*1024)
        self._parser = StanfordParser()
        self._tokenizer = NoTokenizer()

from gogole.commands import analyze_command, index_command, search_command

MAIN_COMMANDS_MAP = {
    'analyze': analyze_command.run,
    'index': index_command.run,
    'search': search_command.run
}
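`MAIN_COMMANDS_MAP` maps the first word typed at the `gogole >` prompt to a handler that now uniformly receives `(collection, args)`. A minimal sketch of how such a map can be dispatched (the real prompt loop is not part of this diff):

```python
# hypothetical dispatch helper; the actual REPL wiring is not shown in this commit
def dispatch(command_name, collection, args):
    handler = MAIN_COMMANDS_MAP.get(command_name)
    if handler is None:
        print('unknown command: {}'.format(command_name))
        return
    handler(collection, args)
```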

@@ -7,7 +7,11 @@ from gogole.utils import heap_law

COMMANDS = ['all', 'count_tokens', 'heap_law']

def run(collection, args):
    parser = collection.parser
    tokenizer = collection.tokenizer

    commands = args.analyze_command

    if 'all' in commands:
        ...

def run(collection, args):
    parser = collection.parser
    tokenizer = collection.tokenizer
    indexer = collection.indexer

    if args.index_command == 'build':
        build_index = args.build_no_cache

        if not args.build_no_cache:
            # try to load index from the disk
            if indexer.load_from_cache():
                print('cache loaded from disk')
            else:
                build_index = True

        if build_index:
            print('... loading the documents...')
            count_documents = 0

            for document in parser.find_documents(limit=None):
                count_documents += 1
                counted_tokens = tokenizer.get_counted_tokens(document)
                indexer.add_document_tokens(document, counted_tokens)

            print('... {} documents loaded'.format(count_documents))

            _, t = indexer.build_index()
            print('... index created in {elapsed_time:.2f} ms'.format(elapsed_time=t))

    if args.index_command == 'lookup':
        token = args.token[0]
        doc_ids = indexer.token_lookup_with_frequency(token)

        if doc_ids:
            print("{token} is present in {nb_docs} documents\n".format(
                token=token,
                nb_docs=len(doc_ids)
            ))

            for doc_id, frequency in doc_ids.items():
                print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))
        else:
            print('no result :(')

    if args.index_command == 'stats':
        if collection.indexer.status == collection.indexer.INDEX_STATUS_NOT_CREATED:
            print('index not created')
            return

        print('index created\n')
        print('Size of the index :\t\t\t{size:,} bytes'.format(size=collection.indexer.get_index_size()))
        print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_tokens_map_size()))

from gogole import query

def run(collection, args):
    q = " ".join(args.query)

    query_cls = query.QUERY_MAP[args.search_query_type]
    query_browser = query_cls(collection)

    print("searching {query} using {model} model".format(
        query=q,
        model=args.search_query_type
    ))

    query_browser.search(q)
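The handler resolves the query class from `query.QUERY_MAP` and delegates to it. A hypothetical call (the `Namespace` fields mirror how the handler reads its arguments; the actual argparse setup is not shown in this diff):

```python
from argparse import Namespace

# 'collection' would be one of the Collection objects above, e.g. CACMCollection()
run(collection, Namespace(query=['web', 'search'], search_query_type='vectorial'))
# prints: searching web search using vectorial model
```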

from gogole import collection

COLLECTIONS = {
    "cacm": collection.CACMCollection,
    "stanford": collection.StanfordCollection
}
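The `--collection` flag from the README presumably picks an entry from this map and instantiates it; a sketch under that assumption:

```python
# e.g. for `bin/gogole --collection cacm` (assumed CLI wiring, not shown in this diff)
collection = COLLECTIONS['cacm']()
print(collection.parser, collection.indexer, collection.tokenizer)
```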

@@ -3,25 +3,43 @@ import itertools

import heapq
import os
import math
import pickle

from gogole.utils import timeit
from gogole.indexer import Indexer

class BSBIIndexer(Indexer):
    BLOCK_SIZE = 12

    def __init__(self, collection_name, maxsize=None):
        """
        :param collection_name: name used to namespace the cache files on disk
        :param maxsize: max size of the buffer (in bytes)
        """
        self.tokens_map = dict()
        self.init_token_id_seq() # next token id
        self.maxsize = maxsize
        self.buffer = []
        self.tmp_filename_format = '.cache/{}_tmp_index_{{}}'.format(collection_name)
        self.tmp_filenames = []
        self.tmp_file_id_seq = itertools.count()
        self.document_norms = dict()

        self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
        self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
        self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)

        self.status = self.INDEX_STATUS_NOT_CREATED

    def get_collection_size(self):
        return len(self.document_norms)

    def init_token_id_seq(self, start=0):
        self.token_id_seq = itertools.count(start=start)

    def find_or_create_token_id(self, token):
        if token not in self.tokens_map:
            token_id = next(self.token_id_seq)

@@ -33,6 +51,10 @@ class BSBIIndexer:

        return self.tokens_map[token]

    def cleanup(self):
        for filename in self.tmp_filenames:
            os.remove(filename)

    def flush_buffer(self):
        sorted_tuples = sorted(self.buffer)

@@ -41,30 +63,42 @@ class BSBIIndexer:

        with open(filename, 'wb') as f:
            self.tmp_filenames.append(f.name)

            for token_id, doc_id, frequency in sorted_tuples:
                # assume we already are at the end of the file
                b = bytearray()
                b += struct.pack('i', token_id)
                b += struct.pack('i', doc_id)
                b += struct.pack('i', frequency)

                f.write(b)

        # reset the buffer
        self.buffer = []
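Each posting is now written as three 4-byte signed integers (`token_id`, `doc_id`, `frequency`), which is why `BLOCK_SIZE` moves from 4 to 12. A standalone check of that record layout (illustrative, not part of the commit):

```python
import struct

# pack one posting exactly as flush_buffer does: three 'i' (4-byte) values
record = struct.pack('i', 42) + struct.pack('i', 7) + struct.pack('i', 3)
assert len(record) == 12  # matches BSBIIndexer.BLOCK_SIZE

# unpack it the same way the lookup code reads the index file
token_id, doc_id, frequency = struct.unpack('iii', record)
print(token_id, doc_id, frequency)  # 42 7 3
```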

    def compute_document_vector_norm(self, counted_tokens):
        norm = 0

        for token, count in counted_tokens.items():
            norm += (1 + math.log10(count))**2

        return math.sqrt(norm)
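This is the classic log-frequency weight `1 + log10(tf)` accumulated per term. A quick numerical check (not part of the commit):

```python
import math

counted_tokens = {'web': 2, 'search': 1}
norm = math.sqrt(sum((1 + math.log10(count))**2 for count in counted_tokens.values()))
print(round(norm, 3))  # 1.641
```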

    def add_document_tokens(self, document, counted_tokens):
        doc_id = document.document_id
        # convert tokens to token ids
        token_ids = set()

        self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)

        for token, frequency in counted_tokens.items():
            token_id = self.find_or_create_token_id(token)
            self.buffer += [(token_id, doc_id, frequency)]

        if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
            self.flush_buffer()

    def _read_in_chunks(self, f, blocksize=12):
        # read whole 12-byte records (BLOCK_SIZE) so heapq.merge compares complete
        # (token_id, doc_id, frequency) entries
        while True:
            data = f.read(blocksize)
            if not data:
@@ -72,7 +106,7 @@ class BSBIIndexer:
            yield data

    @timeit
    def build_index(self):
        # 1/ flush the buffer
@@ -82,7 +116,7 @@ class BSBIIndexer:
        try:
            merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))

            with open(self.INDEX_FILE, 'wb') as f:
                for t in merged_tuples_iterator:
                    # TODO: maybe write by block ?
                    f.write(t)
@@ -90,25 +124,30 @@ class BSBIIndexer:
            for fp in tmp_files:
                fp.close()

        self.save_to_disk()
        self.cleanup() # cleanup temporary files

        self.status = self.INDEX_STATUS_CREATED

    def _read_token_id(self, file, pos):
        file.seek(pos*self.BLOCK_SIZE, 0)
        token_id = struct.unpack('i', file.read(4))[0]
        return token_id

    def token_lookup_with_frequency(self, token):
        """
        Returns a dict mapping the ids of the documents
        where a given token is present to its frequency in that document

        :param token: token to search in documents
        """
        document_ids = dict()

        if token not in self.tokens_map:
            return document_ids

        token_id = self.tokens_map[token]

        with open(self.INDEX_FILE, 'rb') as f:
            upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
            lower_bound = 0
@@ -119,7 +158,6 @@ class BSBIIndexer:
                to_visit = {mid}
                visited = set()
                t_id = None

                while to_visit:
                    pos = to_visit.pop()
@@ -127,8 +165,11 @@ class BSBIIndexer:
                    t_id = self._read_token_id(f, pos)

                    if t_id == token_id:
                        doc_id = struct.unpack('i', f.read(4))[0]
                        frequency = struct.unpack('i', f.read(4))[0]

                        document_ids[doc_id] = frequency

                    for p in [pos+1, pos-1]:
                        if p not in visited and lower_bound <= p and upper_bound >= p:
                            to_visit.add(p)
@@ -141,3 +182,36 @@ class BSBIIndexer:
                    lower_bound = mid+1
                else:
                    upper_bound = mid-1

    def token_lookup(self, token):
        return set(self.token_lookup_with_frequency(token).keys())
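Lookups now return per-document term frequencies. A hypothetical session, assuming the index and cache files already exist on disk (the ids and counts are made up):

```python
indexer = BSBIIndexer('cacm', maxsize=None)
indexer.load_from_cache()  # restores tokens_map and document_norms

postings = indexer.token_lookup_with_frequency('algorithm')
print(postings)                            # e.g. {12: 3, 40: 1}
print(indexer.token_lookup('algorithm'))   # e.g. {12, 40}
```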

    def save_to_disk(self):
        with open(self.TOKENS_MAP_FILE, 'wb') as f:
            pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)

        with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
            pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)

    def load_from_cache(self):
        try:
            with open(self.TOKENS_MAP_FILE, 'rb') as f:
                self.tokens_map = pickle.load(f)

            with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
                self.document_norms = pickle.load(f)

            # resume the token id sequence after the highest id already assigned
            self.init_token_id_seq(max(self.tokens_map.values(), default=-1) + 1)

            self.status = self.INDEX_STATUS_CREATED
            return True
        except FileNotFoundError:
            return False

    def get_index_size(self):
        return os.stat(self.INDEX_FILE).st_size

    def get_tokens_map_size(self):
        return os.stat(self.TOKENS_MAP_FILE).st_size

class Indexer:
    INDEX_STATUS_NOT_CREATED = 0
    INDEX_STATUS_CREATED = 1

    def add_documents_token(self, document, tokens):
        raise Exception('add_documents_token not implemented')

    def build_index(self):
@@ -9,3 +11,12 @@ class Indexer:

    def token_lookup(self, token):
        raise Exception('token_lookup not implemented')

    def load_from_cache(self):
        # raise so a missing implementation is never mistaken for a successfully loaded cache
        raise Exception('load_from_cache not implemented')

    def get_index_size(self):
        return 0

    def get_tokens_map_size(self):
        return 0

@@ -7,12 +7,15 @@ class SimpleIndexer(Indexer):

    def __init__(self):
        self.tokens_document_map = defaultdict(list)

    def add_documents_token(self, document, counted_tokens):
        for token, count in counted_tokens.items():
            self.tokens_document_map[token] += [(document.document_id, count)]

    def build_index(self):
        pass

    def token_lookup(self, token):
        return dict(self.tokens_document_map.get(token, []))

    def load_from_cache(self):
        return False
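A quick sketch of the in-memory variant's behaviour (the `Doc` stand-in below is hypothetical; only `document_id` matters here):

```python
from collections import namedtuple

Doc = namedtuple('Doc', 'document_id')  # hypothetical stand-in for a parsed document

indexer = SimpleIndexer()
indexer.add_documents_token(Doc(1), {'web': 2, 'search': 1})
indexer.add_documents_token(Doc(2), {'web': 1})
indexer.build_index()

print(indexer.token_lookup('web'))      # {1: 2, 2: 1}
print(indexer.token_lookup('missing'))  # {}
```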

from gogole.document import StanfordDocument

from os import listdir
import itertools

class StanfordParser:
@@ -7,7 +8,7 @@ class StanfordParser:

    def find_documents(self, limit=None):
        counter = itertools.count() # count documents found

        for collection_index in range(10):
            collection_dir = self.DIRECTORY + "/" + str(collection_index)
@@ -17,9 +18,10 @@ class StanfordParser:
                    return

                with open(collection_dir + "/" + filename, 'r') as f:
                    url = str(collection_index) + filename
                    current_document_id = next(counter)

                    current_document = StanfordDocument(current_document_id)

                    content = ""
                    for line in f:
                        ...

from gogole.query.query import Query
from gogole.query.boolean_query import BooleanQuery
from gogole.query.vectorial_query import VectorialQuery

QUERY_TYPE_BOOLEAN = 'boolean'
QUERY_TYPE_VECTORIAL = 'vectorial'

QUERY_MAP = {
    QUERY_TYPE_BOOLEAN: BooleanQuery,
    QUERY_TYPE_VECTORIAL: VectorialQuery
}

from gogole.query import Query

class BooleanQuery(Query):
    OPERATOR_AND = ' and '
    OPERATOR_OR = ' or '
    OPERATOR_NOT = 'not '

    def search_documents(self, query):
        # Assume the expression
        # is in the conjunctive normal form
        last_doc_id = self.collection.indexer.get_collection_size()-1

        and_queries = query.split(self.OPERATOR_AND)

        doc_ids_by_conjunction = list()

        for and_query in and_queries:
            doc_ids_disjonction = set()

            for query_term in and_query.split(self.OPERATOR_OR):
                query_term = query_term.strip()
                is_not_query = query_term.startswith(self.OPERATOR_NOT)

                if is_not_query:
                    query_term = query_term[len(self.OPERATOR_NOT):]

                doc_ids = self.collection.indexer.token_lookup(query_term)

                if is_not_query:
                    doc_ids_disjonction.update(set(range(last_doc_id)) - doc_ids)
                else:
                    doc_ids_disjonction.update(doc_ids)

            doc_ids_by_conjunction.append(doc_ids_disjonction)

        return set.intersection(*doc_ids_by_conjunction)

    def search(self, query):
        """
        Parse a boolean query
        and return a list of documents relevant for this query
        """
        doc_ids = self.search_documents(query.lower())

        print("Document ids : {}".format(", ".join(str(x) for x in doc_ids)))

class Query:
    def __init__(self, collection):
        self.collection = collection

    def search(self, query):
        raise Exception('search not implemented')

from collections import defaultdict
import math

from gogole.query import Query
from gogole.utils import timeit