Commit 1bbc82e8 authored by Prot Alexandre
parents 6843891c 38991e0b
## Web information retrieval
### Run our program
### Running the program
If you have docker, you can run
To run the program in interactive mode:
```shell
$ # run the program for the stanford or cacm collection
$ bin/gogole --collection <stanford|cacm>
```
docker-compose up
#### Collection statistics
You can then:
* get statistics about the collections: number of tokens, vocabulary size, Heaps' and Zipf's laws (see the sketch after the code block below):
```shell
gogole > analyze all
```
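For reference, Heaps' law models the vocabulary size `M` as a function of the token count `T` via `M = k * T^b`. Below is a minimal, generic sketch of estimating `k` and `b` from two measurement points; this is not the project's `gogole.utils.heap_law` code, and the numbers are purely illustrative:
```python
import math

def fit_heaps_law(t1, m1, t2, m2):
    """Estimate Heaps' law parameters (k, b) from two (tokens, vocabulary) points."""
    b = math.log(m2 / m1) / math.log(t2 / t1)
    k = m1 / (t1 ** b)
    return k, b

# purely illustrative numbers
k, b = fit_heaps_law(t1=100_000, m1=9_000, t2=200_000, m2=13_000)
print("predicted vocabulary at 1M tokens:", round(k * 1_000_000 ** b))
```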
But this is broken for now, don't use it ;)
#### Index operations
On the index, you can:
* perform operations on the index. In general, the first operation you need is building the index:
```shell
gogole > index build
```
You can force the index to be rebuilt without using the cache:
```shell
gogole > index build --no-cache
```
Example directly with Python:
You can also look up the document ids in which a word appears:
```shell
gogole > index lookup <token>
```
python3 main.py --file-name data/cacm.all --stop-words-file data/common_words analyze all
#### Search
##### Boolean search
The query must be in conjunctive normal form. For example, `a OR b OR c AND d OR e` will be treated as `(a OR b OR c) AND (d OR e)`; a small parsing sketch follows the code block below.
To run a boolean query (note: the index must first have been built with `index build`):
```shell
gogole > search -b <query>
gogole > search --boolean <query>
```
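As announced above, here is a minimal sketch of how such a CNF query can be decomposed, mirroring the `BooleanQuery` code further down in this diff (split on ` and `, then on ` or `, and strip a leading `not `); the helper name `parse_cnf` is hypothetical:
```python
def parse_cnf(query):
    """Split a CNF query into OR-clauses of (negated, term) pairs."""
    clauses = []
    for and_part in query.lower().split(' and '):
        clause = []
        for term in and_part.split(' or '):
            term = term.strip()
            negated = term.startswith('not ')
            if negated:
                term = term[len('not '):]
            clause.append((negated, term))
        clauses.append(clause)
    return clauses

# "a OR b OR c AND d OR e" -> [[(False, 'a'), (False, 'b'), (False, 'c')],
#                              [(False, 'd'), (False, 'e')]]
print(parse_cnf("a OR b OR c AND d OR e"))
```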
##### Vectorial search
```shell
gogole > search -v <query>
gogole > search --vectorial <query>
```
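For reference, the weighting used by `VectorialQuery` later in this diff is a log-scaled tf-idf: a term weighs `(1 + log10(tf)) * log10(N / df)` on the document side and `1 + log10(tf)` on the query side. A standalone sketch of the document-side weight (same formula, illustrative values only):
```python
import math

def doc_term_weight(tf, df, n_docs):
    """Log-scaled tf-idf weight of a term in a document, as in VectorialQuery."""
    if tf == 0 or df == 0:
        return 0.0
    return (1 + math.log10(tf)) * math.log10(n_docs / df)

# illustrative values only
print(doc_term_weight(tf=3, df=10, n_docs=3000))
```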
from gogole.collection.collection import Collection
from gogole.collection.cacm_collection import CACMCollection
from gogole.collection.stanford_collection import StanfordCollection
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.cacm_parser import CACMParser
from gogole.tokenizer.simple_tokenizer import SimpleTokenizer
class CACMCollection(Collection):
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer('cacm', maxsize=None)
self._parser = CACMParser()
self._tokenizer = SimpleTokenizer()
class Collection:
"""
Describe a standard collection
"""
def __init__(self):
self._parser = None
self._indexer = None
self._tokenizer = None
@property
def parser(self):
return self._parser
@property
def indexer(self):
return self._indexer
@property
def tokenizer(self):
return self._tokenizer
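A collection therefore bundles a parser, a tokenizer and an indexer, and the commands in this commit only go through these three properties. A rough usage sketch based on the `index build` flow (the `limit=10` value is only for illustration):
```python
from gogole.collection.cacm_collection import CACMCollection

collection = CACMCollection()
for document in collection.parser.find_documents(limit=10):
    counted_tokens = collection.tokenizer.get_counted_tokens(document)
    collection.indexer.add_document_tokens(document, counted_tokens)
collection.indexer.build_index()
```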
from gogole.collection import Collection
from gogole.indexer.bsbi_indexer import BSBIIndexer
from gogole.parser.stanford_parser import StanfordParser
from gogole.tokenizer.no_tokenizer import NoTokenizer
class StanfordCollection(Collection):
def __init__(self):
# BSBI indexer with single block
self._indexer = BSBIIndexer('stanford', maxsize=16*1024*1024)
self._parser = StanfordParser()
self._tokenizer = NoTokenizer()
from gogole.commands import analyze_command
from gogole.commands import index_command
from gogole.commands import analyze_command, index_command, search_command
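# every CLI command now dispatches to a run(collection, args) handler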
MAIN_COMMANDS_MAP = {
'analyze': analyze_command.run_analyze_command,
'index': index_command.run_index_command
'analyze': analyze_command.run,
'index': index_command.run,
'search': search_command.run
}
@@ -7,7 +7,11 @@ from gogole.utils import heap_law
COMMANDS = ['all', 'count_tokens', 'heap_law']
def run_analyze_command(parser, tokenizer, index, args):
def run(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
commands = args.analyze_command
if 'all' in commands:
......
def run_index_command(parser, tokenizer, indexer, args):
def run(collection, args):
parser = collection.parser
tokenizer = collection.tokenizer
indexer = collection.indexer
if args.index_command == 'build':
print('loading the documents...')
build_index = args.build_no_cache
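# rebuild unconditionally when --no-cache was passed; otherwise try the on-disk cache first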
if not args.build_no_cache:
# try to load index from the disk
if indexer.load_from_cache():
print('cache loaded from disk')
else:
build_index = True
if build_index:
print('... loading the documents...')
count_documents = 0
for document in parser.find_documents(limit=10):
for document in parser.find_documents(limit=None):
count_documents += 1
tokens = tokenizer.get_tokens(document)
counted_tokens = tokenizer.get_counted_tokens(document)
indexer.add_document_tokens(document.document_id, tokens)
indexer.add_document_tokens(document, counted_tokens)
print('{} documents loaded'.format(count_documents))
print('... {} documents loaded'.format(count_documents))
indexer.build_index()
print('index built')
_,t = indexer.build_index()
print('... index created in {elapsed_time:.2f} ms'.format(elapsed_time=t))
if args.index_command == 'lookup':
doc_ids = indexer.token_lookup(args.token[0])
token = args.token[0]
doc_ids = indexer.token_lookup_with_frequency(token)
if doc_ids:
print(doc_ids)
print("{token} is present in {nb_docs} documents\n".format(
token=token,
nb_docs=len(doc_ids)
))
for doc_id, frequency in doc_ids.items():
print("doc : {doc_id} (frequency: {frequency})".format(doc_id=doc_id, frequency=frequency))
else:
print('no result :(')
if args.index_command == 'stats':
if collection.indexer.status == collection.indexer.INDEX_STATUS_NOT_CREATED:
print('index not created')
return
print('index created\n')
print('Size of the index :\t\t\t{size:,} bytes'.format(size=collection.indexer.get_index_size()))
print('Size of the map token->token_id :\t{size:,} bytes'.format(size=collection.indexer.get_tokens_map_size()))
from gogole import query
def run(collection, args):
q = " ".join(args.query)
query_cls = query.QUERY_MAP[args.search_query_type]
query_browser = query_cls(collection)
print("searching {query} using {model} model".format(
query=q,
model=args.search_query_type
))
query_browser.search(q)
from gogole.parser import CACMParser, StanfordParser
from gogole import collection
COLLECTIONS = {"cacm": CACMParser, "stanford": StanfordParser}
COLLECTIONS = {
"cacm": collection.CACMCollection,
"stanford": collection.StanfordCollection
}
@@ -3,25 +3,43 @@ import itertools
import heapq
import os
import math
import pickle
class BSBIIndexer:
BLOCK_SIZE = 4
from gogole.utils import timeit
from gogole.indexer import Indexer
def __init__(self, maxsize=None):
class BSBIIndexer(Indexer):
BLOCK_SIZE = 12
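# one index entry = three packed 4-byte ints (token_id, doc_id, frequency) = 12 bytes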
def __init__(self, collection_name, maxsize=None):
"""
:param maxsize: max size of the buffer (in bytes)
"""
self.tokens_map = dict()
self.token_id_seq = itertools.count() # next token id
self.init_token_id_seq() # next token id
self.maxsize = maxsize
self.buffer = []
self.tmp_filename_format='.cache/tmp_index_{}'
self.tmp_filename_format='.cache/{}_tmp_index_{{}}'.format(collection_name)
self.tmp_filenames = []
self.tmp_file_id_seq = itertools.count()
self.document_norms = dict()
self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)
self.status = self.INDEX_STATUS_NOT_CREATED
def get_collection_size(self):
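# one norm is stored per indexed document, so this also counts the documents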
return len(self.document_norms)
def init_token_id_seq(self, start=0):
self.token_id_seq = itertools.count(start=start)
def find_or_create_token_id(self, token):
if token not in self.tokens_map:
token_id = next(self.token_id_seq)
@@ -33,6 +51,10 @@ class BSBIIndexer:
return self.tokens_map[token]
def cleanup(self):
for filename in self.tmp_filenames:
os.remove(filename)
def flush_buffer(self):
sorted_tuples = sorted(self.buffer)
@@ -41,30 +63,42 @@ class BSBIIndexer:
with open(filename, 'wb') as f:
self.tmp_filenames.append(f.name)
for token_id, doc_id in sorted_tuples:
for token_id, doc_id, frequency in sorted_tuples:
# assume we already are at the end of the file
b = bytearray()
b += struct.pack('H', token_id) # H stands for unsigned short integer (2 bytes, up to 65535)
b += struct.pack('H', doc_id)
b += struct.pack('i', token_id)
b += struct.pack('i', doc_id)
b += struct.pack('i', frequency)
f.write(b)
# reset the buffer
self.buffer = []
def add_document_tokens(self, doc_id, tokens):
def compute_document_vector_norm(self, counted_tokens):
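# Euclidean norm of the log-weighted term-frequency vector: sqrt(sum((1 + log10(tf))^2))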
norm = 0
for token, count in counted_tokens.items():
norm += (1 + math.log10(count))**2
return math.sqrt(norm)
def add_document_tokens(self, document, counted_tokens):
doc_id = document.document_id
# convert tokens to token ids
token_ids = set()
for token in tokens:
self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
for token, frequency in counted_tokens.items():
token_id = self.find_or_create_token_id(token)
self.buffer += [(token_id, doc_id)]
self.buffer += [(token_id, doc_id, frequency)]
if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
self.flush_buffer()
def _read_in_chunks(self, f, blocksize=4):
def _read_in_chunks(self, f, blocksize=12):  # chunk size must match BLOCK_SIZE (3 * 4-byte ints)
while True:
data = f.read(blocksize)
if not data:
@@ -72,7 +106,7 @@ class BSBIIndexer:
yield data
@timeit
def build_index(self):
# 1/ flush the buffer
@@ -82,7 +116,7 @@ class BSBIIndexer:
try:
merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
with open('.cache/index', 'wb') as f:
with open(self.INDEX_FILE, 'wb') as f:
for t in merged_tuples_iterator:
# TODO: maybe write by block ?
f.write(t)
@@ -90,25 +124,30 @@ class BSBIIndexer:
for fp in tmp_files:
fp.close()
self.save_to_disk()
self.cleanup() # cleanup temporary files
self.status = self.INDEX_STATUS_CREATED
def _read_token_id(self, file, pos):
file.seek(pos*self.BLOCK_SIZE, 0)
token_id = struct.unpack('H', file.read(2))[0]
token_id = struct.unpack('i', file.read(4))[0]
return token_id
def token_lookup(self, token):
def token_lookup_with_frequency(self, token):
"""
Return a dict mapping each document id containing the token
to the token's frequency in that document
:param token: token to search in documents
"""
document_ids = dict()
if token not in self.tokens_map:
return []
return document_ids
token_id = self.tokens_map[token]
with open('.cache/index', 'rb') as f:
with open(self.INDEX_FILE, 'rb') as f:
upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
lower_bound = 0
@@ -119,7 +158,6 @@ class BSBIIndexer:
to_visit = {mid}
visited = set()
t_id = None
document_ids = set()
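# once the binary-search probe hits the token, expand to neighbouring positions to collect every (doc_id, frequency) posting with the same token_id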
while to_visit:
pos = to_visit.pop()
@@ -127,8 +165,11 @@ class BSBIIndexer:
t_id = self._read_token_id(f, pos)
if t_id == token_id:
doc_id = struct.unpack('H', f.read(2))[0]
document_ids.add(doc_id)
doc_id = struct.unpack('i', f.read(4))[0]
frequency = struct.unpack('i', f.read(4))[0]
document_ids[doc_id] = frequency
for p in [pos+1, pos-1]:
if p not in visited and lower_bound <= p and upper_bound >= p:
to_visit.add(p)
@@ -141,3 +182,36 @@ class BSBIIndexer:
lower_bound = mid+1
else:
upper_bound = mid-1
def token_lookup(self, token):
return set(self.token_lookup_with_frequency(token).keys())
def save_to_disk(self):
with open(self.TOKENS_MAP_FILE, 'wb') as f:
pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
def load_from_cache(self):
try:
with open(self.TOKENS_MAP_FILE, 'rb') as f:
self.tokens_map = pickle.load(f)
with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
self.document_norms = pickle.load(f)
# resume the token id sequence past the highest token id already assigned
self.init_token_id_seq(max(self.tokens_map.values(), default=-1) + 1)
self.status = self.INDEX_STATUS_CREATED
return True
except FileNotFoundError:
return False
def get_index_size(self):
return os.stat(self.INDEX_FILE).st_size
def get_tokens_map_size(self):
return os.stat(self.TOKENS_MAP_FILE).st_size
class Indexer:
INDEX_STATUS_NOT_CREATED = 0
INDEX_STATUS_CREATED = 1
def add_documents_token(self, doc_id, tokens):
def add_documents_token(self, document, tokens):
raise Exception('add_documents_token not implemented')
def build_index(self):
@@ -9,3 +11,12 @@ class Indexer:
def token_lookup(self, token):
raise Exception('token_lookup not implemented')
def load_from_cache(self):
raise Exception("load_from_cache not implemented")
def get_index_size(self):
return 0
def get_tokens_map_size(self):
return 0
@@ -7,12 +7,15 @@ class SimpleIndexer(Indexer):
def __init__(self):
self.tokens_document_map = defaultdict(list)
def add_documents_token(self, doc_id, tokens):
for token in tokens:
self.tokens_document_map[token] += doc_id
def add_documents_token(self, document, counted_tokens):
for token, count in counted_tokens.items():
self.tokens_document_map[token] += [(document.document_id,count)]
def build_index(self):
pass
def token_lookup(self, token):
return self.tokens_document_map.get(token, [])
return dict(self.tokens_document_map.get(token, []))
def load_from_cache(self):
return False
from gogole.document import StanfordDocument
from os import listdir
import itertools
class StanfordParser:
@@ -7,7 +8,7 @@ class StanfordParser:
def find_documents(self, limit=None):
counter = 0 # count documents found
counter = itertools.count() # count documents found
for collection_index in range(10):
collection_dir = self.DIRECTORY + "/" + str(collection_index)
@@ -17,9 +18,10 @@ class StanfordParser:
return
with open(collection_dir + "/" + filename, 'r') as f:
current_document_id = str(collection_index) + filename
url = str(collection_index) + filename
current_document_id = next(counter)
current_document = StanfordDocument(current_document_id)
counter += 1
content = ""
for line in f:
......
from gogole.query.query import Query
from gogole.query.boolean_query import BooleanQuery
from gogole.query.vectorial_query import VectorialQuery
QUERY_TYPE_BOOLEAN = 'boolean'
QUERY_TYPE_VECTORIAL = 'vectorial'
QUERY_MAP = {
QUERY_TYPE_BOOLEAN: BooleanQuery,
QUERY_TYPE_VECTORIAL: VectorialQuery
}
from gogole.query import Query
class BooleanQuery(Query):
OPERATOR_AND = ' and '
OPERATOR_OR = ' or '
OPERATOR_NOT = 'not '
def search_documents(self, query):
# Assume the expression
# is in the conjunctive normal form
last_doc_id = self.collection.indexer.get_collection_size()-1
and_queries = query.split(self.OPERATOR_AND)
doc_ids_by_conjunction = list()
for and_query in and_queries:
doc_ids_disjonction = set()
for query_term in and_query.split(self.OPERATOR_OR):
query_term = query_term.strip()
is_not_query = query_term.startswith(self.OPERATOR_NOT)
if is_not_query:
query_term = query_term[len(self.OPERATOR_NOT):]
doc_ids = self.collection.indexer.token_lookup(query_term)
if is_not_query:
doc_ids_disjonction.update(set(range(last_doc_id + 1)) - doc_ids)
else:
doc_ids_disjonction.update(doc_ids)
doc_ids_by_conjunction.append(doc_ids_disjonction)
return set.intersection(*doc_ids_by_conjunction)
def search(self, query):
"""
Parse a boolean query
and return a list of documents relevant for this query
"""
doc_ids = self.search_documents(query.lower())
print("Document ids : {}".format(", ".join(str(x) for x in doc_ids)))
class Query:
def __init__(self, collection):
self.collection = collection
def search(self, query):
raise Exception('search not implemented')
from collections import defaultdict
import math
from gogole.query import Query
from gogole.utils import timeit
class VectorialQuery(Query):
def find_n_first_elements(self, similarities, n=10):
sorted_docs = [x[0] for x in sorted(similarities.items(), key=lambda x: x[1], reverse=True)]
return sorted_docs[:n]
def search(self,query):
results,t = self.timed_search(query)
print("Found {count_results} results in {elapsed_time:.2f} ms".format(
elapsed_time=t,
count_results=len(results)
))
for position, doc_id in enumerate(self.find_n_first_elements(results, n=10), start=1):
print('{}: doc id {}\n'.format(position, doc_id))
@timeit
def timed_search(self, query):
tokens = query.split(' ')
tf_query = defaultdict(int)
for token in tokens:
tf_query[token] += 1
df = defaultdict(int)
tf = defaultdict(lambda: defaultdict(int))
N = self.collection.indexer.get_collection_size()
for token in tokens:
doc_ids = self.collection.indexer.token_lookup_with_frequency(token)
if len(doc_ids) > 0:
df[token] += len(doc_ids)
for doc_id,freq in doc_ids.items():
tf[doc_id][token] += freq
similarities = defaultdict(int)
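# for each candidate document, accumulate the dot product between the log-weighted query vector and the tf-idf document vector, then normalise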
for doc_id, tokens_frequency in tf.items():
dot_product = 0
squared_norm_query = 0
for token, token_df in df.items():
doc_weight = 0
if token in tokens_frequency:
if tokens_frequency[token] == 0:
raise Exception("frequency of {} is 0".format(token))
doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
query_weight = (1 + math.log10(tf_query[token]))
squared_norm_query += (query_weight**2)
dot_product += doc_weight * query_weight
norm_doc = self.collection.indexer.document_norms[doc_id]
similarities[doc_id] = dot_product / (norm_doc + math.sqrt(squared_norm_query))
return similarities
class AbstractTokenizer:
"""
Abstract Tokenizer
"""
@staticmethod
def get_tokens(document: 'Document'):
raise Error("Method get_tokens not implemented")
import collections
from gogole.tokenizer.tokenizer import Tokenizer
class NoTokenizer(Tokenizer):
def get_tokens(self, document):
return document.get_raw_content().strip().split()
from itertools import chain
from gogole.tokenizer.abstract_tokenizer import AbstractTokenizer
from gogole.tokenizer.tokenizer import Tokenizer
class SimpleTokenizer(AbstractTokenizer):
class SimpleTokenizer(Tokenizer):
"""
Simple tokenizer using any space or punctuation sign as separator
"""
......
import collections
class Tokenizer:
"""
Basic Tokenizer
"""
def get_tokens(self, document: 'gogole.document.Document'):
raise Exception("method get_all_tokens not implemented")
def get_counted_tokens(self, document: 'Document'):
return collections.Counter(self.get_tokens(document))
from gogole.utils.timeit import timeit