Commit 4af00efd authored by Dos Santos David

add vectorial model

parent 319fa131
from collections import defaultdict
import math
from gogole.query import Query
class VectorialQuery(Query):
    """Rank documents against a free-text query with a tf-idf vector model.

    Documents are weighted with ``(1 + log10(tf)) * log10(N / df)`` and the
    query with ``1 + log10(tf)``; the score of a document is the cosine of
    the angle between the two weight vectors.
    """

    def find_n_first_elements(self, similarities, n=10):
        """Return the ``n`` keys of ``similarities`` with the highest scores.

        Args:
            similarities: mapping of document id to similarity score.
            n: maximum number of document ids to return.

        Returns:
            A list of document ids, best score first. Ties keep the
            iteration order of ``similarities`` (the sort is stable).
        """
        ranked = sorted(similarities, key=similarities.get, reverse=True)
        return ranked[:n]

    def _compute_similarities(self, query):
        """Return a dict mapping each matching doc id to its cosine score."""
        # split() without argument drops the empty tokens that split(' ')
        # produces on repeated, leading or trailing spaces.
        tf_query = defaultdict(int)
        for token in query.split():
            tf_query[token] += 1

        indexer = self.collection.indexer
        # NOTE(review): read as a plain attribute/property, exactly as the
        # original did; if ``count_documents`` is a method it must be called.
        N = indexer.count_documents

        # Look each distinct token up once: iterating over the raw token
        # list would double df and tf for a token repeated in the query.
        df = {}
        tf = defaultdict(dict)
        for token in tf_query:
            postings = indexer.token_lookup_with_frequency(token)
            if not postings:
                continue
            df[token] = len(postings)
            for doc_id, freq in postings.items():
                tf[doc_id][token] = freq

        # Query weights and norm do not depend on the document, so they are
        # computed once. Only tokens found in the index contribute.
        query_weights = {
            token: 1 + math.log10(tf_query[token]) for token in df
        }
        norm_query = math.sqrt(sum(w ** 2 for w in query_weights.values()))

        similarities = {}
        for doc_id, tokens_frequency in tf.items():
            dot_product = 0
            squared_norm_doc = 0
            for token, freq in tokens_frequency.items():
                if freq == 0:
                    raise ValueError("frequency of {} is 0".format(token))
                doc_weight = (1 + math.log10(freq)) * math.log10(N / df[token])
                squared_norm_doc += doc_weight ** 2
                dot_product += doc_weight * query_weights[token]

            # Cosine similarity divides by the PRODUCT of the norms. The
            # document norm is 0 when every matched token has idf 0 (it
            # appears in all N documents); score such documents as 0.
            denominator = norm_query * math.sqrt(squared_norm_doc)
            similarities[doc_id] = dot_product / denominator if denominator else 0.0
        return similarities

    def search(self, query):
        """Print the ten best-matching document ids for ``query``.

        Args:
            query: whitespace-separated search terms.

        Raises:
            ValueError: if the index reports a zero frequency for a token
                inside a document that is listed as containing it.
        """
        similarities = self._compute_similarities(query)
        best = self.find_n_first_elements(similarities, n=10)
        for position, doc_id in enumerate(best, start=1):
            print('{}: doc id {}'.format(position, doc_id))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment