Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

Commit 8bcfab93 authored by Dos Santos David

improve tokenization of the query

parent c86431fa
from collections import defaultdict
import math
import re
from gogole.query import Query
from gogole.utils import timeit
class VectorialQuery(Query):
def _split_query(self, query):
return re.sub("[^\w]", " ", query).split()
def find_n_first_elements(self, similarities, n=10):
    """Return the keys of the ``n`` highest-scoring entries.

    Args:
        similarities: Mapping from a key to a comparable score.
        n: Upper bound of the slice taken from the ranking (default 10).

    Returns:
        The keys ordered by decreasing score. Entries with equal scores
        keep the relative order in which the mapping yields them.
    """
    ranking = sorted(
        similarities.items(),
        key=lambda entry: entry[1],
        reverse=True,
    )
    return [entry[0] for entry in ranking[:n]]
......@@ -30,7 +34,7 @@ class VectorialQuery(Query):
def timed_search(self, query):
tokens = query.split(' ')
tokens = self._split_query(query)
tf_query = defaultdict(int)
for token in tokens:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment