improve tokenization of the query

from collections import defaultdict
import math
import re
from gogole.query import Query
from gogole.utils import timeit
class VectorialQuery(Query):
def _split_query(self, query):
    """Split a raw query string into word tokens.

    Any character that is not a regex word character (letter, digit or
    underscore) acts as a separator, so punctuation and whitespace never
    end up inside a token.

    Args:
        query: The raw query string.

    Returns:
        The maximal runs of word characters found in ``query``, in order
        of appearance; an empty list when there are none.
    """
    # The pattern is a raw string: a backslash-w inside a plain literal is
    # an invalid escape sequence that recent Python versions warn about.
    # Collecting the word runs directly yields the same tokens as replacing
    # every non-word character with a space and splitting on whitespace.
    return re.findall(r"\w+", query)
def find_n_first_elements(self, similarities, n=10):
    """Return the keys of ``similarities`` with the highest values.

    Args:
        similarities: Mapping from a key to its comparable score.
        n: Upper bound used to slice the ranked keys (default 10).

    Returns:
        The keys ordered by decreasing score, sliced to ``[:n]``. Keys
        with equal scores keep the mapping's iteration order.
    """
    # Sorting the keys by their mapped value gives the same stable,
    # descending order as sorting the (key, value) pairs on the value.
    ranked_keys = sorted(similarities, key=similarities.get, reverse=True)
    return ranked_keys[:n]
......@@ -30,7 +34,7 @@ class VectorialQuery(Query):
def timed_search(self, query):
tokens = query.split(' ')
tokens = self._split_query(query)
tf_query = defaultdict(int)
for token in tokens:
