Commit 8bcfab93 authored by Dos Santos David

improve tokenization of the query

parent c86431fa
from collections import defaultdict
import math
import re
from gogole.query import Query
from gogole.utils import timeit
......@@ -12,6 +13,9 @@ WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF, WEIGHTING_TY
class VectorialQuery(Query):
def _split_query(self, query):
return re.sub("[^\w]", " ", query).split()
def find_n_first_elements(self, similarities, n=10):
    """Return the keys of ``similarities`` with the highest values.

    Args:
        similarities: Mapping from a key to a comparable score.
        n: Maximum number of keys to return (slice semantics apply).

    Returns:
        A list of keys ordered by descending score. Keys with equal
        scores keep the order in which the mapping yields them, because
        the sort is stable.
    """
    ranked_pairs = sorted(
        similarities.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Cut the ranking first, then drop the scores.
    return [key for key, _score in ranked_pairs[:n]]
......@@ -30,7 +34,7 @@ class VectorialQuery(Query):
@timeit
def timed_search(self, query):
tokens = query.split(' ')
tokens = self._split_query(query)
tf_query = defaultdict(int)
for token in tokens:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment