Commit b382723f authored by Prot Alexandre

Add normalized frequency weight type to vectorial search

parent 1b57dbc4
from gogole import query
from gogole.query import vectorial_query
def run(collection, args):
......@@ -6,12 +7,13 @@ def run(collection, args):
query_cls = query.QUERY_MAP[args.search_query_type]
query_browser = query_cls(collection)
query_browser = query_cls(collection, args.weight_type)
print("searching {query} using {model} model".format(
print("searching {query} using {model} model and {weight} weight".format(
query=q,
model=args.search_query_type
model=args.search_query_type,
weight=args.weight_type
))
......
......@@ -9,7 +9,7 @@ from gogole.utils import timeit
from gogole.indexer import Indexer
class BSBIIndexer(Indexer):
BLOCK_SIZE = 12
BLOCK_SIZE = 16
def __init__(self, collection_name, maxsize=None):
"""
......@@ -63,12 +63,13 @@ class BSBIIndexer(Indexer):
with open(filename, 'wb') as f:
self.tmp_filenames.append(f.name)
for token_id, doc_id, frequency in sorted_tuples:
for token_id, doc_id, frequency, doc_max_frequency in sorted_tuples:
# assume we already are at the end of the file
b = bytearray()
b += struct.pack('i', token_id)
b += struct.pack('i', doc_id)
b += struct.pack('i', frequency)
b += struct.pack('i', doc_max_frequency)
f.write(b)
......@@ -87,12 +88,15 @@ class BSBIIndexer(Indexer):
# convert tokens to token ids
token_ids = set()
# get max frequency among tokens
_, max_frequency = counted_tokens.most_common(1)[0]
self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
for token, frequency in counted_tokens.items():
token_id = self.find_or_create_token_id(token)
self.buffer += [(token_id, doc_id, frequency)]
self.buffer += [(token_id, doc_id, frequency, max_frequency)]
if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
self.flush_buffer()
......@@ -167,8 +171,9 @@ class BSBIIndexer(Indexer):
if t_id == token_id:
doc_id = struct.unpack('i', f.read(4))[0]
frequency = struct.unpack('i', f.read(4))[0]
max_frequency = struct.unpack('i', f.read(4))[0]
document_ids[doc_id] = frequency
document_ids[doc_id] = frequency, max_frequency
for p in [pos+1, pos-1]:
if p not in visited and lower_bound <= p and upper_bound >= p:
......
class Query:
    """Abstract base class for search queries over a document collection.

    Concrete models (e.g. boolean, vectorial) subclass this and implement
    :meth:`search`.
    """

    def __init__(self, collection, weight_type):
        # collection: the document collection to search against
        # weight_type: weighting-scheme identifier, e.g. 'tf-idf',
        #   'norm-tf-idf' or 'norm-freq' (selected on the CLI)
        self.collection = collection
        self.weight_type = weight_type

    def search(self, query):
        """Run *query* against the collection.

        Subclasses must override this; the base implementation always raises.
        """
        raise Exception('search not implemented')
......@@ -4,6 +4,11 @@ import math
from gogole.query import Query
from gogole.utils import timeit
WEIGHTING_TYPE_TF_IDF = "tf-idf"
WEIGHTING_TYPE_NORM_TF_IDF = "norm-tf-idf"
WEIGHTING_TYPE_NORM_FREQ = "norm-freq"
class VectorialQuery(Query):
def find_n_first_elements(self, similarities, n=10):
......@@ -42,8 +47,12 @@ class VectorialQuery(Query):
if len(doc_ids) > 0:
df[token] += len(doc_ids)
for doc_id,freq in doc_ids.items():
tf[doc_id][token] += freq
for doc_id, freq in doc_ids.items():
token_freq, max_freq = freq
if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
tf[doc_id][token] += (token_freq / max_freq)
else:
tf[doc_id][token] += token_freq
similarities = defaultdict(int)
......@@ -58,7 +67,10 @@ class VectorialQuery(Query):
if tokens_frequency[token] == 0:
raise Exception("frequency of {} is 0".format(token))
doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
doc_weight = 1 + math.log10(tokens_frequency[token])
else:
doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
query_weight = (1 + math.log10(tf_query[token]))
......
......@@ -65,10 +65,13 @@ def build_cli_search_parser(root_parser):
search_parser = root_parser.add_parser('search', description='search for documents')
search_parser.add_argument('-b', '--boolean', action='store_const', const='boolean', dest='search_query_type', help="use the booolean model")
search_parser.add_argument('-v', '--vectorial', action='store_const', const='vectorial', dest='search_query_type', help="use the vectorial model")
search_parser.add_argument('--tf-idf', action='store_const', const='tf-idf', dest='weight_type', help="use the tf-idf weight type")
search_parser.add_argument('--norm-tf-idf', action='store_const', const='norm-tf-idf', dest='weight_type', help="use the normalized tf-idf weight type")
search_parser.add_argument('--norm-freq', action='store_const', const='norm-freq', dest='weight_type', help="use the normalized frequency weight type")
search_parser.add_argument('query', help="your query", nargs='*')
search_parser.set_defaults(weight_type='tf-idf')
search_parser.set_defaults(search_query_type='boolean')
def build_cli_parser():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment