Commit b382723f authored by Prot Alexandre

adding normalized frequency weight type to vectorial search

parent 1b57dbc4
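The idea behind the new weight type: instead of weighting a term by its raw count in a document, divide that count by the largest term count in the same document, so long documents do not win just by repeating words. A minimal sketch of that normalization (the function name and example are illustrative, not from the repo):

```python
from collections import Counter

def normalized_tf(counted_tokens: Counter) -> dict:
    # Largest raw term frequency in the document, obtained the same way as
    # counted_tokens.most_common(1)[0] in the indexer diff below.
    _, max_frequency = counted_tokens.most_common(1)[0]
    # Every frequency is scaled into (0, 1] by that maximum.
    return {token: freq / max_frequency for token, freq in counted_tokens.items()}

print(normalized_tf(Counter("to be or not to be".split())))
# {'to': 1.0, 'be': 1.0, 'or': 0.5, 'not': 0.5}
```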
```diff
 from gogole import query
+from gogole.query import vectorial_query
 def run(collection, args):
@@ -6,12 +7,13 @@ def run(collection, args):
     query_cls = query.QUERY_MAP[args.search_query_type]
-    query_browser = query_cls(collection)
+    query_browser = query_cls(collection, args.weight_type)
-    print("searching {query} using {model} model".format(
+    print("searching {query} using {model} model and {weight} weight".format(
         query=q,
-        model=args.search_query_type
+        model=args.search_query_type,
+        weight=args.weight_type
     ))
...
```
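The run() changes thread args.weight_type through to whichever query class QUERY_MAP selects, so every class in the map must now accept it. A self-contained sketch of that dispatch (the map contents and class bodies here are stand-ins, not the repo's):

```python
# Stand-ins for gogole.query.QUERY_MAP and its classes.
class BooleanQuery:
    def __init__(self, collection, weight_type):
        self.collection = collection
        self.weight_type = weight_type

class VectorialQuery(BooleanQuery):
    pass

QUERY_MAP = {'boolean': BooleanQuery, 'vectorial': VectorialQuery}

def make_query_browser(collection, search_query_type, weight_type):
    query_cls = QUERY_MAP[search_query_type]
    # Mirrors the diff: the constructor now takes the weight type too.
    return query_cls(collection, weight_type)

browser = make_query_browser([], 'vectorial', 'norm-freq')
print(type(browser).__name__, browser.weight_type)  # VectorialQuery norm-freq
```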
```diff
@@ -9,7 +9,7 @@ from gogole.utils import timeit
 from gogole.indexer import Indexer
 class BSBIIndexer(Indexer):
-    BLOCK_SIZE = 12
+    BLOCK_SIZE = 16
     def __init__(self, collection_name, maxsize=None):
         """
@@ -63,12 +63,13 @@ class BSBIIndexer(Indexer):
         with open(filename, 'wb') as f:
             self.tmp_filenames.append(f.name)
-            for token_id, doc_id, frequency in sorted_tuples:
+            for token_id, doc_id, frequency, doc_max_frequency in sorted_tuples:
                 # assume we already are at the end of the file
                 b = bytearray()
                 b += struct.pack('i', token_id)
                 b += struct.pack('i', doc_id)
                 b += struct.pack('i', frequency)
+                b += struct.pack('i', doc_max_frequency)
                 f.write(b)
@@ -87,12 +88,15 @@ class BSBIIndexer(Indexer):
         # convert tokens to token ids
         token_ids = set()
+        # get max frequency among tokens
+        _, max_frequency = counted_tokens.most_common(1)[0]
+
         self.document_norms[doc_id] = self.compute_document_vector_norm(counted_tokens)
         for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
-            self.buffer += [(token_id, doc_id, frequency)]
+            self.buffer += [(token_id, doc_id, frequency, max_frequency)]
             if self.maxsize is not None and self.BLOCK_SIZE*len(self.buffer) >= self.maxsize:
                 self.flush_buffer()
@@ -167,8 +171,9 @@ class BSBIIndexer(Indexer):
                 if t_id == token_id:
                     doc_id = struct.unpack('i', f.read(4))[0]
                     frequency = struct.unpack('i', f.read(4))[0]
+                    max_frequency = struct.unpack('i', f.read(4))[0]
-                    document_ids[doc_id] = frequency
+                    document_ids[doc_id] = frequency, max_frequency
                 for p in [pos+1, pos-1]:
                     if p not in visited and lower_bound <= p and upper_bound >= p:
...
```
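Worth noting why BLOCK_SIZE moves from 12 to 16: each posting on disk was three 4-byte ints (token id, doc id, frequency) and gains a fourth for the document's max frequency. A round-trip sketch with invented values:

```python
import struct

# A posting record is now four 4-byte ints: 16 bytes, matching BLOCK_SIZE = 16.
token_id, doc_id, frequency, doc_max_frequency = 7, 42, 3, 9

b = bytearray()
b += struct.pack('i', token_id)
b += struct.pack('i', doc_id)
b += struct.pack('i', frequency)
b += struct.pack('i', doc_max_frequency)
assert len(b) == 16

# Read it back field by field, as the index lookup above does.
print(struct.unpack('iiii', bytes(b)))  # (7, 42, 3, 9)
```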
```diff
 class Query:
-    def __init__(self, collection):
+    def __init__(self, collection, weight_type):
         self.collection = collection
+        self.weight_type = weight_type
     def search(self, query):
         raise Exception('search not implemented')
```
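Since the base class now stores weight_type, every subclass gets it for free; a toy subclass (invented for illustration) showing the contract:

```python
class Query:
    def __init__(self, collection, weight_type):
        self.collection = collection
        self.weight_type = weight_type

    def search(self, query):
        raise Exception('search not implemented')

class EchoQuery(Query):
    # Hypothetical subclass: shows weight_type reaching search().
    def search(self, query):
        return "searching {!r} with {}".format(query, self.weight_type)

print(EchoQuery(collection=None, weight_type='norm-freq').search('foo'))
# searching 'foo' with norm-freq
```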
```diff
@@ -4,6 +4,11 @@ import math
 from gogole.query import Query
 from gogole.utils import timeit
+WEIGHTING_TYPE_TF_IDF = "tf-idf"
+WEIGHTING_TYPE_NORM_TF_IDF = "norm-tf-idf"
+WEIGHTING_TYPE_NORM_FREQ = "norm-freq"
 class VectorialQuery(Query):
     def find_n_first_elements(self, similarities, n=10):
@@ -42,8 +47,12 @@ class VectorialQuery(Query):
             if len(doc_ids) > 0:
                 df[token] += len(doc_ids)
-                for doc_id,freq in doc_ids.items():
-                    tf[doc_id][token] += freq
+                for doc_id, freq in doc_ids.items():
+                    token_freq, max_freq = freq
+                    if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
+                        tf[doc_id][token] += (token_freq / max_freq)
+                    else:
+                        tf[doc_id][token] += token_freq
         similarities = defaultdict(int)
@@ -58,7 +67,10 @@ class VectorialQuery(Query):
             if tokens_frequency[token] == 0:
                 raise Exception("frequency of {} is 0".format(token))
-            doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
+            if self.weight_type == WEIGHTING_TYPE_NORM_FREQ:
+                doc_weight = 1 + math.log10(tokens_frequency[token])
+            else:
+                doc_weight = (1 + math.log10(tokens_frequency[token])) * math.log10(N/token_df)
             query_weight = (1 + math.log10(tf_query[token]))
...
```
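The two branches differ in both inputs and formula: under norm-freq the tf entry was already divided by the document's max frequency and the idf factor is dropped, while the default keeps raw counts and multiplies by log10(N/df). A worked comparison with toy numbers (N, df, and the frequencies are invented):

```python
import math

N, token_df = 1000, 10       # toy collection size and document frequency
token_freq, max_freq = 4, 8  # raw term frequency and the document's max

# default tf-idf branch: log-damped raw frequency, scaled by idf
tf_idf_weight = (1 + math.log10(token_freq)) * math.log10(N / token_df)

# norm-freq branch: frequency pre-divided by max_freq, no idf factor
norm_freq_weight = 1 + math.log10(token_freq / max_freq)

print(round(tf_idf_weight, 3), round(norm_freq_weight, 3))  # 3.204 0.699
```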
```diff
@@ -65,10 +65,13 @@ def build_cli_search_parser(root_parser):
     search_parser = root_parser.add_parser('search', description='search for documents')
     search_parser.add_argument('-b', '--boolean', action='store_const', const='boolean', dest='search_query_type', help="use the boolean model")
     search_parser.add_argument('-v', '--vectorial', action='store_const', const='vectorial', dest='search_query_type', help="use the vectorial model")
+    search_parser.add_argument('--tf-idf', action='store_const', const='tf-idf', dest='weight_type', help="use the tf-idf weight type")
+    search_parser.add_argument('--norm-tf-idf', action='store_const', const='norm-tf-idf', dest='weight_type', help="use the normalized tf-idf weight type")
+    search_parser.add_argument('--norm-freq', action='store_const', const='norm-freq', dest='weight_type', help="use the normalized frequency weight type")
     search_parser.add_argument('query', help="your query", nargs='*')
+    search_parser.set_defaults(weight_type='tf-idf')
     search_parser.set_defaults(search_query_type='boolean')
 def build_cli_parser():
...
```
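A quick check that the new flags and defaults behave as wired above, using a standalone parser built the same way (the prog name and the rest of the CLI are assumptions):

```python
import argparse

root = argparse.ArgumentParser(prog='gogole')  # prog name assumed
sub = root.add_subparsers()
search = sub.add_parser('search')
search.add_argument('-v', '--vectorial', action='store_const',
                    const='vectorial', dest='search_query_type')
search.add_argument('--norm-freq', action='store_const',
                    const='norm-freq', dest='weight_type')
search.add_argument('query', nargs='*')
search.set_defaults(weight_type='tf-idf', search_query_type='boolean')

args = root.parse_args(['search', '-v', '--norm-freq', 'foo'])
print(args.search_query_type, args.weight_type, args.query)
# vectorial norm-freq ['foo']
print(root.parse_args(['search', 'foo']).weight_type)  # tf-idf (the default)
```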