Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
gogole
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Container Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dos Santos David
gogole
Commits
b382723f
Commit
b382723f
authored
Jan 30, 2018
by
Prot Alexandre
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
adding normalized frequency weight type to vectorial search
parent
1b57dbc4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
35 additions
and
12 deletions
+35
-12
search_command.py
gogole/commands/search_command.py
+5
-3
bsbi_indexer.py
gogole/indexer/bsbi_indexer.py
+9
-4
query.py
gogole/query/query.py
+2
-1
vectorial_query.py
gogole/query/vectorial_query.py
+15
-3
main.py
main.py
+4
-1
No files found.
gogole/commands/search_command.py
View file @
b382723f
from
gogole
import
query
from
gogole.query
import
vectorial_query
def
run
(
collection
,
args
):
...
...
@@ -6,12 +7,13 @@ def run(collection, args):
query_cls
=
query
.
QUERY_MAP
[
args
.
search_query_type
]
query_browser
=
query_cls
(
collection
)
query_browser
=
query_cls
(
collection
,
args
.
weight_type
)
print
(
"searching {query} using {model} model"
.
format
(
print
(
"searching {query} using {model} model
and {weight} weight
"
.
format
(
query
=
q
,
model
=
args
.
search_query_type
model
=
args
.
search_query_type
,
weight
=
args
.
weight_type
))
...
...
gogole/indexer/bsbi_indexer.py
View file @
b382723f
...
...
@@ -9,7 +9,7 @@ from gogole.utils import timeit
from
gogole.indexer
import
Indexer
class
BSBIIndexer
(
Indexer
):
BLOCK_SIZE
=
1
2
BLOCK_SIZE
=
1
6
def
__init__
(
self
,
collection_name
,
maxsize
=
None
):
"""
...
...
@@ -63,12 +63,13 @@ class BSBIIndexer(Indexer):
with
open
(
filename
,
'wb'
)
as
f
:
self
.
tmp_filenames
.
append
(
f
.
name
)
for
token_id
,
doc_id
,
frequency
in
sorted_tuples
:
for
token_id
,
doc_id
,
frequency
,
doc_max_frequency
in
sorted_tuples
:
# assume we already are at the end of the file
b
=
bytearray
()
b
+=
struct
.
pack
(
'i'
,
token_id
)
b
+=
struct
.
pack
(
'i'
,
doc_id
)
b
+=
struct
.
pack
(
'i'
,
frequency
)
b
+=
struct
.
pack
(
'i'
,
doc_max_frequency
)
f
.
write
(
b
)
...
...
@@ -87,12 +88,15 @@ class BSBIIndexer(Indexer):
# convert tokens to token ids
token_ids
=
set
()
# get max frequency among tokens
_
,
max_frequency
=
counted_tokens
.
most_common
(
1
)[
0
]
self
.
document_norms
[
doc_id
]
=
self
.
compute_document_vector_norm
(
counted_tokens
)
for
token
,
frequency
in
counted_tokens
.
items
():
token_id
=
self
.
find_or_create_token_id
(
token
)
self
.
buffer
+=
[(
token_id
,
doc_id
,
frequency
)]
self
.
buffer
+=
[(
token_id
,
doc_id
,
frequency
,
max_frequency
)]
if
self
.
maxsize
is
not
None
and
self
.
BLOCK_SIZE
*
len
(
self
.
buffer
)
>=
self
.
maxsize
:
self
.
flush_buffer
()
...
...
@@ -167,8 +171,9 @@ class BSBIIndexer(Indexer):
if
t_id
==
token_id
:
doc_id
=
struct
.
unpack
(
'i'
,
f
.
read
(
4
))[
0
]
frequency
=
struct
.
unpack
(
'i'
,
f
.
read
(
4
))[
0
]
max_frequency
=
struct
.
unpack
(
'i'
,
f
.
read
(
4
))[
0
]
document_ids
[
doc_id
]
=
frequency
document_ids
[
doc_id
]
=
frequency
,
max_frequency
for
p
in
[
pos
+
1
,
pos
-
1
]:
if
p
not
in
visited
and
lower_bound
<=
p
and
upper_bound
>=
p
:
...
...
gogole/query/query.py
View file @
b382723f
class
Query
:
def
__init__
(
self
,
collection
):
def
__init__
(
self
,
collection
,
weight_type
):
self
.
collection
=
collection
self
.
weight_type
=
weight_type
def
search
(
self
,
query
):
raise
Exception
(
'search not implemented'
)
gogole/query/vectorial_query.py
View file @
b382723f
...
...
@@ -4,6 +4,11 @@ import math
from
gogole.query
import
Query
from
gogole.utils
import
timeit
WEIGHTING_TYPE_TF_IDF
=
"tf-idf"
WEIGHTING_TYPE_NORM_TF_IDF
=
"norm-tf-idf"
WEIGHTING_TYPE_NORM_FREQ
=
"norm-freq"
class
VectorialQuery
(
Query
):
def
find_n_first_elements
(
self
,
similarities
,
n
=
10
):
...
...
@@ -42,8 +47,12 @@ class VectorialQuery(Query):
if
len
(
doc_ids
)
>
0
:
df
[
token
]
+=
len
(
doc_ids
)
for
doc_id
,
freq
in
doc_ids
.
items
():
tf
[
doc_id
][
token
]
+=
freq
for
doc_id
,
freq
in
doc_ids
.
items
():
token_freq
,
max_freq
=
freq
if
self
.
weight_type
==
WEIGHTING_TYPE_NORM_FREQ
:
tf
[
doc_id
][
token
]
+=
(
token_freq
/
max_freq
)
else
:
tf
[
doc_id
][
token
]
+=
token_freq
similarities
=
defaultdict
(
int
)
...
...
@@ -58,7 +67,10 @@ class VectorialQuery(Query):
if
tokens_frequency
[
token
]
==
0
:
raise
Exception
(
"frequency of {} is 0"
.
format
(
token
))
doc_weight
=
(
1
+
math
.
log10
(
tokens_frequency
[
token
]))
*
math
.
log10
(
N
/
token_df
)
if
self
.
weight_type
==
WEIGHTING_TYPE_NORM_FREQ
:
doc_weight
=
1
+
math
.
log10
(
tokens_frequency
[
token
])
else
:
doc_weight
=
(
1
+
math
.
log10
(
tokens_frequency
[
token
]))
*
math
.
log10
(
N
/
token_df
)
query_weight
=
(
1
+
math
.
log10
(
tf_query
[
token
]))
...
...
main.py
View file @
b382723f
...
...
@@ -65,10 +65,13 @@ def build_cli_search_parser(root_parser):
search_parser
=
root_parser
.
add_parser
(
'search'
,
description
=
'search for documents'
)
search_parser
.
add_argument
(
'-b'
,
'--boolean'
,
action
=
'store_const'
,
const
=
'boolean'
,
dest
=
'search_query_type'
,
help
=
"use the booolean model"
)
search_parser
.
add_argument
(
'-v'
,
'--vectorial'
,
action
=
'store_const'
,
const
=
'vectorial'
,
dest
=
'search_query_type'
,
help
=
"use the vectorial model"
)
search_parser
.
add_argument
(
'--tf-idf'
,
action
=
'store_const'
,
const
=
'tf-idf'
,
dest
=
'weight_type'
,
help
=
"use the tf-idf weight type"
)
search_parser
.
add_argument
(
'--norm-tf-idf'
,
action
=
'store_const'
,
const
=
'norm-tf-idf'
,
dest
=
'weight_type'
,
help
=
"use the normalized tf-idf weight type"
)
search_parser
.
add_argument
(
'--norm-freq'
,
action
=
'store_const'
,
const
=
'norm-freq'
,
dest
=
'weight_type'
,
help
=
"use the normalized frequency weight type"
)
search_parser
.
add_argument
(
'query'
,
help
=
"your query"
,
nargs
=
'*'
)
search_parser
.
set_defaults
(
weight_type
=
'tf-idf'
)
search_parser
.
set_defaults
(
search_query_type
=
'boolean'
)
def
build_cli_parser
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment