Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
gogole
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Container Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dos Santos David
gogole
Commits
383d7664
Commit
383d7664
authored
Jan 28, 2018
by
Dos Santos David
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
store document norms
parent
afa3163c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
6 deletions
+23
-6
bsbi_indexer.py
gogole/indexer/bsbi_indexer.py
+21
-4
boolean_query.py
gogole/query/boolean_query.py
+1
-1
vectorial_query.py
gogole/query/vectorial_query.py
+1
-1
No files found.
gogole/indexer/bsbi_indexer.py
View file @
383d7664
...
@@ -26,14 +26,17 @@ class BSBIIndexer(Indexer):
...
@@ -26,14 +26,17 @@ class BSBIIndexer(Indexer):
self
.
tmp_filenames
=
[]
self
.
tmp_filenames
=
[]
self
.
tmp_file_id_seq
=
itertools
.
count
()
self
.
tmp_file_id_seq
=
itertools
.
count
()
self
.
count_documents
=
0
self
.
document_norms
=
dict
()
self
.
INDEX_FILE
=
'.cache/{}_index'
.
format
(
collection_name
)
self
.
INDEX_FILE
=
'.cache/{}_index'
.
format
(
collection_name
)
self
.
TOKENS_MAP_FILE
=
'.cache/{}_tokens_map'
.
format
(
collection_name
)
self
.
TOKENS_MAP_FILE
=
'.cache/{}_tokens_map'
.
format
(
collection_name
)
self
.
DOCUMENT
S_MAP_FILE
=
'.cache/{}_documents_map
'
.
format
(
collection_name
)
self
.
DOCUMENT
_NORMS_FILE
=
'.cache/{}_document_norms
'
.
format
(
collection_name
)
self
.
status
=
self
.
INDEX_STATUS_NOT_CREATED
self
.
status
=
self
.
INDEX_STATUS_NOT_CREATED
def
get_collection_size
(
self
):
return
len
(
self
.
document_norms
)
def
init_token_id_seq
(
self
,
start
=
0
):
def
init_token_id_seq
(
self
,
start
=
0
):
self
.
token_id_seq
=
itertools
.
count
(
start
=
start
)
self
.
token_id_seq
=
itertools
.
count
(
start
=
start
)
...
@@ -72,14 +75,22 @@ class BSBIIndexer(Indexer):
...
@@ -72,14 +75,22 @@ class BSBIIndexer(Indexer):
# reset the buffer
# reset the buffer
self
.
buffer
=
[]
self
.
buffer
=
[]
def
compute_document_vector_norm
(
self
,
counted_tokens
):
norm
=
0
for
token
,
count
in
counted_tokens
.
items
():
norm
+=
(
1
+
math
.
log10
(
count
))
**
2
return
math
.
sqrt
(
norm
)
def
add_document_tokens
(
self
,
document
,
counted_tokens
):
def
add_document_tokens
(
self
,
document
,
counted_tokens
):
doc_id
=
document
.
document_id
doc_id
=
document
.
document_id
# convert tokens to token ids
# convert tokens to token ids
token_ids
=
set
()
token_ids
=
set
()
self
.
count_documents
+=
1
self
.
document_norms
[
doc_id
]
=
self
.
compute_document_vector_norm
(
counted_tokens
)
for
token
,
frequency
in
counted_tokens
.
items
():
for
token
,
frequency
in
counted_tokens
.
items
():
token_id
=
self
.
find_or_create_token_id
(
token
)
token_id
=
self
.
find_or_create_token_id
(
token
)
self
.
buffer
+=
[(
token_id
,
doc_id
,
frequency
)]
self
.
buffer
+=
[(
token_id
,
doc_id
,
frequency
)]
...
@@ -180,12 +191,18 @@ class BSBIIndexer(Indexer):
...
@@ -180,12 +191,18 @@ class BSBIIndexer(Indexer):
with
open
(
self
.
TOKENS_MAP_FILE
,
'wb'
)
as
f
:
with
open
(
self
.
TOKENS_MAP_FILE
,
'wb'
)
as
f
:
pickle
.
dump
(
self
.
tokens_map
,
f
,
pickle
.
HIGHEST_PROTOCOL
)
pickle
.
dump
(
self
.
tokens_map
,
f
,
pickle
.
HIGHEST_PROTOCOL
)
with
open
(
self
.
DOCUMENT_NORMS_FILE
,
'wb'
)
as
f
:
pickle
.
dump
(
self
.
document_norms
,
f
,
pickle
.
HIGHEST_PROTOCOL
)
def
load_from_cache
(
self
):
def
load_from_cache
(
self
):
try
:
try
:
with
open
(
self
.
TOKENS_MAP_FILE
,
'rb'
)
as
f
:
with
open
(
self
.
TOKENS_MAP_FILE
,
'rb'
)
as
f
:
self
.
status
=
self
.
INDEX_STATUS_CREATED
self
.
tokens_map
=
pickle
.
load
(
f
)
self
.
tokens_map
=
pickle
.
load
(
f
)
with
open
(
self
.
DOCUMENT_NORMS_FILE
,
'rb'
)
as
f
:
self
.
document_norms
=
pickle
.
load
(
f
)
self
.
status
=
self
.
INDEX_STATUS_CREATED
return
True
return
True
except
FileNotFoundError
:
except
FileNotFoundError
:
...
...
gogole/query/boolean_query.py
View file @
383d7664
...
@@ -9,7 +9,7 @@ class BooleanQuery(Query):
...
@@ -9,7 +9,7 @@ class BooleanQuery(Query):
# Assume the expression
# Assume the expression
# is in the conjunctive normal form
# is in the conjunctive normal form
last_doc_id
=
self
.
collection
.
indexer
.
count_documents
-
1
last_doc_id
=
self
.
collection
.
indexer
.
get_collection_size
()
-
1
and_queries
=
query
.
split
(
self
.
OPERATOR_AND
)
and_queries
=
query
.
split
(
self
.
OPERATOR_AND
)
doc_ids_by_conjunction
=
list
()
doc_ids_by_conjunction
=
list
()
...
...
gogole/query/vectorial_query.py
View file @
383d7664
...
@@ -34,7 +34,7 @@ class VectorialQuery(Query):
...
@@ -34,7 +34,7 @@ class VectorialQuery(Query):
tf
=
defaultdict
(
lambda
:
defaultdict
(
int
))
tf
=
defaultdict
(
lambda
:
defaultdict
(
int
))
N
=
self
.
collection
.
indexer
.
count_documents
N
=
self
.
collection
.
indexer
.
get_collection_size
()
for
token
in
tokens
:
for
token
in
tokens
:
doc_ids
=
self
.
collection
.
indexer
.
token_lookup_with_frequency
(
token
)
doc_ids
=
self
.
collection
.
indexer
.
token_lookup_with_frequency
(
token
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment