Commit 6815b51c, authored Feb 01, 2018 by Dos Santos David
add docs and renaming
parent 5ddcd259
Showing 1 changed file with 68 additions and 19 deletions.
gogole/indexer/bsbi_indexer.py  +68  -19
@@ -4,6 +4,7 @@ import heapq
 import os
 import math
 import pickle
+from collections import defaultdict
 
 from gogole.utils import timeit
 from gogole.indexer import Indexer
@@ -12,46 +13,85 @@ class BSBIIndexer(Indexer):
     BLOCK_SIZE = 16
 
     def __init__(self, collection_name, maxsize=None):
-        """
-        :param maxsize: max size of the buffer (in bytes)
-        """
-        self.tokens_map = dict() # map a token id to a token
-        self.init_token_id_seq() # next token id
+        """BSBIIndexer constructs an inverted index on disk
+
+        Arguments:
+            collection_name {string} -- the identifier of the collection;
+                used to retrieve the correct index on the disk
+
+        Keyword Arguments:
+            maxsize {integer} -- max size of the buffer (default: {None})
+        """
+        self.token_to_token_id = dict()
+
+        # initialize the token id sequence.
+        # it basically initializes an iterator producing the integers 0,1,2...
+        self.init_token_id_seq()
+
+        # max size of the buffer
+        # if the buffer exceeds this limit, it is written to the disk
         self.maxsize = maxsize
+
         self.buffer = []
 
+        # temporary files for the inverted index
         self.tmp_filename_format = '.cache/{}_tmp_index_{{}}'.format(collection_name)
         self.tmp_filenames = []
         self.tmp_file_id_seq = itertools.count()
+
         self.document_norms = dict()
 
-        self.INDEX_FILE = '.cache/{}_index'.format(collection_name)
+        # names of the index files written to the disk:
+        # * the inverted index is a map token_id -> postings (described as (doc_id, frequency))
+        self.INVERTED_INDEX_FILE = '.cache/{}_index'.format(collection_name)
+        # * it maps the token id to a token
         self.TOKENS_MAP_FILE = '.cache/{}_tokens_map'.format(collection_name)
         self.DOCUMENT_NORMS_FILE = '.cache/{}_document_norms'.format(collection_name)
+        # * it stores for each document:
+        #   - its norm
+        #   - the max frequency of its tokens
+        self.DOCUMENT_METADATA_FILE = '.cache/{}_documents_metadata'.format(collection_name)
 
+        # by default, the status of an index is NOT_CREATED
         self.status = self.INDEX_STATUS_NOT_CREATED
+
+        # dict: token_id -> number of documents having this token
+        self.token_id_to_df = defaultdict(int)
 
     def get_collection_size(self):
+        # TODO: use the size of the documents_norms file
         return len(self.document_norms)
 
     def init_token_id_seq(self, start=0):
         self.token_id_seq = itertools.count(start=start)
 
     def find_or_create_token_id(self, token):
-        if token not in self.tokens_map:
+        """hash function of the token: returns the id of the token.
+
+        If the token was already analyzed, it returns its existing id;
+        otherwise it returns an id never used before.
+
+        Arguments:
+            token {string} -- the token to hash
+
+        Returns:
+            [int] -- the token_id corresponding to the token
+        """
+        if token not in self.token_to_token_id:
             token_id = next(self.token_id_seq)
-            self.tokens_map[token] = token_id
+            self.token_to_token_id[token] = token_id
             return token_id
         else:
-            return self.tokens_map[token]
+            return self.token_to_token_id[token]
 
     def cleanup(self):
+        """Cleanup temporary files"""
         for filename in self.tmp_filenames:
             os.remove(filename)
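
The comments above pin down the on-disk layout: the inverted index is a flat file of fixed-size postings, one BLOCK_SIZE-byte record each, which is what makes both the buffer-size arithmetic and the block-wise binary search further down possible. The exact binary format is not part of this diff; the following is only a minimal sketch of one plausible encoding, assuming four 32-bit fields:

import struct

# Hypothetical record layout (the real one is not shown in this commit):
# four big-endian unsigned 32-bit ints = 16 bytes, matching BLOCK_SIZE = 16.
# Big-endian matters if raw blocks are compared directly (as heapq.merge
# does below): the lexicographic order of the bytes then matches the
# numeric order of (token_id, doc_id, ...).
POSTING_FORMAT = '>IIII'  # token_id, doc_id, frequency, max_frequency
POSTING_SIZE = struct.calcsize(POSTING_FORMAT)  # 16 bytes

def pack_posting(token_id, doc_id, frequency, max_frequency):
    return struct.pack(POSTING_FORMAT, token_id, doc_id, frequency, max_frequency)

def unpack_posting(block):
    return struct.unpack(POSTING_FORMAT, block)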
@@ -84,6 +124,13 @@ class BSBIIndexer(Indexer):
         return math.sqrt(norm)
 
     def add_document_tokens(self, document, counted_tokens):
+        """Add a document and its tokens to the buffer
+
+        Arguments:
+            document {Document} -- the document
+            counted_tokens {Counter} -- its counted tokens
+        """
+
         doc_id = document.document_id
         # convert tokens to token ids
         token_ids = set()
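
The new docstring fixes the expected types: a Document exposing document_id, and a collections.Counter of the document's tokens. A hypothetical call site, for illustration only (indexer and document stand in for objects built elsewhere in the project):

from collections import Counter

tokens = ['the', 'quick', 'fox', 'the']   # output of some tokenizer
counted_tokens = Counter(tokens)          # Counter({'the': 2, 'quick': 1, 'fox': 1})
indexer.add_document_tokens(document, counted_tokens)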
@@ -96,6 +143,8 @@ class BSBIIndexer(Indexer):
         for token, frequency in counted_tokens.items():
             token_id = self.find_or_create_token_id(token)
+            self.token_id_to_df[token_id] += 1
+
             self.buffer += [(token_id, doc_id, frequency, max_frequency)]
 
         if self.maxsize is not None and self.BLOCK_SIZE * len(self.buffer) >= self.maxsize:
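
Since every buffered posting becomes one BLOCK_SIZE-byte record, self.BLOCK_SIZE * len(self.buffer) is exactly the buffer's size once on disk, and crossing maxsize triggers a flush to a temporary file. The flush itself is outside this hunk; a sketch of what it plausibly does (flush_buffer is a hypothetical name, reusing the pack_posting sketch above):

def flush_buffer(self):
    # sort by (token_id, doc_id) so each temp file is a sorted run;
    # BSBI relies on this to merge the runs with heapq.merge later
    self.buffer.sort()
    filename = self.tmp_filename_format.format(next(self.tmp_file_id_seq))
    with open(filename, 'wb') as f:
        for posting in self.buffer:
            f.write(pack_posting(*posting))
    self.tmp_filenames.append(filename)
    self.buffer = []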
@@ -120,7 +169,7 @@ class BSBIIndexer(Indexer):
         try:
             merged_tuples_iterator = heapq.merge(*map(self._read_in_chunks, tmp_files))
 
-            with open(self.INDEX_FILE, 'wb') as f:
+            with open(self.INVERTED_INDEX_FILE, 'wb') as f:
                 for t in merged_tuples_iterator:
                     # TODO: maybe write by block ?
                     f.write(t)
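
heapq.merge performs a lazy k-way merge of already-sorted iterables, so the full index never has to fit in memory: it streams one record at a time from each temporary file. _read_in_chunks is not shown in this commit; assuming fixed-size records, it is plausibly a generator along these lines:

def _read_in_chunks(self, f):
    # yield one BLOCK_SIZE-byte record at a time; each temp file was
    # written from a sorted buffer, so this stream is a sorted run
    while True:
        block = f.read(self.BLOCK_SIZE)
        if not block:
            return
        yield block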
@@ -146,12 +195,12 @@ class BSBIIndexer(Indexer):
         """
         document_ids = dict()
 
-        if token not in self.tokens_map:
+        if token not in self.token_to_token_id:
             return document_ids
 
-        token_id = self.tokens_map[token]
+        token_id = self.token_to_token_id[token]
 
-        with open(self.INDEX_FILE, 'rb') as f:
+        with open(self.INVERTED_INDEX_FILE, 'rb') as f:
             upper_bound = (os.fstat(f.fileno()).st_size) // self.BLOCK_SIZE
             lower_bound = 0
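
With fixed-size records, upper_bound is simply the number of blocks in the file, and the postings for a token can be located by binary search on the block index: seek to mid * BLOCK_SIZE, read one block, and compare its token_id to the target. The search loop itself is elided from this hunk; a sketch under the record layout assumed earlier (find_first_block is a hypothetical helper):

def find_first_block(f, token_id, lower_bound, upper_bound, block_size=16):
    # classic lower-bound binary search over fixed-size records;
    # postings are sorted by (token_id, doc_id), so all postings of a
    # token form one contiguous range of blocks
    while lower_bound < upper_bound:
        mid = (lower_bound + upper_bound) // 2
        f.seek(mid * block_size)
        record = unpack_posting(f.read(block_size))  # see sketch above
        if record[0] < token_id:
            lower_bound = mid + 1
        else:
            upper_bound = mid
    # first block whose token_id is >= target; the caller must still check
    # that the block at this index actually holds the requested token_id
    return lower_bound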
@@ -194,7 +243,7 @@ class BSBIIndexer(Indexer):
     def save_to_disk(self):
         with open(self.TOKENS_MAP_FILE, 'wb') as f:
-            pickle.dump(self.tokens_map, f, pickle.HIGHEST_PROTOCOL)
+            pickle.dump(self.token_to_token_id, f, pickle.HIGHEST_PROTOCOL)
 
         with open(self.DOCUMENT_NORMS_FILE, 'wb') as f:
             pickle.dump(self.document_norms, f, pickle.HIGHEST_PROTOCOL)
@@ -202,7 +251,7 @@ class BSBIIndexer(Indexer):
     def load_from_cache(self):
         try:
             with open(self.TOKENS_MAP_FILE, 'rb') as f:
-                self.tokens_map = pickle.load(f)
+                self.token_to_token_id = pickle.load(f)
 
             with open(self.DOCUMENT_NORMS_FILE, 'rb') as f:
                 self.document_norms = pickle.load(f)
@@ -213,10 +262,10 @@ class BSBIIndexer(Indexer):
         except FileNotFoundError:
             return False
 
-        self.init_token_id_seq(max(self.tokens_map.keys()))
+        self.init_token_id_seq(max(self.token_to_token_id.keys()))
 
     def get_index_size(self):
-        return os.stat(self.INDEX_FILE).st_size
+        return os.stat(self.INVERTED_INDEX_FILE).st_size
 
-    def get_tokens_map_size(self):
+    def get_token_to_token_id_size(self):
         return os.stat(self.TOKENS_MAP_FILE).st_size
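
After a successful load, the id generator has to resume past the ids already handed out, otherwise new tokens would collide with persisted ones. Note that token_to_token_id maps tokens to ids, so the assigned ids are the dict's values, while the line above takes max over the keys (the token strings themselves). A hedged sketch of what a resume step would plausibly look like (resume_token_id_seq is a hypothetical helper, not part of this commit):

def resume_token_id_seq(indexer):
    # restart the itertools.count one past the largest id already
    # assigned; the ids are the values of the dict, not its keys
    if indexer.token_to_token_id:
        indexer.init_token_id_seq(start=max(indexer.token_to_token_id.values()) + 1)
    else:
        indexer.init_token_id_seq()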