Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
gogole
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Container Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Dos Santos David
gogole
Commits
78703046
Commit
78703046
authored
Jan 30, 2018
by
Prot Alexandre
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
adding evaluation for cacm with precision and recall metrics
parent
d0b334e3
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
109 additions
and
9 deletions
+109
-9
README.md
README.md
+8
-0
__init__.py
gogole/commands/__init__.py
+2
-1
eval_command.py
gogole/commands/eval_command.py
+51
-0
cacm_document.py
gogole/document/cacm_document.py
+11
-6
__init__.py
gogole/parser/__init__.py
+1
-0
cacm_parser.py
gogole/parser/cacm_parser.py
+3
-2
qrels_parser.py
gogole/parser/qrels_parser.py
+21
-0
vectorial_query.py
gogole/query/vectorial_query.py
+2
-0
main.py
main.py
+10
-0
No files found.
README.md
View file @
78703046
...
...
@@ -67,3 +67,11 @@ gogole > search -v --tf-idf <query> # tf-idf
gogole
>
search
-v
--norm-tf-idf
<query>
# tf-idf normalisée
gogole
>
search
-v
--norm-freq
<query>
# fréquences normalisées
```
#### Evaluation
Sur la collection cacm uniquement, il est possible d'évaluer la pertinence des recherches vectorielles effectuées avec les différentes pondérations.
NB: L'évaluation demande aussi d'avoir construit l'index avec la commande
`index build`
\ No newline at end of file
gogole/commands/__init__.py
View file @
78703046
from
gogole.commands
import
analyze_command
,
index_command
,
search_command
from
gogole.commands
import
analyze_command
,
eval_command
,
index_command
,
search_command
MAIN_COMMANDS_MAP
=
{
'analyze'
:
analyze_command
.
run
,
'eval'
:
eval_command
.
run
,
'index'
:
index_command
.
run
,
'search'
:
search_command
.
run
}
gogole/commands/eval_command.py
0 → 100644
View file @
78703046
from
gogole
import
query
from
gogole.query
import
vectorial_query
from
gogole.parser
import
CACMParser
from
gogole.parser
import
QRelsParser
def run(collection, args):
    """Evaluate the vectorial search on the CACM queries and print the scores.

    For every weighting type listed in ``vectorial_query.WEIGHTING_TYPES``,
    each query read from ``data/query.text`` is searched, the first 10
    results are compared with the relevance judgments loaded by
    ``QRelsParser``, and the precision and recall averaged over the
    evaluated queries are printed.

    Args:
        collection: collection handed to the vectorial query class.
        args: parsed CLI arguments; ``args.nrequests[0]`` is the number of
            queries to evaluate.

    Raises:
        ValueError: if ``args.nrequests[0]`` cannot be converted to an int.
    """
    # The queries file has the same structure as the CACM collection, so the
    # CACM parser reads it as well.
    cacm_parser = CACMParser("data/query.text")
    nrequests = int(args.nrequests[0])

    relevant_docs_by_query = QRelsParser().parse_all(nrequests)

    # The query class does not depend on the weighting type: look it up once.
    query_cls = query.QUERY_MAP[query.QUERY_TYPE_VECTORIAL]

    for weight_type in vectorial_query.WEIGHTING_TYPES:
        precision_sum = 0
        recall_sum = 0
        nb_queries = 0

        # Here a document is a query wrapped in a CACMDocument.
        for document in cacm_parser.find_documents(limit=nrequests):
            # Queries without any relevance judgment cannot be scored; skip
            # them before paying for a search whose result would be dropped.
            relevant_docs = relevant_docs_by_query.get(document.document_id)
            if not relevant_docs:
                continue

            query_browser = query_cls(collection, weight_type)
            all_results, _ = query_browser.timed_search(document.abstract)
            n_results = list(
                query_browser.find_n_first_elements(all_results, n=10)
            )

            intersection_docs = [
                res for res in n_results if res in relevant_docs
            ]

            # An empty result list has a precision of 0 by convention here.
            if n_results:
                precision = len(intersection_docs) / len(n_results)
            else:
                precision = 0
            recall = len(intersection_docs) / len(relevant_docs)

            precision_sum += precision
            recall_sum += recall
            nb_queries += 1

        # Without any evaluated query the averages are undefined; report it
        # instead of failing with a ZeroDivisionError.
        if nb_queries == 0:
            print(
                "for weight {weight}: no query with relevance judgments, "
                "nothing to evaluate".format(weight=weight_type)
            )
            continue

        precision = precision_sum / nb_queries
        recall = recall_sum / nb_queries

        print(
            "for weight {weight}: precision: {precision}, rappel: {recall}".format(
                weight=weight_type,
                precision=precision,
                recall=recall,
            )
        )
gogole/document/cacm_document.py
View file @
78703046
...
...
@@ -36,6 +36,7 @@ class CACMDocument:
self
.
_keywords
=
value
.
strip
()
def
__str__
(
self
):
try
:
return
"[ID #{doc_id}] {title}
\n
{keywords}{abstract}"
.
format
(
doc_id
=
self
.
document_id
,
title
=
self
.
title
,
...
...
@@ -43,6 +44,10 @@ class CACMDocument:
abstract
=
"abstract : {}
\n
"
.
format
(
self
.
abstract
)
if
self
.
abstract
!=
""
else
""
)
# Queries are CACM documents with no title
except
AttributeError
:
return
"[QUERY #{doc_id}] {abstract}"
.
format
(
doc_id
=
self
.
document_id
,
abstract
=
self
.
abstract
)
def
get_raw_content
(
self
):
return
" "
.
join
([
self
.
title
,
self
.
abstract
,
self
.
keywords
])
...
...
gogole/parser/__init__.py
View file @
78703046
from
gogole.parser.cacm_parser
import
CACMParser
from
gogole.parser.stanford_parser
import
StanfordParser
from
gogole.parser.qrels_parser
import
QRelsParser
gogole/parser/cacm_parser.py
View file @
78703046
from
gogole.document
import
CACMDocument
class
CACMParser
:
FILENAME
=
"data/cacm.all"
class
CACMParser
:
MARKERS
=
{
'.I'
:
'document'
,
...
...
@@ -15,6 +14,8 @@ class CACMParser:
DOCUMENT_MARKER
=
'.I'
def
__init__
(
self
,
filename
=
"data/cacm.all"
):
self
.
FILENAME
=
filename
def
find_marker_in_line
(
self
,
line
):
"""
...
...
gogole/parser/qrels_parser.py
0 → 100644
View file @
78703046
class QRelsParser:
    """Parser for the relevance judgments file of the CACM collection."""

    # Default location of the judgments file; each non-blank line starts with
    # a query id followed by a document id, separated by whitespace.
    FILENAME = "data/qrels.text"

    def parse_all(self, limit=None):
        """Return the relevant document ids grouped by query id.

        Args:
            limit: highest query id to read. Reading stops at the first line
                whose query id is greater than ``limit``, so the file is
                expected to be sorted by query id. ``None`` reads everything.

        Returns:
            A dict mapping each query id (int) to the list of document ids
            (int) found for it, in file order.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if one of the two leading fields is not an integer.
            IndexError: if a non-blank line has fewer than two fields.
        """
        docs = {}

        with open(self.FILENAME) as f:
            for line in f:
                parsed = line.split()

                # Blank lines (e.g. a trailing newline) carry no judgment.
                if not parsed:
                    continue

                query_id = int(parsed[0])

                if limit is not None and query_id > limit:
                    return docs

                docs.setdefault(query_id, []).append(int(parsed[1]))

        return docs
gogole/query/vectorial_query.py
View file @
78703046
...
...
@@ -8,6 +8,8 @@ WEIGHTING_TYPE_TF_IDF = "tf-idf"
WEIGHTING_TYPE_NORM_TF_IDF
=
"norm-tf-idf"
WEIGHTING_TYPE_NORM_FREQ
=
"norm-freq"
# TODO: Implement norm-tf-idf weight type and add it to weighting types
WEIGHTING_TYPES
=
[
WEIGHTING_TYPE_NORM_FREQ
,
WEIGHTING_TYPE_TF_IDF
]
class
VectorialQuery
(
Query
):
...
...
main.py
View file @
78703046
...
...
@@ -43,6 +43,15 @@ def build_cli_analyze_parser(root_parser):
help
=
"can be any of {}"
.
format
(
", "
.
join
(
commands
.
analyze_command
.
COMMANDS
))
)
def build_cli_eval_parser(root_parser):
    """Register the ``eval`` sub-command on *root_parser*.

    Args:
        root_parser: the sub-parsers action returned by
            ``ArgumentParser.add_subparsers``.

    The sub-command takes one positional integer, ``nrequests`` (stored as a
    one-element list because of ``nargs=1``), and three flags that all store
    their value in ``weight_type`` (default ``'tf-idf'``).
    """
    eval_parser = root_parser.add_parser(
        'eval',
        description='evaluate for documents',
    )

    # type=int makes argparse reject non-numeric values up front; consumers
    # that still call int() on the value keep working.
    eval_parser.add_argument(
        'nrequests',
        nargs=1,
        type=int,
        help="number of queries to evaluate",
    )

    # NOTE(review): the eval command shown in this change iterates over every
    # weighting type and does not appear to read weight_type - confirm whether
    # these flags are meant to restrict the evaluation.
    eval_parser.add_argument(
        '--tf-idf',
        action='store_const',
        const='tf-idf',
        dest='weight_type',
        help="use the tf-idf weight type",
    )
    eval_parser.add_argument(
        '--norm-tf-idf',
        action='store_const',
        const='norm-tf-idf',
        dest='weight_type',
        help="use the normalized tf-idf weight type",
    )
    eval_parser.add_argument(
        '--norm-freq',
        action='store_const',
        const='norm-freq',
        dest='weight_type',
        help="use the normalized frequency weight type",
    )

    eval_parser.set_defaults(weight_type='tf-idf')
def
build_cli_index_parser
(
root_parser
):
...
...
@@ -80,6 +89,7 @@ def build_cli_parser():
cli_subparser
=
cli_parser
.
add_subparsers
(
dest
=
"main_command"
)
build_cli_analyze_parser
(
cli_subparser
)
build_cli_eval_parser
(
cli_subparser
)
build_cli_index_parser
(
cli_subparser
)
build_cli_search_parser
(
cli_subparser
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment