Commit 78703046 authored by Prot Alexandre's avatar Prot Alexandre

adding evaluation for cacm with precision and recall metrics

parent d0b334e3
......@@ -67,3 +67,11 @@ gogole > search -v --tf-idf <query> # tf-idf
gogole > search -v --norm-tf-idf <query> # tf-idf normalisée
gogole > search -v --norm-freq <query> # fréquences normalisées
```
#### Evaluation
Sur la collection cacm uniquement, il est possible d'évaluer la pertinence des recherches vectorielles effectuées avec les différentes pondérations.
NB: L'évaluation demande aussi d'avoir construit l'index avec la commande `index build`
\ No newline at end of file
from gogole.commands import analyze_command, index_command, search_command
from gogole.commands import analyze_command, eval_command, index_command, search_command
# Dispatch table for the top-level CLI: maps each command name entered by
# the user (analyze / eval / index / search) to the run() function of the
# module that implements it.
MAIN_COMMANDS_MAP = {
    'analyze': analyze_command.run,
    'eval': eval_command.run,
    'index': index_command.run,
    'search': search_command.run
}
from gogole import query
from gogole.query import vectorial_query
from gogole.parser import CACMParser
from gogole.parser import QRelsParser
def run(collection, args):
    """Evaluate vectorial search on the CACM collection.

    For each available weighting type, run the first ``nrequests`` CACM
    test queries, keep the 10 first results of each search, and print the
    mean precision and recall against the qrels relevance judgements.

    collection -- the document collection to search (the index must have
                  been built beforehand with ``index build``)
    args       -- parsed CLI arguments; ``args.nrequests`` is a
                  one-element list holding the number of queries to run
    """
    # The CACM query file shares the structure of the document file, so the
    # regular CACM parser can read it; each query comes back wrapped in a
    # CACMDocument whose abstract holds the query text.
    cacm_parser = CACMParser("data/query.text")
    nrequests = int(args.nrequests[0])

    qrels_parser = QRelsParser()
    relevant_docs_by_query = qrels_parser.parse_all(nrequests)

    for weight_type in vectorial_query.WEIGHTING_TYPES:
        precision_sum = 0
        recall_sum = 0
        nb_queries = 0

        # Here a document is a query wrapped in a CACMDocument
        for document in cacm_parser.find_documents(limit=nrequests):
            # If there is nothing for this query id, drop it *before*
            # running the (expensive) search — the original ran the search
            # first and then threw the results away.
            if document.document_id not in relevant_docs_by_query:
                continue
            relevant_docs = relevant_docs_by_query[document.document_id]

            q = document.abstract
            query_cls = query.QUERY_MAP[query.QUERY_TYPE_VECTORIAL]
            query_browser = query_cls(collection, weight_type)
            all_results, _ = query_browser.timed_search(q)
            # Keep only the 10 first results (the enumerate in the original
            # comprehension was dead weight: the index was never used).
            n_results = list(query_browser.find_n_first_elements(all_results, n=10))

            intersection_docs = [res for res in n_results if res in relevant_docs]
            precision = len(intersection_docs) / len(n_results) if n_results else 0
            # relevant_docs is never empty: the qrels parser only creates an
            # entry when it has seen at least one judged document.
            recall = len(intersection_docs) / len(relevant_docs)

            precision_sum += precision
            recall_sum += recall
            nb_queries += 1

        # Guard against a ZeroDivisionError when no query had judgements
        # (e.g. nrequests smaller than the first judged query id).
        if nb_queries == 0:
            print("for weight {weight}: no evaluated queries".format(weight=weight_type))
            continue
        precision = precision_sum / nb_queries
        recall = recall_sum / nb_queries
        print("for weight {weight}: precision: {precision}, rappel: {recall}".format(
            weight=weight_type, precision=precision, recall=recall))
......@@ -36,12 +36,17 @@ class CACMDocument:
self._keywords = value.strip()
def __str__(self):
    """Human-readable rendering of the document (or query).

    A regular document renders as ``[ID #n] title`` followed by optional
    keywords / abstract lines. Queries parsed from the CACM query file are
    CACMDocuments with no title attribute, so accessing ``self.title``
    raises AttributeError and they render as ``[QUERY #n] abstract``.

    NOTE(review): the diff residue left the old unconditional body *above*
    the try/except version, making the query branch unreachable; only the
    new version is kept here.
    """
    try:
        return "[ID #{doc_id}] {title}\n{keywords}{abstract}".format(
            doc_id=self.document_id,
            title=self.title,
            keywords="keywords: {}\n".format(self.keywords) if self.keywords != "" else "",
            abstract="abstract : {}\n".format(self.abstract) if self.abstract != "" else ""
        )
    # Queries are CACM documents with no title
    except AttributeError:
        return "[QUERY #{doc_id}] {abstract}".format(doc_id=self.document_id, abstract=self.abstract)
def get_raw_content(self):
......
from gogole.parser.cacm_parser import CACMParser
from gogole.parser.stanford_parser import StanfordParser
from gogole.parser.qrels_parser import QRelsParser
from gogole.document import CACMDocument
class CACMParser:
FILENAME = "data/cacm.all"
class CACMParser:
MARKERS = {
'.I': 'document',
......@@ -15,6 +14,8 @@ class CACMParser:
DOCUMENT_MARKER = '.I'
def __init__(self, filename="data/cacm.all"):
    # Path of the CACM-format file to parse. Defaults to the main document
    # collection; the query file (data/query.text) uses the same structure
    # and can be parsed by passing its path here instead.
    self.FILENAME = filename
def find_marker_in_line(self, line):
"""
......
class QRelsParser:
    """Parser for the CACM relevance judgements file (qrels.text).

    Each line of the file starts with "<query_id> <doc_id> ..."; the parser
    groups the judged document ids by query id.
    """

    FILENAME = "data/qrels.text"

    def parse_all(self, limit=None):
        """Parse the whole qrels file.

        limit -- if given, stop reading as soon as a query id greater than
                 ``limit`` is seen (assumes the file is sorted by query id
                 — TODO confirm against data/qrels.text)

        Returns a dict mapping query id -> list of relevant document ids,
        in file order. A query id appears as a key only if it has at least
        one judged document.
        """
        docs = {}
        with open(self.FILENAME) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (the original raised IndexError).
                    continue
                query_id = int(fields[0])
                if limit is not None and query_id > limit:
                    break
                # setdefault replaces the original if/else append dance.
                docs.setdefault(query_id, []).append(int(fields[1]))
        return docs
......@@ -8,6 +8,8 @@ WEIGHTING_TYPE_TF_IDF = "tf-idf"
WEIGHTING_TYPE_NORM_TF_IDF = "norm-tf-idf"
WEIGHTING_TYPE_NORM_FREQ = "norm-freq"

# Weighting schemes currently usable by vectorial queries (and iterated
# over by the eval command).
# TODO: Implement norm-tf-idf weight type and add it to weighting types
WEIGHTING_TYPES = [WEIGHTING_TYPE_NORM_FREQ, WEIGHTING_TYPE_TF_IDF]
class VectorialQuery(Query):
......
......@@ -43,6 +43,15 @@ def build_cli_analyze_parser(root_parser):
help="can be any of {}".format(", ".join(commands.analyze_command.COMMANDS))
)
def build_cli_eval_parser(root_parser):
    """Register the 'eval' sub-command and its weighting-type flags.

    root_parser -- the argparse sub-parsers object to attach 'eval' to.
    Exactly one positional argument (nrequests, kept as a one-element
    list) plus three mutually-overriding store_const flags that all write
    into ``weight_type``; the last flag given wins, defaulting to tf-idf.
    """
    eval_parser = root_parser.add_parser('eval', description='evaluate for documents')
    eval_parser.add_argument('nrequests', nargs=1)
    # Table-driven registration: one entry per weighting flag.
    weight_flags = (
        ('--tf-idf', 'tf-idf', "use the tf-idf weight type"),
        ('--norm-tf-idf', 'norm-tf-idf', "use the normalized tf-idf weight type"),
        ('--norm-freq', 'norm-freq', "use the normalized frequency weight type"),
    )
    for flag, value, help_text in weight_flags:
        eval_parser.add_argument(flag, action='store_const', const=value,
                                 dest='weight_type', help=help_text)
    eval_parser.set_defaults(weight_type='tf-idf')
def build_cli_index_parser(root_parser):
......@@ -80,6 +89,7 @@ def build_cli_parser():
cli_subparser = cli_parser.add_subparsers(dest="main_command")
build_cli_analyze_parser(cli_subparser)
build_cli_eval_parser(cli_subparser)
build_cli_index_parser(cli_subparser)
build_cli_search_parser(cli_subparser)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment