Ce serveur Gitlab sera éteint le 30 juin 2020, pensez à migrer vos projets vers les serveurs gitlab-research.centralesupelec.fr et gitlab-student.centralesupelec.fr !

Commit 78703046 authored by Prot Alexandre's avatar Prot Alexandre

adding evaluation for cacm with precision and recall metrics

parent d0b334e3
......@@ -67,3 +67,11 @@ gogole > search -v --tf-idf <query> # tf-idf
gogole > search -v --norm-tf-idf <query> # tf-idf normalisée
gogole > search -v --norm-freq <query> # fréquences normalisées
#### Evaluation
Sur la collection cacm uniquement, il est possible d'évaluer la pertinence des recherches vectorielles effectuées avec les différentes pondérations.
NB: L'évaluation demande aussi d'avoir construit l'index avec la commande `index build`
\ No newline at end of file
from gogole.commands import analyze_command, index_command, search_command
from gogole.commands import analyze_command, eval_command, index_command, search_command
'analyze': analyze_command.run,
'eval': eval_command.run,
'index': index_command.run,
'search': search_command.run
from gogole import query
from gogole.query import vectorial_query
from gogole.parser import CACMParser
from gogole.parser import QRelsParser
def run(collection, args):
    """Evaluate vectorial search on the CACM collection.

    For every available weighting scheme, runs the first *nrequests*
    CACM queries, keeps the 10 best-ranked documents per query, and
    prints the mean precision and recall against the qrels relevance
    judgements.

    :param collection: indexed collection to search in (must have been
        built beforehand with ``index build``)
    :param args: parsed CLI arguments; ``args.nrequests`` holds the
        number of queries to evaluate as a one-element list
    """
    # The CACM queries file shares the structure of the document file,
    # so the regular CACM parser can read it.
    cacm_parser = CACMParser("data/query.text")
    nrequests = int(args.nrequests[0])

    qrels_parser = QRelsParser()
    relevant_docs_by_query = qrels_parser.parse_all(nrequests)

    for weight_type in vectorial_query.WEIGHTING_TYPES:
        precision_sum = 0
        recall_sum = 0
        nb_queries = 0

        # Here a document is a query wrapped in a CACMDocument
        for document in cacm_parser.find_documents(limit=nrequests):
            # If there is nothing for this query id, drop it
            if document.document_id not in relevant_docs_by_query:
                continue
            relevant_docs = relevant_docs_by_query[document.document_id]

            q = document.abstract
            query_cls = query.QUERY_MAP[query.QUERY_TYPE_VECTORIAL]
            query_browser = query_cls(collection, weight_type)
            all_results, _elapsed = query_browser.timed_search(q)
            # Keep only the 10 best-ranked results for precision@10
            n_results = list(query_browser.find_n_first_elements(all_results, n=10))

            intersection_docs = [res for res in n_results if res in relevant_docs]
            # Guard against an empty result list (ZeroDivisionError)
            if len(n_results) != 0:
                precision = len(intersection_docs) / len(n_results)
            else:
                precision = 0
            recall = len(intersection_docs) / len(relevant_docs)

            precision_sum += precision
            recall_sum += recall
            nb_queries += 1

        # No query had relevance judgements: nothing to average
        if nb_queries == 0:
            continue
        precision = precision_sum / nb_queries
        recall = recall_sum / nb_queries
        print("for weight {weight}: precision: {precision}, rappel: {recall}".format(weight=weight_type, precision=precision, recall=recall))
......@@ -36,12 +36,17 @@ class CACMDocument:
self._keywords = value.strip()
def __str__(self):
    """Human-readable rendering of a CACM document.

    Regular documents render as ``[ID #id] title`` followed by optional
    keywords/abstract lines; queries — CACM documents with no title —
    fall back to a one-line ``[QUERY #id]`` form.
    """
    try:
        # NOTE(review): the diff duplicated this format() call and
        # detached the except from its try; reconstructed as try/except.
        return "[ID #{doc_id}] {title}\n{keywords}{abstract}".format(
            doc_id=self.document_id,
            title=self.title,
            keywords="keywords: {}\n".format(self.keywords) if self.keywords != "" else "",
            abstract="abstract : {}\n".format(self.abstract) if self.abstract != "" else ""
        )
    # Queries are CACM documents with no title
    except AttributeError:
        return "[QUERY #{doc_id}] {abstract}".format(doc_id=self.document_id, abstract=self.abstract)
def get_raw_content(self):
from gogole.parser.cacm_parser import CACMParser
from gogole.parser.stanford_parser import StanfordParser
from gogole.parser.qrels_parser import QRelsParser
from gogole.document import CACMDocument
class CACMParser:
FILENAME = "data/cacm.all"
class CACMParser:
'.I': 'document',
......@@ -15,6 +14,8 @@ class CACMParser:
def __init__(self, filename="data/cacm.all"):
    """Create a CACM parser reading from *filename*.

    :param filename: path to the CACM collection file; defaults to the
        bundled ``data/cacm.all``.
    """
    self.FILENAME = filename
def find_marker_in_line(self, line):
class QRelsParser:
    """Parser for the CACM relevance judgements (qrels) file.

    Each line is whitespace-separated and starts with a query id
    followed by one relevant document id.
    """

    # Default location of the relevance judgements file
    FILENAME = "data/qrels.text"

    def parse_all(self, limit=None):
        """Return a dict mapping each query id to its relevant doc ids.

        :param limit: if given, stop reading as soon as a query id
            greater than *limit* is seen (the file is sorted by query id)
        :return: dict of ``int query_id -> list of int document ids``
        """
        docs = dict()
        with open(self.FILENAME) as f:
            for line in f:
                parsed = line.split()
                # Defensively skip blank or malformed lines
                if len(parsed) < 2:
                    continue
                query_id = int(parsed[0])
                if limit is not None and query_id > limit:
                    return docs
                doc_id = int(parsed[1])
                # Accumulate every relevant document for the query; the
                # original branch overwrote (and never created) the list.
                docs.setdefault(query_id, []).append(doc_id)
        return docs
......@@ -8,6 +8,8 @@ WEIGHTING_TYPE_TF_IDF = "tf-idf"
# TODO: Implement norm-tf-idf weight type and add it to weighting types
class VectorialQuery(Query):
......@@ -43,6 +43,15 @@ def build_cli_analyze_parser(root_parser):
help="can be any of {}".format(", ".join(commands.analyze_command.COMMANDS))
def build_cli_eval_parser(root_parser):
    """Register the 'eval' sub-command on *root_parser*.

    The command takes the number of queries to evaluate (``nrequests``)
    plus optional weighting flags that all store into ``weight_type``.
    """
    eval_parser = root_parser.add_parser('eval', description='evaluate for documents')
    eval_parser.add_argument('nrequests', nargs=1)

    # One store_const flag per supported weighting scheme, every flag
    # writing to the same 'weight_type' destination.
    weight_options = (
        ('--tf-idf', 'tf-idf', "use the tf-idf weight type"),
        ('--norm-tf-idf', 'norm-tf-idf', "use the normalized tf-idf weight type"),
        ('--norm-freq', 'norm-freq', "use the normalized frequency weight type"),
    )
    for flag, const_value, help_text in weight_options:
        eval_parser.add_argument(flag, action='store_const', const=const_value, dest='weight_type', help=help_text)
def build_cli_index_parser(root_parser):
......@@ -80,6 +89,7 @@ def build_cli_parser():
cli_subparser = cli_parser.add_subparsers(dest="main_command")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment