Fix document id missing in farm inference output #174

Merged: 1 commit, Jun 26, 2020
36 changes: 36 additions & 0 deletions haystack/finder.py
@@ -358,3 +358,39 @@ def eval(
         }

         return results
+
+    @staticmethod
+    def print_eval_results(finder_eval_results: Dict):
+        print("\n___Retriever Metrics in Finder___")
+        print(f"Retriever Recall            : {finder_eval_results['retriever_recall']:.3f}")
+        print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}")
+
+        # Reader is only evaluated with those questions, where the correct document is among the retrieved ones
+        print("\n___Reader Metrics in Finder___")
+        print("Top-k accuracy")
+        print(f"Reader Top-1 accuracy             : {finder_eval_results['reader_top1_accuracy']:.3f}")
+        print(f"Reader Top-1 accuracy (has answer): {finder_eval_results['reader_top1_accuracy_has_answer']:.3f}")
+        print(f"Reader Top-k accuracy             : {finder_eval_results['reader_top_k_accuracy']:.3f}")
+        print(f"Reader Top-k accuracy (has answer): {finder_eval_results['reader_topk_accuracy_has_answer']:.3f}")
+        print("Exact Match")
+        print(f"Reader Top-1 EM             : {finder_eval_results['reader_top1_em']:.3f}")
+        print(f"Reader Top-1 EM (has answer): {finder_eval_results['reader_top1_em_has_answer']:.3f}")
+        print(f"Reader Top-k EM             : {finder_eval_results['reader_topk_em']:.3f}")
+        print(f"Reader Top-k EM (has answer): {finder_eval_results['reader_topk_em_has_answer']:.3f}")
+        print("F1 score")
+        print(f"Reader Top-1 F1             : {finder_eval_results['reader_top1_f1']:.3f}")
+        print(f"Reader Top-1 F1 (has answer): {finder_eval_results['reader_top1_f1_has_answer']:.3f}")
+        print(f"Reader Top-k F1             : {finder_eval_results['reader_topk_f1']:.3f}")
+        print(f"Reader Top-k F1 (has answer): {finder_eval_results['reader_topk_f1_has_answer']:.3f}")
+        print("No Answer")
+        print(f"Reader Top-1 no-answer accuracy: {finder_eval_results['reader_top1_no_answer_accuracy']:.3f}")
+        print(f"Reader Top-k no-answer accuracy: {finder_eval_results['reader_topk_no_answer_accuracy']:.3f}")
+
+        # Time measurements
+        print("\n___Time Measurements___")
+        print(f"Total retrieve time           : {finder_eval_results['total_retrieve_time']:.3f}")
+        print(f"Avg retrieve time per question: {finder_eval_results['avg_retrieve_time']:.3f}")
+        print(f"Total reader time             : {finder_eval_results['total_reader_time']:.3f}")
+        print(f"Avg read time per question    : {finder_eval_results['avg_reader_time']:.3f}")
+        print(f"Total Finder time             : {finder_eval_results['total_finder_time']:.3f}")

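For context, the new helper only reads keys that Finder.eval() already returns. Below is a minimal usage sketch, assuming the usual from haystack import Finder import used in the tutorials; the metric values are hypothetical and only illustrate the expected shape of the dict, which in practice comes straight from finder.eval() as the updated tutorial further down shows.

from haystack import Finder  # assumption: same import as in the tutorials

# Hypothetical values, only to show which keys the printer reads.
finder_eval_results = {
    "retriever_recall": 0.95, "retriever_map": 0.81,
    "reader_top1_accuracy": 0.55, "reader_top1_accuracy_has_answer": 0.61,
    "reader_top_k_accuracy": 0.78, "reader_topk_accuracy_has_answer": 0.83,
    "reader_top1_em": 0.40, "reader_top1_em_has_answer": 0.44,
    "reader_topk_em": 0.58, "reader_topk_em_has_answer": 0.63,
    "reader_top1_f1": 0.52, "reader_top1_f1_has_answer": 0.57,
    "reader_topk_f1": 0.72, "reader_topk_f1_has_answer": 0.77,
    "reader_top1_no_answer_accuracy": 0.30, "reader_topk_no_answer_accuracy": 0.35,
    "total_retrieve_time": 12.4, "avg_retrieve_time": 0.12,
    "total_reader_time": 98.7, "avg_reader_time": 0.95,
    "total_finder_time": 111.1,
}

Finder.print_eval_results(finder_eval_results)  # static method, so no Finder instance is required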
29 changes: 15 additions & 14 deletions haystack/reader/farm.py
@@ -250,28 +250,29 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         answers = []
         no_ans_gaps = []
         best_score_answer = 0
-        for pred in predictions:
+        # TODO once FARM returns doc ids again we can revert to using them inside the preds and remove
+        for pred, inp in zip(predictions, input_dicts):
             answers_per_document = []
             no_ans_gaps.append(pred["predictions"][0]["no_ans_gap"])
-            for a in pred["predictions"][0]["answers"]:
+            for ans in pred["predictions"][0]["answers"]:
                 # skip "no answers" here
-                if self._check_no_answer(d=a):
+                if self._check_no_answer(ans):
                     pass
                 else:
-                    cur = {"answer": a["answer"],
-                           "score": a["score"],
+                    cur = {"answer": ans["answer"],
+                           "score": ans["score"],
                            # just a pseudo prob for now
-                           "probability": float(expit(np.asarray([a["score"]]) / 8)),  # type: ignore
-                           "context": a["context"],
-                           "offset_start": a["offset_answer_start"] - a["offset_context_start"],
-                           "offset_end": a["offset_answer_end"] - a["offset_context_start"],
-                           "offset_start_in_doc": a["offset_answer_start"],
-                           "offset_end_in_doc": a["offset_answer_end"],
-                           "document_id": a["document_id"]}
+                           "probability": float(expit(np.asarray([ans["score"]]) / 8)),  # type: ignore
+                           "context": ans["context"],
+                           "offset_start": ans["offset_answer_start"] - ans["offset_context_start"],
+                           "offset_end": ans["offset_answer_end"] - ans["offset_context_start"],
+                           "offset_start_in_doc": ans["offset_answer_start"],
+                           "offset_end_in_doc": ans["offset_answer_end"],
+                           "document_id": inp["document_id"]}  # TODO revert to ans["docid"] once it is populated
                     answers_per_document.append(cur)

-                    if a["score"] > best_score_answer:
-                        best_score_answer = a["score"]
+                    if ans["score"] > best_score_answer:
+                        best_score_answer = ans["score"]
             # only take n best candidates. Answers coming back from FARM are sorted with decreasing relevance.
             answers += answers_per_document[:self.top_k_per_candidate]

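The substantive change in reader/farm.py is that FARM's answer dicts currently come back without a usable document id, so the reader re-attaches it from the input dict that produced each prediction (see the TODOs above). A self-contained sketch of that pairing, using made-up stand-ins for FARM's output rather than real predictions; it relies on the same assumption as the patch, namely that predictions are returned in the same order as the inputs:

# Made-up stand-ins for FARM's per-document predictions and the inputs that produced them.
predictions = [
    {"predictions": [{"answers": [{"answer": "Berlin", "score": 11.2}], "no_ans_gap": 0.4}]},
    {"predictions": [{"answers": [{"answer": "1990", "score": 7.8}], "no_ans_gap": 1.1}]},
]
input_dicts = [
    {"document_id": "doc-1"},
    {"document_id": "doc-2"},
]

# Zipping the two lists lets each answer inherit the id of the document it was extracted from.
answers = []
for pred, inp in zip(predictions, input_dicts):
    for ans in pred["predictions"][0]["answers"]:
        answers.append({"answer": ans["answer"], "document_id": inp["document_id"]})

print(answers)  # [{'answer': 'Berlin', 'document_id': 'doc-1'}, {'answer': '1990', 'document_id': 'doc-2'}]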
90 changes: 41 additions & 49 deletions tutorials/Tutorial5_Evaluation.py
@@ -9,9 +9,21 @@
 import subprocess
 import time

-LAUNCH_ELASTICSEARCH = False
-device, n_gpu = initialize_device_settings(use_cuda=True)
 logger = logging.getLogger(__name__)

+##############################################
+# Settings
+##############################################
+LAUNCH_ELASTICSEARCH = True
+
+eval_retriever_only = False
+eval_reader_only = False
+eval_both = True
+
+##############################################
+# Code
+##############################################
+device, n_gpu = initialize_device_settings(use_cuda=True)
 # Start an Elasticsearch server
 # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
 # your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
@@ -33,7 +45,11 @@
 # Connect to Elasticsearch
 document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False)
 # Add evaluation data to Elasticsearch database
-document_store.add_eval_data("../data/nq/nq_dev_subset.json")
+if LAUNCH_ELASTICSEARCH:
+    document_store.add_eval_data("../data/nq/nq_dev_subset.json")
+else:
+    logger.warning("Since we already have a running ES instance we should not index the same documents again. "
+                   "If you still want to do this, call 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually.")

 # Initialize Retriever
 retriever = ElasticsearchRetriever(document_store=document_store)
@@ -44,55 +60,31 @@
 # Initialize Finder which sticks together Reader and Retriever
 finder = Finder(reader, retriever)

-# Evaluate Retriever on its own
-retriever_eval_results = retriever.eval()
-## Retriever Recall is the proportion of questions for which the correct document containing the answer is
-## among the correct documents
-print("Retriever Recall:", retriever_eval_results["recall"])
-## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
-print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

+## Evaluate Retriever on its own
+if eval_retriever_only:
+    retriever_eval_results = retriever.eval()
+    ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
+    ## among the correct documents
+    print("Retriever Recall:", retriever_eval_results["recall"])
+    ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
+    print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

 # Evaluate Reader on its own
-reader_eval_results = reader.eval(document_store=document_store, device=device)
-# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
-#reader_eval_results = reader.eval_on_file("../data/natural_questions", "dev_subset.json", device=device)
+if eval_reader_only:
+    reader_eval_results = reader.eval(document_store=document_store, device=device)
+    # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
+    #reader_eval_results = reader.eval_on_file("../data/natural_questions", "dev_subset.json", device=device)

-## Reader Top-N-Recall is the proportion of predicted answers that overlap with their corresponding correct answer
-print("Reader Top-N-Recall:", reader_eval_results["top_n_recall"])
-## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
-print("Reader Exact Match:", reader_eval_results["EM"])
-## Reader F1-Score is the average overlap between the predicted answers and the correct answers
-print("Reader F1-Score:", reader_eval_results["f1"])
+    ## Reader Top-N-Recall is the proportion of predicted answers that overlap with their corresponding correct answer
+    print("Reader Top-N-Recall:", reader_eval_results["top_n_recall"])
+    ## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
+    print("Reader Exact Match:", reader_eval_results["EM"])
+    ## Reader F1-Score is the average overlap between the predicted answers and the correct answers
+    print("Reader F1-Score:", reader_eval_results["f1"])


 # Evaluate combination of Reader and Retriever through Finder
-finder_eval_results = finder.eval()
-
-print("\n___Retriever Metrics in Finder___")
-print("Retriever Recall:", finder_eval_results["retriever_recall"])
-print("Retriever Mean Avg Precision:", finder_eval_results["retriever_map"])
-
-# Reader is only evaluated with those questions, where the correct document is among the retrieved ones
-print("\n___Reader Metrics in Finder___")
-print("Reader Top-1 accuracy:", finder_eval_results["reader_top1_accuracy"])
-print("Reader Top-1 accuracy (has answer):", finder_eval_results["reader_top1_accuracy_has_answer"])
-print("Reader Top-k accuracy:", finder_eval_results["reader_top_k_accuracy"])
-print("Reader Top-k accuracy (has answer):", finder_eval_results["reader_topk_accuracy_has_answer"])
-print("Reader Top-1 EM:", finder_eval_results["reader_top1_em"])
-print("Reader Top-1 EM (has answer):", finder_eval_results["reader_top1_em_has_answer"])
-print("Reader Top-k EM:", finder_eval_results["reader_topk_em"])
-print("Reader Top-k EM (has answer):", finder_eval_results["reader_topk_em_has_answer"])
-print("Reader Top-1 F1:", finder_eval_results["reader_top1_f1"])
-print("Reader Top-1 F1 (has answer):", finder_eval_results["reader_top1_f1_has_answer"])
-print("Reader Top-k F1:", finder_eval_results["reader_topk_f1"])
-print("Reader Top-k F1 (has answer):", finder_eval_results["reader_topk_f1_has_answer"])
-print("Reader Top-1 no-answer accuracy:", finder_eval_results["reader_top1_no_answer_accuracy"])
-print("Reader Top-k no-answer accuracy:", finder_eval_results["reader_topk_no_answer_accuracy"])
-
-# Time measurements
-print("\n___Time Measurements___")
-print("Total retrieve time:", finder_eval_results["total_retrieve_time"])
-print("Avg retrieve time per question:", finder_eval_results["avg_retrieve_time"])
-print("Total reader timer:", finder_eval_results["total_reader_time"])
-print("Avg read time per question:", finder_eval_results["avg_reader_time"])
-print("Total Finder time:", finder_eval_results["total_finder_time"])
+if eval_both:
+    finder_eval_results = finder.eval(top_k_retriever=10, top_k_reader=10)
+    finder.print_eval_results(finder_eval_results)
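To make the two retriever numbers printed by the tutorial concrete: with one relevant document per question (as in this SQuAD-style eval set), average precision reduces to the reciprocal of the rank at which the correct document is retrieved. The toy calculation below uses invented ranks; whether retriever.eval() computes MAP in exactly this way is not shown in this diff, so treat it purely as an illustration of the metrics.

# Invented ranks of the correct document for four questions (None = not retrieved within top-k).
ranks = [1, 3, None, 2]

# Recall: share of questions whose correct document was retrieved at all.
recall = sum(r is not None for r in ranks) / len(ranks)

# Mean Avg Precision with a single relevant document per question: mean of 1/rank, counting 0 when missed.
mean_avg_precision = sum(1 / r if r is not None else 0 for r in ranks) / len(ranks)

print(f"Retriever Recall            : {recall:.3f}")              # 0.750
print(f"Retriever Mean Avg Precision: {mean_avg_precision:.3f}")  # 0.458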