diff --git a/docs/_src/usage/usage/reader.md b/docs/_src/usage/usage/reader.md
index f764805ffb..2cd26d352a 100644
--- a/docs/_src/usage/usage/reader.md
+++ b/docs/_src/usage/usage/reader.md
@@ -247,7 +247,7 @@
 When printing the full results of a Reader,
 you will see that each prediction is
 accompanied by a value in the range of 0 to 1 reflecting the model's confidence in that prediction
-In the output of `print_answers()`, you will find the model confidence in dictionary key called `probability`.
+In the output of `print_answers()`, you will find the model confidence in the dictionary key called `confidence`.
 
 ```python
 from haystack.utils import print_answers
@@ -263,7 +263,7 @@ print_answers(prediction, details="all")
             'She travels with her father, Eddard, to '
             "King's Landing when he is made Hand of the "
             'King. Before she leaves,',
-        'probability': 0.9899835586547852,
+        'confidence': 0.9899835586547852,
         ...
     },
 ]
@@ -271,9 +271,14 @@
 ```
 
 In order to align this probability score with the model's accuracy, finetuning needs to be performed
-on a specific dataset. Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py)
-to see how this is done.
-Note that a finetuned confidence score is specific to the domain that its finetuned on.
+on a specific dataset.
+To this end, the reader has a method `calibrate_confidence_scores(document_store, device, label_index, doc_index, label_origin)`.
+The parameters of this method are the same as for the `eval()` method because the calibration of confidence scores is performed on a dataset that comes with gold labels.
+The calibration calls the `eval()` method internally and therefore needs a DocumentStore containing labeled questions and evaluation documents.
+
+Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py)
+to see how to compare calibrated confidence scores with uncalibrated confidence scores within FARM.
+Note that a finetuned confidence score is specific to the domain that it is finetuned on.
 There is no guarantee that this performance can transfer to a new domain.
 Having a confidence score is particularly useful in cases where you need Haystack
 to work with a certain accuracy threshold.
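Taken together with the `calibrate_confidence_scores()` method added in the source diff below, a minimal usage sketch could look as follows. This is an editor's illustration, not part of this PR: the document store choice, model name, and `device` value are assumptions, and the index arguments simply restate the method's defaults. It assumes the store has already been filled with evaluation documents and gold labels (for example via `add_eval_data()`).

```python
# Illustrative sketch (not part of this PR): calibrate a FARMReader's
# confidence scores against labeled evaluation data.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader

# Assumption: this store already contains evaluation documents and gold
# labels, e.g. loaded beforehand with document_store.add_eval_data().
document_store = ElasticsearchDocumentStore()
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# Runs eval() internally with calibrate_conf_scores=True and tunes the
# temperature used for scaling the confidence scores.
reader.calibrate_confidence_scores(
    document_store=document_store,
    device="cuda",
    label_index="label",
    doc_index="eval_document",
    label_origin="gold_label",
)
```

Afterwards, `reader.predict()` returns the same answers as before, but their `confidence` values should line up better with the accuracy observed on the evaluation domain.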
diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py
index 8f77d2cb23..03ddcef4ad 100644
--- a/haystack/reader/farm.py
+++ b/haystack/reader/farm.py
@@ -449,6 +449,7 @@ def eval(
         label_index: str = "label",
         doc_index: str = "eval_document",
         label_origin: str = "gold_label",
+        calibrate_conf_scores: bool = False
     ):
         """
         Performs evaluation on evaluation documents in the DocumentStore.
@@ -461,6 +462,8 @@
         :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
         :param label_index: Index/Table name where labeled questions are stored
         :param doc_index: Index/Table name where documents that are used for evaluation are stored
+        :param label_origin: Field name where the gold labels are stored
+        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
         """
 
         if self.top_k_per_candidate != 4:
@@ -545,7 +548,7 @@
 
         evaluator = Evaluator(data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device)
 
-        eval_results = evaluator.eval(self.inferencer.model)
+        eval_results = evaluator.eval(self.inferencer.model, calibrate_conf_scores=calibrate_conf_scores)
         toc = perf_counter()
         reader_time = toc - tic
         results = {
@@ -605,6 +608,30 @@ def _extract_answers_of_predictions(self, predictions: List[QAPred], top_k: Opti
 
         return answers, max_no_ans_gap
 
+    def calibrate_confidence_scores(
+            self,
+            document_store: BaseDocumentStore,
+            device: str,
+            label_index: str = "label",
+            doc_index: str = "eval_document",
+            label_origin: str = "gold_label"
+    ):
+        """
+        Calibrates confidence scores on evaluation documents in the DocumentStore.
+
+        :param document_store: DocumentStore containing the evaluation documents
+        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
+        :param label_index: Index/Table name where labeled questions are stored
+        :param doc_index: Index/Table name where documents that are used for evaluation are stored
+        :param label_origin: Field name where the gold labels are stored
+        """
+        self.eval(document_store=document_store,
+                  device=device,
+                  label_index=label_index,
+                  doc_index=doc_index,
+                  label_origin=label_origin,
+                  calibrate_conf_scores=True)
+
     @staticmethod
     def _get_pseudo_prob(score: float):
         return float(expit(np.asarray(score) / 8))
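For intuition on what is being calibrated: the `_get_pseudo_prob()` helper kept at the end of this diff squashes a raw answer score into the 0-to-1 confidence via a logistic sigmoid with a hard-coded divisor of 8, and the new `calibrate_conf_scores` flag asks FARM's `Evaluator` to tune that kind of temperature on gold-labeled data. The sketch below is a rough illustration of the mechanism, not FARM's actual implementation; the `temperature` parameter is a generalization of the hard-coded 8 introduced here for illustration only.

```python
import numpy as np
from scipy.special import expit  # logistic sigmoid, as used by _get_pseudo_prob()

def pseudo_prob(score: float, temperature: float = 8.0) -> float:
    # Mirrors _get_pseudo_prob(): squash a raw answer-span score into [0, 1].
    # Treating the divisor as a tunable temperature is illustrative only;
    # calibration would choose it so confidences match observed accuracy.
    return float(expit(np.asarray(score) / temperature))

print(pseudo_prob(12.0))                   # ~0.82 with the default divisor of 8
print(pseudo_prob(12.0, temperature=4.0))  # ~0.95: lower temperature, sharper confidence
```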