Add docu of confidence scores and calibration method #1131

Merged 5 commits on Jun 3, 2021

15 changes: 10 additions & 5 deletions docs/_src/usage/usage/reader.md
@@ -247,7 +247,7 @@ When printing the full results of a Reader,
you will see that each prediction is accompanied
by a value in the range of 0 to 1 reflecting the model's confidence in that prediction

-In the output of `print_answers()`, you will find the model confidence in dictionary key called `probability`.
+In the output of `print_answers()`, you will find the model confidence in a dictionary key called `confidence`.

```python
from haystack.utils import print_answers
@@ -263,17 +263,22 @@ print_answers(prediction, details="all")
'She travels with her father, Eddard, to '
"King's Landing when he is made Hand of the "
'King. Before she leaves,',
-'probability': 0.9899835586547852,
+'confidence': 0.9899835586547852,
...
},
]
}
```
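If you want to consume the value programmatically rather than pretty-print it, here is a minimal sketch; the `answers` and `confidence` keys follow the output above, while the `answer` key is an assumption about the full, uncollapsed dictionary:

```python
# Read the confidence of each returned answer directly from the prediction dict
# (structure as shown in the output excerpt above).
for answer in prediction["answers"]:
    print(answer["answer"], "->", round(answer["confidence"], 3))
```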

In order to align this probability score with the model's accuracy, finetuning needs to be performed
-on a specific dataset. Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py)
-to see how this is done.
-Note that a finetuned confidence score is specific to the domain that its finetuned on.
+on a specific dataset.
+To this end, the reader has a method `calibrate_confidence_scores(document_store, device, label_index, doc_index, label_origin)`.
+The parameters of this method are the same as for the `eval()` method because the calibration of confidence scores is performed on a dataset that comes with gold labels.
+The calibration calls the `eval()` method internally and therefore needs a DocumentStore containing labeled questions and evaluation documents.
+
+Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py)
+to see how to compare calibrated confidence scores with uncalibrated confidence scores within FARM.
+Note that a finetuned confidence score is specific to the domain that it is finetuned on.
There is no guarantee that this performance can transfer to a new domain.
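For illustration only, a sketch of how this calibration step could be wired up; the `ElasticsearchDocumentStore`, `FARMReader`, and `add_eval_data` usage reflects the Haystack API around the time of this PR and is an assumption, as are the file path and model name:

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader

# Document store holding evaluation documents plus gold labels,
# e.g. loaded from a SQuAD-style annotation file (hypothetical path).
document_store = ElasticsearchDocumentStore()
document_store.add_eval_data(
    filename="data/nq_dev_subset.json",
    doc_index="eval_document",
    label_index="label",
)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# Learns a temperature on the labeled data so that the reported confidence
# scores line up better with the observed answer accuracy.
reader.calibrate_confidence_scores(
    document_store=document_store,
    device="cuda",
    label_index="label",
    doc_index="eval_document",
    label_origin="gold_label",
)
```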

Having a confidence score is particularly useful in cases where you need Haystack to work with a certain accuracy threshold.
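For example, a hedged sketch of applying such a threshold to the prediction from above; the 0.8 cutoff is purely illustrative and should be tuned on your own labeled data:

```python
MIN_CONFIDENCE = 0.8  # illustrative cutoff, not a recommended default

confident_answers = [
    answer for answer in prediction["answers"]
    if answer["confidence"] >= MIN_CONFIDENCE
]

if not confident_answers:
    # Fall back to a "no answer" response or route the query to a human.
    print("No answer passed the confidence threshold.")
```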
29 changes: 28 additions & 1 deletion haystack/reader/farm.py
@@ -449,6 +449,7 @@ def eval(
label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold_label",
calibrate_conf_scores: bool = False
):
"""
Performs evaluation on evaluation documents in the DocumentStore.
@@ -461,6 +462,8 @@
:param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
:param label_index: Index/Table name where labeled questions are stored
:param doc_index: Index/Table name where documents that are used for evaluation are stored
:param label_origin: Field name where the gold labels are stored
:param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
"""

if self.top_k_per_candidate != 4:
@@ -543,7 +546,7 @@

evaluator = Evaluator(data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device)

-eval_results = evaluator.eval(self.inferencer.model)
+eval_results = evaluator.eval(self.inferencer.model, calibrate_conf_scores=calibrate_conf_scores)
toc = perf_counter()
reader_time = toc - tic
results = {
@@ -603,6 +606,30 @@ def _extract_answers_of_predictions(self, predictions: List[QAPred], top_k: Opti

return answers, max_no_ans_gap

def calibrate_confidence_scores(
self,
document_store: BaseDocumentStore,
device: str,
> Review comment (Member): Do we actually need the device here as a mandatory arg, or better to say: do we need it in `eval()`? The reader should already be placed on GPU or CPU from the init. We might only need device if we want to change it 🤔
>
> Reply (Member, Author): You are right. I added an issue #1137

label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold_label"
):
"""
Calibrates confidence scores on evaluation documents in the DocumentStore.

:param document_store: DocumentStore containing the evaluation documents
:param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
:param label_index: Index/Table name where labeled questions are stored
:param doc_index: Index/Table name where documents that are used for evaluation are stored
:param label_origin: Field name where the gold labels are stored
"""
self.eval(document_store=document_store,
device=device,
label_index=label_index,
doc_index=doc_index,
label_origin=label_origin,
calibrate_conf_scores=True)

@staticmethod
def _get_pseudo_prob(score: float):
return float(expit(np.asarray(score) / 8))
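For context, `_get_pseudo_prob` above squashes the raw score with a sigmoid after dividing by a fixed constant of 8; calibration replaces that constant with a temperature learned on labeled data. A schematic sketch of the idea, not FARM's actual implementation:

```python
import numpy as np
from scipy.special import expit  # logistic sigmoid, as used by _get_pseudo_prob


def pseudo_prob(score: float, temperature: float = 8.0) -> float:
    """Map a raw answer score to a (0, 1) confidence via a scaled sigmoid.

    temperature=8.0 mirrors the fixed scaling in _get_pseudo_prob; calibration
    would instead learn this temperature on labeled evaluation data
    (schematic description only).
    """
    return float(expit(np.asarray(score) / temperature))
```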