introduce node_input param #1854

Merged · 4 commits · Dec 14, 2021
18 changes: 16 additions & 2 deletions docs/_src/api/api/primitives.md
@@ -293,7 +293,7 @@ The DataFrames have the following schema:
#### calculate\_metrics

```python
| calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match") -> Dict[str, Dict[str, float]]
| calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match", node_input: str = "prediction") -> Dict[str, Dict[str, float]]
```

Calculates proper metrics for each node.
@@ -323,12 +323,19 @@ as there are situations the result can heavily differ from an actual eval run wi
remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
- `doc_relevance_col`: column in the underlying eval table that contains the relevance criteria for documents.
values can be: 'gold_id_match', 'answer_match', 'gold_id_or_answer_match'
- `node_input`: the input on which the node was evaluated.
Usually nodes are evaluated on the predictions provided by their predecessor nodes in the pipeline (value='prediction').
However, as the quality of a node can heavily depend on its input and thus on the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound for the quality of your node.
For example, when evaluating the reader, use value='label' to simulate a perfect retriever in an ExtractiveQAPipeline.
Values can be 'prediction' or 'label'.
Default value is 'prediction' (see the sketch below).
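
A minimal usage sketch of this parameter follows; it assumes `eval_result` is an `EvaluationResult` returned by `Pipeline.eval()` and that the pipeline has a node named `Reader` (both names are placeholders):

```python
# Default: each node is evaluated on the predictions of its predecessor (node_input="prediction").
metrics = eval_result.calculate_metrics()

# Simulate a perfect predecessor by evaluating each node on the labels instead.
upper_bound_metrics = eval_result.calculate_metrics(node_input="label")

# Compare e.g. the reader's f1 in both settings (assuming a node named "Reader").
print(metrics["Reader"]["f1"], upper_bound_metrics["Reader"]["f1"])
```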

<a name="schema.EvaluationResult.wrong_examples"></a>
#### wrong\_examples

```python
| wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match", document_metric: str = "recall_single_hit", answer_metric: str = "f1") -> List[Dict]
| wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match", document_metric: str = "recall_single_hit", answer_metric: str = "f1", node_input: str = "prediction") -> List[Dict]
```

Returns the worst performing queries.
@@ -349,6 +356,13 @@ See calculate_metrics() for more information.
values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
- `answer_metric`: the answer metric worst queries are calculated with.
values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
- `node_input`: the input on which the node was evaluated.
Usually nodes are evaluated on the predictions provided by their predecessor nodes in the pipeline (value='prediction').
However, as the quality of a node can heavily depend on its input and thus on the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound for the quality of your node.
For example, when evaluating the reader, use value='label' to simulate a perfect retriever in an ExtractiveQAPipeline.
Values can be 'prediction' or 'label'.
Default value is 'prediction' (see the sketch below).
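
Similarly, a short sketch of combining `node_input` with `wrong_examples()`; `eval_result` and the node name `Reader` are placeholders as above:

```python
# Worst performing reader queries when the reader is fed simulated perfect retriever output.
worst_queries = eval_result.wrong_examples(node="Reader", n=5, node_input="label")
for example in worst_queries:
    print(example)  # each entry is a dict describing one of the worst performing queries
```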

<a name="schema.EvaluationResult.save"></a>
#### save
4 changes: 3 additions & 1 deletion docs/_src/api/api/ranker.md
@@ -64,7 +64,9 @@ class SentenceTransformersRanker(BaseRanker)
Sentence Transformer based pre-trained Cross-Encoder model for Document Re-ranking (https://huggingface.co/cross-encoder).
Re-Ranking can be used on top of a retriever to boost the performance of document search. This is particularly useful if the retriever has high recall but does a poor job of sorting the documents by relevance.

SentenceTransformerRanker handles Cross-Encoder models that use a single logit as similarity score.
SentenceTransformersRanker handles Cross-Encoder models that
- use a single logit as similarity score, e.g. cross-encoder/ms-marco-MiniLM-L-12-v2
- use two output logits (no_answer, has_answer), e.g. deepset/gbert-base-germandpr-reranking
https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers
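
As a brief illustration, both model families listed above can be loaded the same way (the import path below assumes Haystack 1.x and is shown as a sketch):

```python
from haystack.nodes import SentenceTransformersRanker

# Cross-Encoder that outputs a single similarity logit per query-document pair
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

# Cross-Encoder that outputs two logits (no_answer, has_answer)
german_ranker = SentenceTransformersRanker(model_name_or_path="deepset/gbert-base-germandpr-reranking")
```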

| With a SentenceTransformersRanker, you can:
1 change: 1 addition & 0 deletions haystack/pipelines/base.py
@@ -505,6 +505,7 @@ def _build_eval_dataframe(self,
# add general info
df["node"] = node_name
df["query"] = query
df["node_input"] = "prediction"

return df

36 changes: 32 additions & 4 deletions haystack/schema.py
@@ -638,7 +638,8 @@ def calculate_metrics(
self,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
doc_relevance_col: str = "gold_id_match"
doc_relevance_col: str = "gold_id_match",
node_input: str = "prediction"
) -> Dict[str, Dict[str, float]]:
"""
Calculates proper metrics for each node.
@@ -666,11 +667,19 @@ def calculate_metrics(
remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
:param doc_relevance_col: column in the underlying eval table that contains the relevance criteria for documents.
values can be: 'gold_id_match', 'answer_match', 'gold_id_or_answer_match'
:param node_input: the input on which the node was evaluated.
Usually nodes are evaluated on the predictions provided by their predecessor nodes in the pipeline (value='prediction').
However, as the quality of a node can heavily depend on its input and thus on the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound for the quality of your node.
For example, when evaluating the reader, use value='label' to simulate a perfect retriever in an ExtractiveQAPipeline.
Values can be 'prediction' or 'label'.
Default value is 'prediction'.
"""
return {node: self._calculate_node_metrics(df,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever,
doc_relevance_col=doc_relevance_col)
doc_relevance_col=doc_relevance_col,
node_input=node_input)
for node, df in self.node_results.items()}

def wrong_examples(
@@ -681,7 +690,8 @@ def wrong_examples(
simulated_top_k_retriever: int = -1,
doc_relevance_col: str = "gold_id_match",
document_metric: str = "recall_single_hit",
answer_metric: str = "f1"
answer_metric: str = "f1",
node_input: str = "prediction"
) -> List[Dict]:
"""
Returns the worst performing queries.
@@ -700,8 +710,16 @@ def wrong_examples(
values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
:param answer_metric: the answer metric worst queries are calculated with.
values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
:param node_input: the input on which the node was evaluated.
Usually nodes are evaluated on the predictions provided by their predecessor nodes in the pipeline (value='prediction').
However, as the quality of a node can heavily depend on its input and thus on the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound for the quality of your node.
For example, when evaluating the reader, use value='label' to simulate a perfect retriever in an ExtractiveQAPipeline.
Values can be 'prediction' or 'label'.
Default value is 'prediction'.
"""
node_df = self.node_results[node]
node_df = self._filter_node_input(node_df, node_input)

answers = node_df[node_df["type"] == "answer"]
if len(answers) > 0:
@@ -752,8 +770,11 @@ def _calculate_node_metrics(
df: pd.DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
doc_relevance_col: str = "gold_id_match"
doc_relevance_col: str = "gold_id_match",
node_input: str = "prediction"
) -> Dict[str, float]:
df = self._filter_node_input(df, node_input)

answer_metrics = self._calculate_answer_metrics(df,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever)
@@ -764,6 +785,13 @@

return {**answer_metrics, **document_metrics}

def _filter_node_input(self, df: pd.DataFrame, node_input: str) -> pd.DataFrame:
if "node_input" in df.columns:
df = df[df["node_input"] == node_input]
else:
logger.warning("eval dataframe has no node_input column. node_input param will be ignored.")
return df

def _calculate_answer_metrics(
self,
df: pd.DataFrame,