From d577189f04c7f3d7746eacb853427b7db3a1d128 Mon Sep 17 00:00:00 2001
From: Adinda de Wit <adinda.maite.de.wit@cern.ch>
Date: Mon, 4 Apr 2022 10:20:45 +0200
Subject: [PATCH 1/3] Address rare case of NaN test statistics for AD GoF

---
 docs/part3/commonstatsmethods.md | 2 +-
 src/GoodnessOfFit.cc             | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/part3/commonstatsmethods.md b/docs/part3/commonstatsmethods.md
index 085cfd24a15..080fcd6d264 100644
--- a/docs/part3/commonstatsmethods.md
+++ b/docs/part3/commonstatsmethods.md
@@ -659,7 +659,7 @@ The following algorithms are supported:
 
 - **`AD`**: Compute a goodness-of-fit measure for binned fits using the *Anderson-Darling* test. It is based on the integral of the difference between the cumulative distribution function and the empirical distribution function over all bins. It also gives the tail ends of the distribution a higher weighting.
 
-The output tree will contain a branch called **`limit`** which contains the value of the test-statistic in each toy. You can make a histogram of this test-statistic $t$ and from this distribution ($f(t)$) and the single value obtained in the data ($t_{0}$) you can calculate the p-value $$p = \int_{t=t_{0}}^{\mathrm{+inf}} f(t) dt $$.
+The output tree will contain a branch called **`limit`** which contains the value of the test-statistic in each toy. You can make a histogram of this test-statistic $t$ and from this distribution ($f(t)$) and the single value obtained in the data ($t_{0}$) you can calculate the p-value $$p = \int_{t=t_{0}}^{\mathrm{+inf}} f(t) dt $$. Note: in rare cases where the test statistic value for the toys would end up being NaN, we set the test statistic value to -1. When plotting the test statistic distribution, such toys should be excluded. This is automatically taken care of if you use the GoF collection script in CombineHarvester described below.
 
 When generating toys, the default behavior will be used. See the section on [toy generation](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part3/runningthetool/#toy-data-generation) for options on how to generate/fit nuisance parameters in these tests. It is recomended to use the *frequentist toys* (`--toysFreq`) when running the **`saturated`** model, and the default toys for the other two tests.
 
diff --git a/src/GoodnessOfFit.cc b/src/GoodnessOfFit.cc
index 30f71af6e95..fe2555366a2 100644
--- a/src/GoodnessOfFit.cc
+++ b/src/GoodnessOfFit.cc
@@ -353,7 +353,11 @@ Double_t GoodnessOfFit::EvaluateADDistance(RooAbsPdf& pdf, RooAbsData& data, Roo
         observable.setVal(observableval);
         // observable.bin
         current_cdf_val = cdf->getVal();
-        empirical_df += d->second/s_data;
+        if (d->second==0 && s_data ==0){
+          empirical_df = -1.;
+        } else {
+          empirical_df += d->second/s_data;
+        }
 
         if (plotDir_ && makePlots_) {
           hCdf->SetBinContent(bin+1, current_cdf_val);
@@ -366,6 +370,7 @@ Double_t GoodnessOfFit::EvaluateADDistance(RooAbsPdf& pdf, RooAbsData& data, Roo
               std::cout << "Observable: " << observableval << "\tdata: " << d->second << "\tedf: " << empirical_df << "\tcdf: " << current_cdf_val << "\tdistance: " << distance << "\n";
             }
             if (distance > test_stat) test_stat = distance;
+            if (empirical_df < 0. ) test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
         }else{
             bin_prob = current_cdf_val-last_cdf_val;
             distance = s_data*pow((empirical_df-current_cdf_val), 2)/current_cdf_val/(1.-current_cdf_val)*bin_prob;
@@ -377,6 +382,7 @@ Double_t GoodnessOfFit::EvaluateADDistance(RooAbsPdf& pdf, RooAbsData& data, Roo
             }
             // from L. Demortier, CDF/ANAL/JET/CDFR/3419
             test_stat += distance;
+            if(empirical_df < 0. ) test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
         }
         if (plotDir_ && makePlots_) {
           hDiff->SetBinContent(bin+1, distance);

From 0c9ec13cb3585f794bcdc36a8c46e8c59d7bce59 Mon Sep 17 00:00:00 2001
From: Adinda de Wit <adinda.maite.de.wit@cern.ch>
Date: Wed, 6 Apr 2022 11:59:59 +0200
Subject: [PATCH 2/3] Update comment about toys with -ve GoF test statistic for
 AD and KS

---
 docs/part3/commonstatsmethods.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/part3/commonstatsmethods.md b/docs/part3/commonstatsmethods.md
index 080fcd6d264..7237c042893 100644
--- a/docs/part3/commonstatsmethods.md
+++ b/docs/part3/commonstatsmethods.md
@@ -659,7 +659,7 @@ The following algorithms are supported:
 
 - **`AD`**: Compute a goodness-of-fit measure for binned fits using the *Anderson-Darling* test. It is based on the integral of the difference between the cumulative distribution function and the empirical distribution function over all bins. It also gives the tail ends of the distribution a higher weighting.
 
-The output tree will contain a branch called **`limit`** which contains the value of the test-statistic in each toy. You can make a histogram of this test-statistic $t$ and from this distribution ($f(t)$) and the single value obtained in the data ($t_{0}$) you can calculate the p-value $$p = \int_{t=t_{0}}^{\mathrm{+inf}} f(t) dt $$. Note: in rare cases where the test statistic value for the toys would end up being NaN, we set the test statistic value to -1. When plotting the test statistic distribution, such toys should be excluded. This is automatically taken care of if you use the GoF collection script in CombineHarvester described below.
+The output tree will contain a branch called **`limit`** which contains the value of the test-statistic in each toy. You can make a histogram of this test-statistic $t$ and from this distribution ($f(t)$) and the single value obtained in the data ($t_{0}$) you can calculate the p-value $$p = \int_{t=t_{0}}^{\mathrm{+inf}} f(t) dt $$. Note: in rare cases the test statistic value for the toys can be undefined (for AD and KS), and in this case we set the test statistic value to -1. When plotting the test statistic distribution, those toys should be excluded. This is automatically taken care of if you use the GoF collection script in CombineHarvester described below.
 
 When generating toys, the default behavior will be used. See the section on [toy generation](http://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/part3/runningthetool/#toy-data-generation) for options on how to generate/fit nuisance parameters in these tests. It is recomended to use the *frequentist toys* (`--toysFreq`) when running the **`saturated`** model, and the default toys for the other two tests.
 

From 750d148dd7d77ca8e39a34b44e27c33bbb5dd212 Mon Sep 17 00:00:00 2001
From: Adinda de Wit <adinda.maite.de.wit@cern.ch>
Date: Wed, 6 Apr 2022 13:47:19 +0200
Subject: [PATCH 3/3] Print warning when KS/AD test statistic is undefined and
 therefore set to -1

---
 src/GoodnessOfFit.cc | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/GoodnessOfFit.cc b/src/GoodnessOfFit.cc
index fe2555366a2..6a355b7d173 100644
--- a/src/GoodnessOfFit.cc
+++ b/src/GoodnessOfFit.cc
@@ -370,7 +370,12 @@ Double_t GoodnessOfFit::EvaluateADDistance(RooAbsPdf& pdf, RooAbsData& data, Roo
               std::cout << "Observable: " << observableval << "\tdata: " << d->second << "\tedf: " << empirical_df << "\tcdf: " << current_cdf_val << "\tdistance: " << distance << "\n";
             }
             if (distance > test_stat) test_stat = distance;
-            if (empirical_df < 0. ) test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
+            if (empirical_df < 0.){
+              if(bin<1){
+                std::cout << "Warning, KS statistic not well defined in absence of data events. Setting test statistic to -1\n";
+              }
+              test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
+            }
         }else{
             bin_prob = current_cdf_val-last_cdf_val;
             distance = s_data*pow((empirical_df-current_cdf_val), 2)/current_cdf_val/(1.-current_cdf_val)*bin_prob;
@@ -382,7 +387,12 @@ Double_t GoodnessOfFit::EvaluateADDistance(RooAbsPdf& pdf, RooAbsData& data, Roo
             }
             // from L. Demortier, CDF/ANAL/JET/CDFR/3419
             test_stat += distance;
-            if(empirical_df < 0. ) test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
+            if(empirical_df < 0.){
+              if(bin<1){
+                std::cout << "Warning, AD statistic not well defined in absence of data events. Setting test statistic to -1\n";
+              }
+              test_stat = empirical_df; //To set negative test stat in case the sum of data entries is 0.
+            }
         }
         if (plotDir_ && makePlots_) {
           hDiff->SetBinContent(bin+1, distance);