Merge pull request #91 from e10v/dev
Add family-wise error rate (FWER) for multiple comparison problem
e10v committed Sep 7, 2024
2 parents 6da4441 + 0b5794c commit 50eac0f
Showing 3 changed files with 213 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/tea_tasting/__init__.py
@@ -21,5 +21,5 @@
from tea_tasting.datasets import make_sessions_data, make_users_data
from tea_tasting.experiment import Experiment
from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio
from tea_tasting.multiplicity import adjust_fdr
from tea_tasting.multiplicity import adjust_fdr, adjust_fwer
from tea_tasting.version import __version__
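
For orientation, here is a minimal usage sketch of the newly exported `adjust_fwer`. It is a sketch, not part of the commit: the metric names are hypothetical, the data helper and `Experiment` API come from the imports above, and the defaults follow the new function's signature shown below.

```python
import tea_tasting as tt

# Toy experiment with two metrics, using the sample data helper exported above.
data = tt.make_users_data(seed=42)
experiment = tt.Experiment(
    orders_per_user=tt.Mean("orders"),
    revenue_per_user=tt.Mean("revenue"),
)
result = experiment.analyze(data)

# Adjust p-values and alphas to control the FWER across both metrics.
# Defaults per the new signature: Šidák correction, Holm's step-down procedure.
print(tt.adjust_fwer(result))
```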
108 changes: 105 additions & 3 deletions src/tea_tasting/multiplicity.py
@@ -13,6 +13,7 @@

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence
    from typing import Literal


NO_NAME_COMPARISON = "-"
@@ -82,7 +83,7 @@ def adjust_fdr(
            and Benjamini-Hochberg procedure is performed.

    Returns:
        The experiment results with adjusted p-values and alpha.
        The experiment results with adjusted p-values and alphas.
    """
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
@@ -100,7 +101,64 @@
        arbitrary_dependence=arbitrary_dependence,
    )
    # In-place update.
    _adjust_stepup(metric_results, method.adjust)
    _hochberg_stepup(metric_results, method.adjust)

    return MultipleComparisonsResults(results)


def adjust_fwer(
    experiment_results: tea_tasting.experiment.ExperimentResult | dict[
        Any, tea_tasting.experiment.ExperimentResult],
    metrics: str | set[str] | Sequence[str] | None = None,
    *,
    alpha: float | None = None,
    method: Literal["sidak", "bonferroni"] = "sidak",
    arbitrary_dependence: bool = True,
) -> MultipleComparisonsResults:
    """Adjust p-value and alpha to control the family-wise error rate (FWER).

    The number of hypotheses tested is the total number of metrics included in
    the comparison in all experiment results. For example, if there are
    3 experiments with 2 metrics in each, the number of hypotheses is 6.

    The function performs one of the following procedures, depending on parameters:

    - Holm's step-down procedure, assuming arbitrary dependence between
        hypotheses (`arbitrary_dependence=True`).
    - Hochberg's step-up procedure, assuming non-negative correlation between
        hypotheses (`arbitrary_dependence=False`).

    Args:
        experiment_results: Experiment results.
        metrics: Metrics included in the comparison.
            If `None`, all metrics are included.
        alpha: Significance level. If `None`, the value from global settings is used.
        method: Correction method, Šidák (`"sidak"`) or Bonferroni (`"bonferroni"`).
        arbitrary_dependence: If `True`, arbitrary dependence between hypotheses
            is assumed and Holm's step-down procedure is performed.
            If `False`, non-negative correlation between hypotheses is assumed
            and Hochberg's step-up procedure is performed.

    Returns:
        The experiment results with adjusted p-values and alphas.
    """
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
        if alpha is not None
        else tea_tasting.config.get_config("alpha")
    )
    method = tea_tasting.utils.check_scalar(
        method, "method", typ=str, in_={"sidak", "bonferroni"})
    arbitrary_dependence = tea_tasting.utils.check_scalar(
        arbitrary_dependence, "arbitrary_dependence", typ=bool)

    # results and metric_results refer to the same dicts.
    results, metric_results = _copy_results(experiment_results, metrics)
    method_cls = _Sidak if method == "sidak" else _Bonferroni
    method_ = method_cls(alpha=alpha, m=len(metric_results))  # type: ignore
    procedure = _holm_stepdown if arbitrary_dependence else _hochberg_stepup
    # In-place update.
    procedure(metric_results, method_.adjust)

    return MultipleComparisonsResults(results)

@@ -141,7 +199,7 @@ def _copy_results(
    return copy_of_experiment_results, copy_of_metric_results


def _adjust_stepup(
def _hochberg_stepup(
    metric_results: Sequence[dict[str, Any]],
    adjust: Callable[[float, int], tuple[float, float]],
) -> None:
@@ -165,6 +223,30 @@ def _adjust_stepup(
        k -= 1


def _holm_stepdown(
    metric_results: Sequence[dict[str, Any]],
    adjust: Callable[[float, int], tuple[float, float]],
) -> None:
    pvalue_adj_min = 0
    alpha_adj_max = 1
    k = 1
    for metric_result in sorted(metric_results, key=lambda d: d["pvalue"]):
        pvalue = metric_result["pvalue"]
        pvalue_adj, alpha_adj = adjust(pvalue, k)

        if alpha_adj_max == 1 and pvalue > alpha_adj:
            alpha_adj_max = alpha_adj
        alpha_adj = min(alpha_adj, alpha_adj_max)
        pvalue_adj = pvalue_adj_min = min(max(pvalue_adj, pvalue_adj_min), 1)

        metric_result.update(
            pvalue_adj=pvalue_adj,
            alpha_adj=alpha_adj,
            null_rejected=int(pvalue <= alpha_adj),
        )
        k += 1


class _Adjustment(abc.ABC):
    @abc.abstractmethod
    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
@@ -188,3 +270,23 @@ def __init__(self,
    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = k / self.denom_
        return pvalue / coef, self.alpha * coef


class _Bonferroni(_Adjustment):
    def __init__(self, alpha: float, m: int) -> None:
        self.alpha = alpha
        self.m = m

    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = self.m - k + 1
        return pvalue * coef, self.alpha / coef


class _Sidak(_Adjustment):
    def __init__(self, alpha: float, m: int) -> None:
        self.alpha = alpha
        self.m = m

    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = self.m - k + 1
        return 1 - (1 - pvalue)**coef, 1 - (1 - self.alpha)**(1 / coef)
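
To make the step-down logic concrete, here is a self-contained sketch (the three p-values are invented for illustration) that replays `_holm_stepdown` with the `_Bonferroni` adjustment defined above:

```python
# Holm's step-down with the Bonferroni adjustment, mirroring the helpers above.
pvalues = [0.02, 0.01, 0.04]  # invented p-values for three hypotheses
alpha, m = 0.05, len(pvalues)

pvalue_adj_min, alpha_adj_max = 0.0, 1.0
for k, pvalue in enumerate(sorted(pvalues), start=1):
    coef = m - k + 1  # m for the smallest p-value, down to 1 for the largest
    pvalue_adj = min(max(pvalue * coef, pvalue_adj_min), 1)  # keep adjusted p-values monotone
    alpha_adj = alpha / coef
    if alpha_adj_max == 1.0 and pvalue > alpha_adj:
        alpha_adj_max = alpha_adj  # first non-rejection caps all later thresholds
    alpha_adj = min(alpha_adj, alpha_adj_max)
    pvalue_adj_min = pvalue_adj
    print(k, pvalue, round(pvalue_adj, 4), round(alpha_adj, 4), int(pvalue <= alpha_adj))
# All three nulls are rejected: 0.01 <= 0.05/3, 0.02 <= 0.05/2, 0.04 <= 0.05/1.
```

Swapping in the Šidák rule replaces `pvalue * coef` with `1 - (1 - pvalue)**coef` and `alpha / coef` with `1 - (1 - alpha)**(1 / coef)`, as in `_Sidak`.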
108 changes: 107 additions & 1 deletion tests/test_multiplicity.py
@@ -76,7 +76,7 @@ def test_adjust_fdr_default(
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore

def test_adjust_fdr_arbitrary_dependence(
def test_adjust_fdr_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fdr(
@@ -127,3 +127,109 @@ def test_adjust_fdr_single_metric(
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.03333333333333333)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore


def test_adjust_fwer_default(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(experiment_results)
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0085124446108471)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0102062183130115)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        arbitrary_dependence=False,
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0253205655191037)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_bonferroni(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        method="bonferroni",
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.00833333333333333)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.01)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_bonferroni_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        method="bonferroni",
        arbitrary_dependence=False,
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.025)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
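
As a cross-check, the first expected value in each FWER test can be reproduced by hand. The fixture's raw p-values are not shown in this diff, but the expectations are consistent with a smallest raw p-value of 0.005 across m = 6 hypotheses:

```python
# Inferred, not shown in the diff: the smallest raw p-value appears to be 0.005.
m, p = 6, 0.005
print(p * m)           # 0.03 -> test_adjust_fwer_bonferroni (Holm's first step)
print(1 - (1 - p)**m)  # 0.0296274906437343 -> test_adjust_fwer_default (Šidák)
```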
