diff --git a/src/tea_tasting/__init__.py b/src/tea_tasting/__init__.py
index 7e5f671..4ff3afa 100644
--- a/src/tea_tasting/__init__.py
+++ b/src/tea_tasting/__init__.py
@@ -21,5 +21,5 @@
 from tea_tasting.datasets import make_sessions_data, make_users_data
 from tea_tasting.experiment import Experiment
 from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio
-from tea_tasting.multiplicity import adjust_fdr
+from tea_tasting.multiplicity import adjust_fdr, adjust_fwer
 from tea_tasting.version import __version__
diff --git a/src/tea_tasting/multiplicity.py b/src/tea_tasting/multiplicity.py
index 9b46ba8..226d26f 100644
--- a/src/tea_tasting/multiplicity.py
+++ b/src/tea_tasting/multiplicity.py
@@ -13,6 +13,7 @@

 if TYPE_CHECKING:
     from collections.abc import Callable, Sequence
+    from typing import Literal


 NO_NAME_COMPARISON = "-"
@@ -82,7 +83,7 @@ def adjust_fdr(
             and Benjamini-Hochberg procedure is performed.

     Returns:
-        The experiments results with adjusted p-values and alpha.
+        The experiment results with adjusted p-values and alphas.
     """
     alpha = (
         tea_tasting.utils.auto_check(alpha, "alpha")
@@ -100,7 +101,64 @@
         arbitrary_dependence=arbitrary_dependence,
     )
     # In-place update.
-    _adjust_stepup(metric_results, method.adjust)
+    _hochberg_stepup(metric_results, method.adjust)
+
+    return MultipleComparisonsResults(results)
+
+
+def adjust_fwer(
+    experiment_results: tea_tasting.experiment.ExperimentResult | dict[
+        Any, tea_tasting.experiment.ExperimentResult],
+    metrics: str | set[str] | Sequence[str] | None = None,
+    *,
+    alpha: float | None = None,
+    method: Literal["sidak", "bonferroni"] = "sidak",
+    arbitrary_dependence: bool = True,
+) -> MultipleComparisonsResults:
+    """Adjust p-values and alphas to control the family-wise error rate (FWER).
+
+    The number of hypotheses tested is the total number of metrics included in
+    the comparison across all experiment results. For example, if there are
+    3 experiments with 2 metrics each, the number of hypotheses is 6.
+
+    The function performs one of the following procedures, depending on parameters:
+
+    - Holm's step-down procedure, assuming arbitrary dependence between
+        hypotheses (`arbitrary_dependence=True`).
+    - Hochberg's step-up procedure, assuming non-negative correlation between
+        hypotheses (`arbitrary_dependence=False`).
+
+    Args:
+        experiment_results: Experiment results.
+        metrics: Metrics included in the comparison.
+            If `None`, all metrics are included.
+        alpha: Significance level. If `None`, the value from global settings is used.
+        method: Correction method, Šidák (`"sidak"`) or Bonferroni (`"bonferroni"`).
+        arbitrary_dependence: If `True`, arbitrary dependence between hypotheses
+            is assumed and Holm's step-down procedure is performed.
+            If `False`, non-negative correlation between hypotheses is assumed
+            and Hochberg's step-up procedure is performed.
+
+    Returns:
+        The experiment results with adjusted p-values and alphas.
+    """
+    alpha = (
+        tea_tasting.utils.auto_check(alpha, "alpha")
+        if alpha is not None
+        else tea_tasting.config.get_config("alpha")
+    )
+    method = tea_tasting.utils.check_scalar(
+        method, "method", typ=str, in_={"sidak", "bonferroni"})
+    arbitrary_dependence = tea_tasting.utils.check_scalar(
+        arbitrary_dependence, "arbitrary_dependence", typ=bool)
+
+    # results and metric_results refer to the same dicts.
+    results, metric_results = _copy_results(experiment_results, metrics)
+    method_cls = _Sidak if method == "sidak" else _Bonferroni
+    method_ = method_cls(alpha=alpha, m=len(metric_results))  # type: ignore
+    procedure = _holm_stepdown if arbitrary_dependence else _hochberg_stepup
+    # In-place update.
+    procedure(metric_results, method_.adjust)

     return MultipleComparisonsResults(results)

@@ -141,7 +199,7 @@
     return copy_of_experiment_results, copy_of_metric_results


-def _adjust_stepup(
+def _hochberg_stepup(
     metric_results: Sequence[dict[str, Any]],
     adjust: Callable[[float, int], tuple[float, float]],
 ) -> None:
@@ -165,6 +223,30 @@
         k -= 1


+def _holm_stepdown(
+    metric_results: Sequence[dict[str, Any]],
+    adjust: Callable[[float, int], tuple[float, float]],
+) -> None:
+    pvalue_adj_min = 0
+    alpha_adj_max = 1
+    k = 1
+    for metric_result in sorted(metric_results, key=lambda d: d["pvalue"]):
+        pvalue = metric_result["pvalue"]
+        pvalue_adj, alpha_adj = adjust(pvalue, k)
+
+        if alpha_adj_max == 1 and pvalue > alpha_adj:
+            alpha_adj_max = alpha_adj
+        alpha_adj = min(alpha_adj, alpha_adj_max)
+        pvalue_adj = pvalue_adj_min = min(max(pvalue_adj, pvalue_adj_min), 1)
+
+        metric_result.update(
+            pvalue_adj=pvalue_adj,
+            alpha_adj=alpha_adj,
+            null_rejected=int(pvalue <= alpha_adj),
+        )
+        k += 1
+
+
 class _Adjustment(abc.ABC):
     @abc.abstractmethod
     def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
@@ -188,3 +270,23 @@ def __init__(self,
     def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
         coef = k / self.denom_
         return pvalue / coef, self.alpha * coef
+
+
+class _Bonferroni(_Adjustment):
+    def __init__(self, alpha: float, m: int) -> None:
+        self.alpha = alpha
+        self.m = m
+
+    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
+        coef = self.m - k + 1
+        return pvalue * coef, self.alpha / coef
+
+
+class _Sidak(_Adjustment):
+    def __init__(self, alpha: float, m: int) -> None:
+        self.alpha = alpha
+        self.m = m
+
+    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
+        coef = self.m - k + 1
+        return 1 - (1 - pvalue)**coef, 1 - (1 - self.alpha)**(1 / coef)
diff --git a/tests/test_multiplicity.py b/tests/test_multiplicity.py
index 0084939..9144c64 100644
--- a/tests/test_multiplicity.py
+++ b/tests/test_multiplicity.py
@@ -76,7 +76,7 @@ def test_adjust_fdr_default(
     assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
     assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore

-def test_adjust_fdr_arbitrary_dependence(
+def test_adjust_fdr_no_arbitrary_dependence(
     experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
 ):
     results = tea_tasting.multiplicity.adjust_fdr(
@@ -127,3 +127,109 @@ def test_adjust_fdr_single_metric(
     assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.03333333333333333)  # type: ignore
     assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
     assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
+
+
+def test_adjust_fwer_default(
+    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
+):
+    results = tea_tasting.multiplicity.adjust_fwer(experiment_results)
+    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
+    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
+    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
+    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
+    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
+    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
+    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0085124446108471)  # type: ignore
+    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
+    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0102062183130115)  # type: ignore
+    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
+    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
+    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
+    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
+
+
+def test_adjust_fwer_no_arbitrary_dependence(
+    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
+):
+    results = tea_tasting.multiplicity.adjust_fwer(
+        experiment_results,
+        arbitrary_dependence=False,
+    )
+    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
+    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
+    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
+    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
+    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
+    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
+    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
+    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
+    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
+    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0253205655191037)  # type: ignore
+    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
+    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
+    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
+
+
+def test_adjust_fwer_bonferroni(
+    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
+):
+    results = tea_tasting.multiplicity.adjust_fwer(
+        experiment_results,
+        method="bonferroni",
+    )
+    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
+    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
+    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
+    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
+    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
+    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
+    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.00833333333333333)  # type: ignore
+    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
+    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.01)  # type: ignore
+    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
+    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
+    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
+    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
+
+
+def test_adjust_fwer_bonferroni_no_arbitrary_dependence(
+    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
+):
+    results = tea_tasting.multiplicity.adjust_fwer(
+        experiment_results,
+        method="bonferroni",
+        arbitrary_dependence=False,
+    )
+    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
+    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
+    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
+    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
+    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
+    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
+    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
+    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
+    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
+    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.025)  # type: ignore
+    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
+    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
+    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
+    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
+    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
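
Note outside the diff: a minimal usage sketch of the new `adjust_fwer`, assuming the `Experiment`, `Mean`, and `make_users_data` APIs re-exported in `__init__.py` above; the metric names, seed, and experiment key are made up for illustration.

    import tea_tasting as tt

    # Made-up metrics and dataset; adjust_fwer accepts a single ExperimentResult
    # or a dict mapping experiment names to results.
    experiment = tt.Experiment(
        sessions_per_user=tt.Mean("sessions"),
        orders_per_user=tt.Mean("orders"),
    )
    data = tt.make_users_data(seed=42)
    result = experiment.analyze(data)

    # Defaults: Šidák correction with Holm's step-down procedure.
    print(tt.adjust_fwer({"my_experiment": result}))

    # Bonferroni correction with Hochberg's step-up procedure instead.
    print(tt.adjust_fwer(
        {"my_experiment": result},
        method="bonferroni",
        arbitrary_dependence=False,
    ))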
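
Also outside the diff: a self-contained sketch of the default behaviour on plain numbers, combining the per-step Šidák correction from `_Sidak.adjust` with Holm's step-down ordering from `_holm_stepdown`. The p-values at the bottom are made up. The sketch reports the raw per-step significance level; the library additionally caps `alpha_adj` once a hypothesis fails to be rejected, which does not change which hypotheses are rejected.

    def holm_sidak(pvalues, alpha=0.05):
        """Šidák correction with Holm's step-down ordering (illustration only)."""
        m = len(pvalues)
        pvalue_adj_min = 0.0  # running maximum keeps adjusted p-values non-decreasing
        stop = False          # after the first non-rejection, nothing later is rejected
        rows = []
        for k, pvalue in enumerate(sorted(pvalues), start=1):
            coef = m - k + 1                           # hypotheses still in play at step k
            pvalue_adj = 1 - (1 - pvalue) ** coef      # Šidák-adjusted p-value
            alpha_adj = 1 - (1 - alpha) ** (1 / coef)  # per-step significance level
            pvalue_adj = pvalue_adj_min = min(max(pvalue_adj, pvalue_adj_min), 1.0)
            stop = stop or pvalue > alpha_adj
            rows.append({
                "pvalue": pvalue,
                "pvalue_adj": pvalue_adj,
                "alpha_adj": alpha_adj,
                "null_rejected": int(not stop),
            })
        return rows

    for row in holm_sidak([0.005, 0.04, 0.02, 0.6]):
        print(row)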
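
Finally, a possible sanity check on the expected values in the new tests, assuming statsmodels is acceptable as a development dependency (the p-values below are placeholders, not the test fixture's values): the default Šidák correction with Holm's step-down should match `multipletests(..., method="holm-sidak")`, `method="bonferroni"` should match `"holm"`, and `method="bonferroni"` with `arbitrary_dependence=False` should match `"simes-hochberg"`. The Šidák correction combined with Hochberg's step-up has no direct statsmodels counterpart.

    from statsmodels.stats.multitest import multipletests

    pvalues = [0.005, 0.01, 0.02, 0.04]  # placeholder p-values, not the fixture
    reject, pvalues_adj, _, _ = multipletests(pvalues, alpha=0.05, method="holm-sidak")
    print(reject, pvalues_adj)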