Merge pull request #91 from e10v/dev
Add family-wise error rate (FWER) for multiple comparison problem
e10v committed Sep 7, 2024
2 parents 6da4441 + 0b5794c commit 50eac0f
Showing 3 changed files with 213 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/tea_tasting/__init__.py
@@ -21,5 +21,5 @@
from tea_tasting.datasets import make_sessions_data, make_users_data
from tea_tasting.experiment import Experiment
from tea_tasting.metrics import Bootstrap, Mean, Quantile, RatioOfMeans, SampleRatio
from tea_tasting.multiplicity import adjust_fdr
from tea_tasting.multiplicity import adjust_fdr, adjust_fwer
from tea_tasting.version import __version__
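
For orientation, here is a minimal usage sketch of the newly exported `adjust_fwer`. It is a sketch, not part of the commit: the metric names are hypothetical, the data helper and `Experiment` API come from the imports above, and the defaults follow the new function's signature shown below.

```python
import tea_tasting as tt

# Toy experiment with two metrics, using the sample data helper exported above.
data = tt.make_users_data(seed=42)
experiment = tt.Experiment(
    orders_per_user=tt.Mean("orders"),
    revenue_per_user=tt.Mean("revenue"),
)
result = experiment.analyze(data)

# Adjust p-values and alphas to control the FWER across both metrics.
# Defaults per the new signature: Šidák correction, Holm's step-down procedure.
print(tt.adjust_fwer(result))
```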
108 changes: 105 additions & 3 deletions src/tea_tasting/multiplicity.py
@@ -13,6 +13,7 @@

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence
    from typing import Literal


NO_NAME_COMPARISON = "-"
@@ -82,7 +83,7 @@ def adjust_fdr(
            and Benjamini-Hochberg procedure is performed.

    Returns:
        The experiment results with adjusted p-values and alpha.
        The experiment results with adjusted p-values and alphas.
    """
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
@@ -100,7 +101,64 @@
        arbitrary_dependence=arbitrary_dependence,
    )
    # In-place update.
    _adjust_stepup(metric_results, method.adjust)
    _hochberg_stepup(metric_results, method.adjust)

    return MultipleComparisonsResults(results)


def adjust_fwer(
    experiment_results: tea_tasting.experiment.ExperimentResult | dict[
        Any, tea_tasting.experiment.ExperimentResult],
    metrics: str | set[str] | Sequence[str] | None = None,
    *,
    alpha: float | None = None,
    method: Literal["sidak", "bonferroni"] = "sidak",
    arbitrary_dependence: bool = True,
) -> MultipleComparisonsResults:
    """Adjust p-value and alpha to control the family-wise error rate (FWER).

    The number of hypotheses tested is the total number of metrics included in
    the comparison in all experiment results. For example, if there are
    3 experiments with 2 metrics in each, the number of hypotheses is 6.

    The function performs one of the following procedures, depending on parameters:

    - Holm's step-down procedure, assuming arbitrary dependence between
        hypotheses (`arbitrary_dependence=True`).
    - Hochberg's step-up procedure, assuming non-negative correlation between
        hypotheses (`arbitrary_dependence=False`).

    Args:
        experiment_results: Experiment results.
        metrics: Metrics included in the comparison.
            If `None`, all metrics are included.
        alpha: Significance level. If `None`, the value from global settings is used.
        method: Correction method, Šidák (`"sidak"`) or Bonferroni (`"bonferroni"`).
        arbitrary_dependence: If `True`, arbitrary dependence between hypotheses
            is assumed and Holm's step-down procedure is performed.
            If `False`, non-negative correlation between hypotheses is assumed
            and Hochberg's step-up procedure is performed.

    Returns:
        The experiment results with adjusted p-values and alphas.
    """
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
        if alpha is not None
        else tea_tasting.config.get_config("alpha")
    )
    method = tea_tasting.utils.check_scalar(
        method, "method", typ=str, in_={"sidak", "bonferroni"})
    arbitrary_dependence = tea_tasting.utils.check_scalar(
        arbitrary_dependence, "arbitrary_dependence", typ=bool)

    # results and metric_results refer to the same dicts.
    results, metric_results = _copy_results(experiment_results, metrics)
    method_cls = _Sidak if method == "sidak" else _Bonferroni
    method_ = method_cls(alpha=alpha, m=len(metric_results))  # type: ignore
    procedure = _holm_stepdown if arbitrary_dependence else _hochberg_stepup
    # In-place update.
    procedure(metric_results, method_.adjust)

    return MultipleComparisonsResults(results)

@@ -141,7 +199,7 @@ def _copy_results(
    return copy_of_experiment_results, copy_of_metric_results


def _adjust_stepup(
def _hochberg_stepup(
    metric_results: Sequence[dict[str, Any]],
    adjust: Callable[[float, int], tuple[float, float]],
) -> None:
@@ -165,6 +223,30 @@ def _adjust_stepup(
        k -= 1


def _holm_stepdown(
    metric_results: Sequence[dict[str, Any]],
    adjust: Callable[[float, int], tuple[float, float]],
) -> None:
    pvalue_adj_min = 0
    alpha_adj_max = 1
    k = 1
    for metric_result in sorted(metric_results, key=lambda d: d["pvalue"]):
        pvalue = metric_result["pvalue"]
        pvalue_adj, alpha_adj = adjust(pvalue, k)

        if alpha_adj_max == 1 and pvalue > alpha_adj:
            alpha_adj_max = alpha_adj
        alpha_adj = min(alpha_adj, alpha_adj_max)
        pvalue_adj = pvalue_adj_min = min(max(pvalue_adj, pvalue_adj_min), 1)

        metric_result.update(
            pvalue_adj=pvalue_adj,
            alpha_adj=alpha_adj,
            null_rejected=int(pvalue <= alpha_adj),
        )
        k += 1


class _Adjustment(abc.ABC):
    @abc.abstractmethod
    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
@@ -188,3 +270,23 @@ def __init__(self,
    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = k / self.denom_
        return pvalue / coef, self.alpha * coef


class _Bonferroni(_Adjustment):
    def __init__(self, alpha: float, m: int) -> None:
        self.alpha = alpha
        self.m = m

    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = self.m - k + 1
        return pvalue * coef, self.alpha / coef


class _Sidak(_Adjustment):
    def __init__(self, alpha: float, m: int) -> None:
        self.alpha = alpha
        self.m = m

    def adjust(self, pvalue: float, k: int) -> tuple[float, float]:
        coef = self.m - k + 1
        return 1 - (1 - pvalue)**coef, 1 - (1 - self.alpha)**(1 / coef)
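
To make the step-down logic concrete, here is a self-contained sketch (the three p-values are invented for illustration) that replays `_holm_stepdown` with the `_Bonferroni` adjustment defined above:

```python
# Holm's step-down with the Bonferroni adjustment, mirroring the helpers above.
pvalues = [0.02, 0.01, 0.04]  # invented p-values for three hypotheses
alpha, m = 0.05, len(pvalues)

pvalue_adj_min, alpha_adj_max = 0.0, 1.0
for k, pvalue in enumerate(sorted(pvalues), start=1):
    coef = m - k + 1  # m for the smallest p-value, down to 1 for the largest
    pvalue_adj = min(max(pvalue * coef, pvalue_adj_min), 1)  # keep adjusted p-values monotone
    alpha_adj = alpha / coef
    if alpha_adj_max == 1.0 and pvalue > alpha_adj:
        alpha_adj_max = alpha_adj  # first non-rejection caps all later thresholds
    alpha_adj = min(alpha_adj, alpha_adj_max)
    pvalue_adj_min = pvalue_adj
    print(k, pvalue, round(pvalue_adj, 4), round(alpha_adj, 4), int(pvalue <= alpha_adj))
# All three nulls are rejected: 0.01 <= 0.05/3, 0.02 <= 0.05/2, 0.04 <= 0.05/1.
```

Swapping in the Šidák rule replaces `pvalue * coef` with `1 - (1 - pvalue)**coef` and `alpha / coef` with `1 - (1 - alpha)**(1 / coef)`, as in `_Sidak`.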
108 changes: 107 additions & 1 deletion tests/test_multiplicity.py
@@ -76,7 +76,7 @@ def test_adjust_fdr_default(
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore

def test_adjust_fdr_arbitrary_dependence(
def test_adjust_fdr_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fdr(
@@ -127,3 +127,109 @@ def test_adjust_fdr_single_metric(
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.03333333333333333)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore


def test_adjust_fwer_default(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(experiment_results)
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.087327)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0085124446108471)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0102062183130115)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        arbitrary_dependence=False,
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.0296274906437343)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.0345134180118071)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.0394039900000001)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.0600000000000001)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0253205655191037)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0127414550985662)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0169524275084415)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_bonferroni(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        method="bonferroni",
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.09)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.00833333333333333)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.01)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore


def test_adjust_fwer_bonferroni_no_arbitrary_dependence(
    experiment_results: dict[Any, tea_tasting.experiment.ExperimentResult],
):
    results = tea_tasting.multiplicity.adjust_fwer(
        experiment_results,
        method="bonferroni",
        arbitrary_dependence=False,
    )
    assert results[(0, 1)]["metric1"]["pvalue_adj"] == pytest.approx(0.03)  # type: ignore
    assert results[(0, 1)]["metric2"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 2)]["metric2"]["pvalue_adj"] == pytest.approx(0.035)  # type: ignore
    assert results[(0, 2)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 3)]["metric1"]["pvalue_adj"] == pytest.approx(0.04)  # type: ignore
    assert results[(0, 3)]["metric3"]["pvalue_adj"] == pytest.approx(0.06)  # type: ignore
    assert results[(0, 1)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 1)]["metric2"]["alpha_adj"] == pytest.approx(0.05)  # type: ignore
    assert results[(0, 2)]["metric2"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 2)]["metric3"]["alpha_adj"] == pytest.approx(0.025)  # type: ignore
    assert results[(0, 3)]["metric1"]["alpha_adj"] == pytest.approx(0.0125)  # type: ignore
    assert results[(0, 3)]["metric3"]["alpha_adj"] == pytest.approx(0.0166666666666667)  # type: ignore
    assert results[(0, 1)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 1)]["metric2"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 2)]["metric2"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 2)]["metric3"]["null_rejected"] == 0  # type: ignore
    assert results[(0, 3)]["metric1"]["null_rejected"] == 1  # type: ignore
    assert results[(0, 3)]["metric3"]["null_rejected"] == 0  # type: ignore
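
As a cross-check, the first expected value in each FWER test can be reproduced by hand. The fixture's raw p-values are not shown in this diff, but the expectations are consistent with a smallest raw p-value of 0.005 across m = 6 hypotheses:

```python
# Inferred, not shown in the diff: the smallest raw p-value appears to be 0.005.
m, p = 6, 0.005
print(p * m)           # 0.03 -> test_adjust_fwer_bonferroni (Holm's first step)
print(1 - (1 - p)**m)  # 0.0296274906437343 -> test_adjust_fwer_default (Šidák)
```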
