From 528682e332dfc2828ba34f9272ca608cacd960e9 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Fri, 26 Jul 2024 09:47:28 -0700 Subject: [PATCH] refactor resample action - removed unused code and associated tests - added tests of resample action - removed random_seed parameter (it seems to have not worked when I added tests, possibly b/c under the hood the randomization is happening with numpy.random, not Python's random) - this should be added back at some point, but i don't have time right now - see issue #18 - updated copyright headers - edited help text, refactored for re-use in related actions - added usage examples --- q2_boots/__init__.py | 6 +- q2_boots/_examples.py | 46 ++++++++ q2_boots/_normalize.py | 36 ------ q2_boots/_resample.py | 19 +++ q2_boots/alpha.py | 2 +- q2_boots/beta.py | 2 +- q2_boots/core_metrics.py | 2 +- q2_boots/plugin_setup.py | 115 ++++++++++++------- q2_boots/tests/test_alpha.py | 2 +- q2_boots/tests/test_beta.py | 2 +- q2_boots/tests/test_core_metrics.py | 2 +- q2_boots/tests/test_examples.py | 16 +++ q2_boots/tests/test_normalize.py | 58 ---------- q2_boots/tests/test_resample.py | 172 ++++++++++++++++++++++++++++ 14 files changed, 335 insertions(+), 145 deletions(-) create mode 100644 q2_boots/_examples.py delete mode 100644 q2_boots/_normalize.py create mode 100644 q2_boots/_resample.py create mode 100644 q2_boots/tests/test_examples.py delete mode 100644 q2_boots/tests/test_normalize.py create mode 100644 q2_boots/tests/test_resample.py diff --git a/q2_boots/__init__.py b/q2_boots/__init__.py index 6e1a6fc..7314a5c 100644 --- a/q2_boots/__init__.py +++ b/q2_boots/__init__.py @@ -1,12 +1,12 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- -from ._normalize import _bootstrap_iteration, resample +from ._resample import resample from .alpha import (alpha, alpha_collection, alpha_average) @@ -15,7 +15,7 @@ from .core_metrics import (core_metrics) from . import _version -__all__ = ['_bootstrap_iteration', 'resample', +__all__ = ['resample', 'alpha', 'alpha_collection', 'beta', diff --git a/q2_boots/_examples.py b/q2_boots/_examples.py new file mode 100644 index 0000000..e7c7b6d --- /dev/null +++ b/q2_boots/_examples.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import pandas as pd +import qiime2 + +def table_factory(): + table = pd.DataFrame(data=[[0, 1, 1], + [10, 10, 9], + [30, 20, 9], + [42, 42, 9]], + columns=['F1', 'F2', 'F3'], + index=['S1', 'S2', 'S3', 'S4']) + return qiime2.Artifact.import_data( + "FeatureTable[Frequency]", table, view_type=pd.DataFrame) + +def _resample_bootstrap_example(use): + table = use.init_artifact('table', table_factory) + + resampled_tables, = use.action( + use.UsageAction(plugin_id='boots', + action_id='resample'), + use.UsageInputs(table=table, + sampling_depth=20, + n=10, + replacement=True), + use.UsageOutputNames(resampled_tables='bootstrapped_tables') + ) + +def _resample_rarefaction_example(use): + table = use.init_artifact('table', table_factory) + + resampled_tables, = use.action( + use.UsageAction(plugin_id='boots', + action_id='resample'), + use.UsageInputs(table=table, + sampling_depth=20, + n=10, + replacement=False), + use.UsageOutputNames(resampled_tables='rarefaction_tables') + ) diff --git a/q2_boots/_normalize.py b/q2_boots/_normalize.py 
deleted file mode 100644 index 9c6c349..0000000 --- a/q2_boots/_normalize.py +++ /dev/null @@ -1,36 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- - -import biom -import random - - -def _bootstrap_iteration(table: biom.Table, sampling_depth: int) -> biom.Table: - table = table.filter(lambda v, i, m: v.sum() >= sampling_depth, - inplace=False, axis='sample') - - table = table.subsample(sampling_depth, axis='sample', by_id=False, - with_replacement=True) - - return table - - -def resample(ctx, table, sampling_depth, n, replacement, random_seed=None): - - if random_seed is not None: - random.seed(random_seed) - - _iteration = ctx.get_action('feature_table', 'rarefy') - - tables = [] - - for i in range(n): - tables.append(_iteration(table=table, sampling_depth=sampling_depth, - with_replacement=replacement)[0]) - - return tables diff --git a/q2_boots/_resample.py b/q2_boots/_resample.py new file mode 100644 index 0000000..20cb18d --- /dev/null +++ b/q2_boots/_resample.py @@ -0,0 +1,19 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +def resample(ctx, table, sampling_depth, n, replacement): + rarefy_action = ctx.get_action('feature_table', 'rarefy') + resampled_tables = [] + + for i in range(n): + resampled_table = rarefy_action(table=table, + sampling_depth=sampling_depth, + with_replacement=replacement)[0] + resampled_tables.append(resampled_table) + + return {f'resampled-table-{i}': t for i, t in enumerate(resampled_tables)} diff --git a/q2_boots/alpha.py b/q2_boots/alpha.py index 017a175..c37d547 100644 --- a/q2_boots/alpha.py +++ b/q2_boots/alpha.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/beta.py b/q2_boots/beta.py index f2b3e24..2f03e46 100644 --- a/q2_boots/beta.py +++ b/q2_boots/beta.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/core_metrics.py b/q2_boots/core_metrics.py index 9255ae7..29f09ed 100644 --- a/q2_boots/core_metrics.py +++ b/q2_boots/core_metrics.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/plugin_setup.py b/q2_boots/plugin_setup.py index 6ed6b18..9659f50 100644 --- a/q2_boots/plugin_setup.py +++ b/q2_boots/plugin_setup.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. 
+# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # @@ -23,6 +23,78 @@ from q2_types.ordination import PCoAResults import q2_boots +from q2_boots._examples import (_resample_bootstrap_example, + _resample_rarefaction_example) + +Citations = Citations.load('citations.bib', package='q2_boots') + +plugin = Plugin( + name='boots', + version=q2_boots.__version__, + website='https://github.com/caporaso-lab/q2-boots', + package='q2_boots', + short_description=('Bootstrapped and rarefaction-based diversity ' + 'analyses.'), + description=('A plugin providing bootstrapped and rarefaction-based ' + 'diversity analyses, designed to mirror the interface of ' + 'q2-diversity.') +) + + +_feature_table_description = 'The feature table to be resampled.' +_sampling_depth_description = ( + 'The total number of observations that each sample in `table` should be ' + 'resampled to. Samples where the total number of observations in `table` ' + 'is less than `sampling_depth` will not be included in the output ' + 'tables.') +_n_description = 'The number of resampled tables that should be generated.' +_replacement_description = ( + 'Resample `table` with replacement (i.e., bootstrap) or resample without ' + 'replacement (i.e., rarefaction).') +_resampled_tables_description = 'The `n` resampled tables.' 
+ +## Resampling + +_resample_inputs = { + 'table': FeatureTable[Frequency] +} +_resample_parameters = { + 'sampling_depth': Int % Range(1, None), + 'n': Int % Range(1, None), + 'replacement': Bool +} +_resample_outputs = { + 'resampled_tables': Collection[FeatureTable[Frequency]] +} +_resample_input_descriptions = { + 'table': _feature_table_description +} +_resample_parameter_descriptions = { + 'sampling_depth': _sampling_depth_description, + 'n': _n_description, + 'replacement': _replacement_description +} +_resample_output_descriptions = { + 'resampled_tables': _resampled_tables_description +} + +plugin.pipelines.register_function( + function=q2_boots.resample, + inputs=_resample_inputs, + parameters=_resample_parameters, + outputs=_resample_outputs, + input_descriptions=_resample_input_descriptions, + parameter_descriptions=_resample_parameter_descriptions, + output_descriptions=_resample_output_descriptions, + name='Resample feature table.', + description=('Resample `table` to `sampling_depth` total observations with ' + 'replacement (i.e., bootstrapping) or without replacement ' + '(i.e., rarefaction) `n` times, to generate `n` resampled ' + 'feature tables.'), + examples={'Generate 10 bootstrapped tables.': _resample_bootstrap_example, + 'Generate 10 rarefied tables.': _resample_rarefaction_example} +) + n_jobs_description = ( 'The number of concurrent jobs to use in performing this calculation. 
' @@ -49,47 +121,6 @@ 'included' ) -Citations = Citations.load('citations.bib', package='q2_boots') -plugin = Plugin( - name='boots', - version=q2_boots.__version__, - website='https://github.com/caporaso-lab/q2-boots', - package='q2_boots', - short_description=('Bootstrapped and rarefaction-based diversity ' - 'analyses.'), - description=('A plugin providing bootstrapped and rarefaction-based ' - 'diversity analyses, designed to mirror the interface of ' - 'q2-diversity.') -) - -plugin.pipelines.register_function( - function=q2_boots.resample, - inputs={'table': FeatureTable[Frequency]}, - parameters={'sampling_depth': Int % Range(1, None), - 'n': Int % Range(1, None), - 'replacement': Bool, - 'random_seed': Int}, - outputs={'subsampled_tables': Collection[FeatureTable[Frequency]]}, - input_descriptions={'table': 'The table to be subsampled'}, - parameter_descriptions={ - 'sampling_depth': ('The total frequency that each sample should be ' - 'subsampled to. Samples where the sum of frequencies ' - 'is less than the sampling depth will be not be ' - 'included in the resulting table.'), - 'n': 'The number of times to subsample the input table.', - 'replacement': '', - 'random_seed': random_seed_description - }, - output_descriptions={ - 'subsampled_tables': 'A collection of n tables normalized to the specified ' - 'sampling depth' - }, - name='Bootstrap', - description='This pipeline is a repeated subsampling of a specified input table. ' - 'N tables are produced normalized so the sum of each sample\'s ' - 'frequency is equal to the sampling depth.' 
-) - plugin.pipelines.register_function( function=q2_boots.alpha_collection, inputs={'table': FeatureTable[Frequency | RelativeFrequency | PresenceAbsence], diff --git a/q2_boots/tests/test_alpha.py b/q2_boots/tests/test_alpha.py index 15de4ab..5491bb6 100644 --- a/q2_boots/tests/test_alpha.py +++ b/q2_boots/tests/test_alpha.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/tests/test_beta.py b/q2_boots/tests/test_beta.py index 8ed1efc..ee31655 100644 --- a/q2_boots/tests/test_beta.py +++ b/q2_boots/tests/test_beta.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/tests/test_core_metrics.py b/q2_boots/tests/test_core_metrics.py index 80c7c1e..228577f 100644 --- a/q2_boots/tests/test_core_metrics.py +++ b/q2_boots/tests/test_core_metrics.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). # # Distributed under the terms of the Modified BSD License. # diff --git a/q2_boots/tests/test_examples.py b/q2_boots/tests/test_examples.py new file mode 100644 index 0000000..825f717 --- /dev/null +++ b/q2_boots/tests/test_examples.py @@ -0,0 +1,16 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from qiime2.plugin.testing import TestPluginBase + + +class UsageExampleTests(TestPluginBase): + package = 'q2_boots.tests' + + def test_examples(self): + self.execute_examples() \ No newline at end of file diff --git a/q2_boots/tests/test_normalize.py b/q2_boots/tests/test_normalize.py deleted file mode 100644 index bff5dcd..0000000 --- a/q2_boots/tests/test_normalize.py +++ /dev/null @@ -1,58 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- - -from unittest import TestCase, main - -import numpy as np -from biom.table import Table - -from q2_boots import _bootstrap_iteration - - -class TestBootstrapIteration(TestCase): - - def test_bootstrap_iteration_filters_samples(self): - t = Table(np.array([[0, 1, 3], [1, 1, 2]]), - ['O1', 'O2'], - ['S1', 'S2', 'S3']) - - observed = _bootstrap_iteration(t, 6) - self.assertTrue(observed.is_empty()) - - observed = _bootstrap_iteration(t, 5) - self.assertEqual(list(observed.ids(axis='sample')), ['S3']) - - observed = _bootstrap_iteration(t, 2) - self.assertEqual(list(observed.ids(axis='sample')), ['S2', 'S3']) - - observed = _bootstrap_iteration(t, 1) - self.assertEqual(list(observed.ids(axis='sample')), ['S1', 'S2', 'S3']) - - def test_bootstrap_iteration_obtains_expected_counts(self): - t = Table(np.array([[0, 10, 30], [1, 10, 20]]), - ['O1', 'O2'], - ['S1', 'S2', 'S3']) - - observed = _bootstrap_iteration(t, 1) - self.assertEqual(list(observed.sum(axis="sample")), [1., 1., 1.]) - - observed = _bootstrap_iteration(t, 10) - self.assertEqual(list(observed.sum(axis="sample")), [10., 10.]) - - observed = _bootstrap_iteration(t, 19) - 
self.assertEqual(list(observed.sum(axis="sample")), [19., 19.]) - - observed = _bootstrap_iteration(t, 25) - self.assertEqual(list(observed.sum(axis="sample")), [25.]) - - observed = _bootstrap_iteration(t, 49) - self.assertEqual(list(observed.sum(axis="sample")), [49.]) - - -if __name__ == "__main__": - main() diff --git a/q2_boots/tests/test_resample.py b/q2_boots/tests/test_resample.py new file mode 100644 index 0000000..bea5fc3 --- /dev/null +++ b/q2_boots/tests/test_resample.py @@ -0,0 +1,172 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2024, Caporaso Lab (https://cap-lab.bio). +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import numpy as np +import pandas as pd +import pandas.testing as pdt + +import qiime2 +from qiime2.plugin.testing import TestPluginBase + +from q2_boots import resample + + +class ResampleTests(TestPluginBase): + package = 'q2_boots.tests' + + def setUp(self): + super().setUp() + self.resample_pipeline = self.plugin.pipelines['resample'] + + table1 = pd.DataFrame(data=[[0, 1],[1, 1],[3, 2]], + columns=['F1', 'F2'], + index=['S1', 'S2', 'S3']) + self.table_artifact1 = qiime2.Artifact.import_data( + "FeatureTable[Frequency]", table1, view_type=pd.DataFrame + ) + + table2 = pd.DataFrame(data=[[0, 1, 1], + [10, 10, 9], + [30, 20, 9], + [42, 42, 9]], + columns=['F1', 'F2', 'F3'], + index=['S1', 'S2', 'S3', 'S4']) + self.table_artifact2 = qiime2.Artifact.import_data( + "FeatureTable[Frequency]", table2, view_type=pd.DataFrame + ) + + table3 = pd.DataFrame(data=[[1, 1]], + columns=['F1', 'F2'], + index=['S1']) + self.table_artifact3 = qiime2.Artifact.import_data( + "FeatureTable[Frequency]", table3, view_type=pd.DataFrame + ) + + def test_resample_w_replacement_filters_samples(self): + 
self._resample_filters_sample(replacement=True) + + def test_resample_wo_replacement_filters_samples(self): + self._resample_filters_sample(replacement=False) + + def test_expected_sampling_depth_w_replacement(self): + self._expected_sampling_depth(replacement=True) + + def test_expected_sampling_depth_wo_replacement(self): + self._expected_sampling_depth(replacement=False) + + def test_expected_n_tables(self): + obs_tables, = self.resample_pipeline(table=self.table_artifact1, + sampling_depth=1, + n=4, + replacement=True) + self.assertEqual(len(obs_tables), 4) + + obs_tables, = self.resample_pipeline(table=self.table_artifact1, + sampling_depth=1, + n=2, + replacement=True) + self.assertEqual(len(obs_tables), 2) + + def test_w_replacement(self): + obs_tables, = self.resample_pipeline(table=self.table_artifact3, + sampling_depth=2, + n=100, + replacement=True) + # if sampling with replacement from a sample with 2 unique features + # that have one observation each, we should observe a resampled + # table with only one feature 50% of the time. the probability of not + # seeing a table with only one feature in 100 resample tables is 1e-30, + # so intermittent failure of this test is possible but should be + # extremely infrequent + fewer_than_two_unique_features_ever_observed = False + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + if len(obs_table.columns) < 2: + fewer_than_two_unique_features_ever_observed = True + self.assertTrue(fewer_than_two_unique_features_ever_observed) + + def test_wo_replacement(self): + obs_tables, = self.resample_pipeline(table=self.table_artifact3, + sampling_depth=2, + n=100, + replacement=False) + # if sampling without replacement from a sample with 2 unique features + # that have one observation each, we should never observe a resampled + # feature table with only one feature. confirm that over 100 + # iterations we always have two features in the resampled table. 
+ exactly_two_features_always_observed = True + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + if len(obs_table.columns) != 2: + exactly_two_features_always_observed = False + self.assertTrue(exactly_two_features_always_observed) + + + # test helper functions + def _expected_sampling_depth(self, replacement): + obs_tables, = self.resample_pipeline(table=self.table_artifact2, + sampling_depth=1, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + self.assertEqual(list(obs_table.sum(axis=1)), + [1., 1., 1., 1.]) + + obs_tables, = self.resample_pipeline(table=self.table_artifact2, + sampling_depth=2, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + self.assertEqual(list(obs_table.sum(axis=1)), + [2., 2., 2., 2.]) + + obs_tables, = self.resample_pipeline(table=self.table_artifact2, + sampling_depth=50, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + self.assertEqual(list(obs_table.sum(axis=1)), + [50., 50.]) + + + def _resample_filters_sample(self, replacement): + with self.assertRaisesRegex(ValueError, "no samples or features"): + _ = self.resample_pipeline(table=self.table_artifact1, + sampling_depth=6, + n=10, + replacement=replacement) + + obs_tables, = self.resample_pipeline(table=self.table_artifact1, + sampling_depth=5, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + sids = list(obs_table.index) + self.assertEqual(sids, ['S3']) + + obs_tables, = self.resample_pipeline(table=self.table_artifact1, + sampling_depth=2, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + sids = list(obs_table.index) + self.assertEqual(sids, ['S2', 'S3']) + + obs_tables, = 
self.resample_pipeline(table=self.table_artifact1, + sampling_depth=1, + n=10, + replacement=replacement) + for obs_table in obs_tables.values(): + obs_table = obs_table.view(pd.DataFrame) + sids = list(obs_table.index) + self.assertEqual(sids, ['S1', 'S2', 'S3'])