From 2c21b631950e75772199c9ceb9c9b5ec445918e2 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sun, 30 Oct 2022 13:17:58 +0000 Subject: [PATCH] Add `genotype_values()` method See https://github.com/tskit-dev/tsinfer/issues/739 --- python/CHANGELOG.rst | 11 +++++++++++ python/tests/test_genotypes.py | 27 +++++++++++++++++++++++++++ python/tskit/genotypes.py | 13 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index ba8045b7ab..b5702d737c 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,3 +1,14 @@ +-------------------- +[0.5.4] - 2022-XX-XX +-------------------- + +**Features** + + - Variants have a `genotype_values()` method that returns the genotypes as an + (inefficient) array of strings or objects, rather than integer indexes, to + aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`) + + -------------------- [0.5.3] - 2022-10-03 -------------------- diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 1443e40061..bfa7c18d91 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -655,6 +655,33 @@ def test_snipped_tree_sequence_mutations_over_isolated(self): assert non_missing_found assert missing_found + def test_genotype_values(self): + tables = tskit.TableCollection(1.0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + s = tables.sites.add_row(0, "C") + tables.mutations.add_row(site=s, derived_state="G", node=0) + tables.mutations.add_row(site=s, derived_state="T", node=1) + s = tables.sites.add_row(0.5, "") + tables.mutations.add_row(site=s, derived_state="A long string", node=0) + ts = tables.tree_sequence() + + v = ts.variants(isolated_as_missing=False) + vals = next(v).genotype_values() + assert vals.dtype.type == np.str_ + assert np.array_equal(vals, np.array(["G", "T"])) + vals = next(v).genotype_values() + assert vals.dtype.type == np.str_ + assert 
np.array_equal(vals, np.array(["A long string", ""])) + + v = ts.variants(isolated_as_missing=True) + vals = next(v).genotype_values() + assert vals.dtype.type == np.str_ + assert np.array_equal(vals, np.array(["G", "T"])) + vals = next(v).genotype_values() + assert vals.dtype.type == np.object_ + assert np.array_equal(vals, np.array(["A long string", None])) + class TestLimitInterval: def test_simple_case(self, ts_fixture): diff --git a/python/tskit/genotypes.py b/python/tskit/genotypes.py index d0abfb3835..a967a07661 100644 --- a/python/tskit/genotypes.py +++ b/python/tskit/genotypes.py @@ -245,6 +245,19 @@ def copy(self) -> Variant: variant_copy._ll_variant = self._ll_variant.restricted_copy() return variant_copy + def genotype_values(self) -> np.ndarray: + """ + Returns the genotypes at this site as a numpy array of strings (if + there is no missing data) or objects (if the genotypes contain missing data, + in which case some elements will be equal to ``None``), + rather than an array of integer indexes. Note that this is inefficient + compared to working with the underlying integer representation as + returned by the :attr:`~Variant.genotypes` property. + + :return: An array of length ``num_samples`` containing strings or objects. + """ + return np.array(self.alleles)[self.genotypes] + def counts(self) -> typing.Counter[str | None]: """ Returns a :class:`python:collections.Counter` object providing counts for each