Skip to content

Commit

Permalink
Add genotype_values() method
Browse files Browse the repository at this point in the history
  • Loading branch information
hyanwong committed Jan 7, 2023
1 parent cba6922 commit 9f178fe
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 2 deletions.
6 changes: 6 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
to ``None``, which is treated as ``True``. Previously, passing ``None``
would result in an error. (:user:`hyanwong`, :pr:`2609`, :issue:`2608`)

**Features**

- Variants have a `genotype_values()` method that returns the genotypes as an
(inefficient) array of strings or objects, rather than integer indexes, to
aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`)


--------------------
[0.5.3] - 2022-10-03
Expand Down
61 changes: 60 additions & 1 deletion python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2019-2022 Tskit Developers
# Copyright (c) 2019-2023 Tskit Developers
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -655,6 +655,65 @@ def test_snipped_tree_sequence_mutations_over_isolated(self):
assert non_missing_found
assert missing_found

def get_missing_data_ts(self):
tables = tskit.TableCollection(1.0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
s = tables.sites.add_row(0, "A")
tables.mutations.add_row(site=s, derived_state="B", node=1)
tables.mutations.add_row(site=s, derived_state="C", node=2)
s = tables.sites.add_row(0.5, "")
tables.mutations.add_row(site=s, derived_state="A long string", node=2)
return tables.tree_sequence()

def test_states(self):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=False)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["A", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["", "", "A long string"]))
# With no mssing data, it shouldn't matter if the missing string = an allele
assert np.array_equal(
v.states(missing_data_string=""), np.array(["", "", "A long string"])
)

v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "N", "A long string"]))
assert np.array_equal(
v.states(missing_data_string="MISSING"),
np.array(["MISSING", "MISSING", "A long string"]),
)

@pytest.mark.parametrize("missing", [True, False])
def test_states_haplotypes_equiv(self, missing):
ts = msprime.sim_ancestry(2, sequence_length=20, random_seed=1)
ts = msprime.sim_mutations(ts, rate=0.1, random_seed=1)
assert ts.num_sites > 5
tables = ts.dump_tables()
tables.delete_intervals([[0, ts.site(4).position]])
tables.sites.replace_with(ts.tables.sites)
ts = tables.tree_sequence()
states = np.array(
[v.states() for v in ts.variants(isolated_as_missing=missing)]
)
for h1, h2 in zip(ts.haplotypes(isolated_as_missing=missing), states.T):
assert h1 == "".join(h2)

@pytest.mark.parametrize("s", ["", "A long string", True, np.nan, 0, -1])
def test_bad_states(self, s):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
v = next(v_iter)
match = "existing allele" if isinstance(s, str) else "not a string"
with pytest.raises(ValueError, match=match):
v.states(missing_data_string=s)


class TestLimitInterval:
def test_simple_case(self, ts_fixture):
Expand Down
31 changes: 30 additions & 1 deletion python/tskit/genotypes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# Copyright (c) 2018-2022 Tskit Developers
# Copyright (c) 2018-2023 Tskit Developers
# Copyright (c) 2015-2018 University of Oxford
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand Down Expand Up @@ -245,6 +245,35 @@ def copy(self) -> Variant:
variant_copy._ll_variant = self._ll_variant.restricted_copy()
return variant_copy

def states(self, missing_data_string=None) -> np.ndarray:
"""
Returns the allelic states at this site as an array of strings.
.. warning::
Using this method is inefficient compared to working with the
underlying integer representation of genotypes as returned by
the :attr:`~Variant.genotypes` property.
:param str missing_data_string: A string that will be used to represent missing
data. If any normal allele contains this character, an error is raised.
Default: `None`, treated as `'N'`.
:return: An numpy array of strings of length ``num_sites``.
"""
if missing_data_string is None:
missing_data_string = "N"
elif not isinstance(missing_data_string, str):
# Must explicitly test here, otherwise we output a numpy object array
raise ValueError("Missing data string is not a string")
alleles = self.alleles
if alleles[-1] is None:
if missing_data_string in alleles:
raise ValueError(
"An existing allele is equal to the "
f"missing data string '{missing_data_string}'"
)
alleles = alleles[:-1] + (missing_data_string,)
return np.array(alleles)[self.genotypes]

def counts(self) -> typing.Counter[str | None]:
"""
Returns a :class:`python:collections.Counter` object providing counts for each
Expand Down

0 comments on commit 9f178fe

Please sign in to comment.