Skip to content

Commit

Permalink
Add genotype_values() method
Browse files Browse the repository at this point in the history
  • Loading branch information
hyanwong committed Sep 24, 2024
1 parent d6b2303 commit efa8d08
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 1 deletion.
4 changes: 3 additions & 1 deletion python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
- Add ``Interval.mid`` and ``Tree.mid`` properties to return the midpoint of the interval.
(:user:`currocam`, :pr:`2960`)

- Variants now have a `states()` method that returns the genotypes as an
(inefficient) array of strings, rather than integer indexes, to
aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`)

--------------------
[0.5.8] - 2024-06-27
Expand Down Expand Up @@ -165,7 +168,6 @@
to ``None``, which is treated as ``True``. Previously, passing ``None``
would result in an error. (:user:`hyanwong`, :pr:`2609`, :issue:`2608`)


--------------------
[0.5.3] - 2022-10-03
--------------------
Expand Down
59 changes: 59 additions & 0 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,65 @@ def test_snipped_tree_sequence_mutations_over_isolated(self):
assert non_missing_found
assert missing_found

def get_missing_data_ts(self):
tables = tskit.TableCollection(1.0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
s = tables.sites.add_row(0, "A")
tables.mutations.add_row(site=s, derived_state="B", node=1)
tables.mutations.add_row(site=s, derived_state="C", node=2)
s = tables.sites.add_row(0.5, "")
tables.mutations.add_row(site=s, derived_state="A long string", node=2)
return tables.tree_sequence()

def test_states(self):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=False)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["A", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["", "", "A long string"]))
# With no mssing data, it shouldn't matter if the missing string = an allele
assert np.array_equal(
v.states(missing_data_string=""), np.array(["", "", "A long string"])
)

v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "N", "A long string"]))
assert np.array_equal(
v.states(missing_data_string="MISSING"),
np.array(["MISSING", "MISSING", "A long string"]),
)

@pytest.mark.parametrize("missing", [True, False])
def test_states_haplotypes_equiv(self, missing):
ts = msprime.sim_ancestry(2, sequence_length=20, random_seed=1)
ts = msprime.sim_mutations(ts, rate=0.1, random_seed=1)
assert ts.num_sites > 5
tables = ts.dump_tables()
tables.delete_intervals([[0, ts.site(4).position]])
tables.sites.replace_with(ts.tables.sites)
ts = tables.tree_sequence()
states = np.array(
[v.states() for v in ts.variants(isolated_as_missing=missing)]
)
for h1, h2 in zip(ts.haplotypes(isolated_as_missing=missing), states.T):
assert h1 == "".join(h2)

@pytest.mark.parametrize("s", ["", "A long string", True, np.nan, 0, -1])
def test_bad_states(self, s):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
v = next(v_iter)
match = "existing allele" if isinstance(s, str) else "not a string"
with pytest.raises(ValueError, match=match):
v.states(missing_data_string=s)


class TestLimitInterval:
def test_simple_case(self, ts_fixture):
Expand Down
29 changes: 29 additions & 0 deletions python/tskit/genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,35 @@ def copy(self) -> Variant:
variant_copy._ll_variant = self._ll_variant.restricted_copy()
return variant_copy

def states(self, missing_data_string=None) -> np.ndarray:
"""
Returns the allelic states at this site as an array of strings.
.. warning::
Using this method is inefficient compared to working with the
underlying integer representation of genotypes as returned by
the :attr:`~Variant.genotypes` property.
:param str missing_data_string: A string that will be used to represent missing
data. If any normal allele contains this character, an error is raised.
Default: `None`, treated as `'N'`.
:return: An numpy array of strings of length ``num_sites``.
"""
if missing_data_string is None:
missing_data_string = "N"
elif not isinstance(missing_data_string, str):
# Must explicitly test here, otherwise we output a numpy object array
raise ValueError("Missing data string is not a string")
alleles = self.alleles
if alleles[-1] is None:
if missing_data_string in alleles:
raise ValueError(
"An existing allele is equal to the "
f"missing data string '{missing_data_string}'"
)
alleles = alleles[:-1] + (missing_data_string,)
return np.array(alleles)[self.genotypes]

def counts(self) -> typing.Counter[str | None]:
"""
Returns a :class:`python:collections.Counter` object providing counts for each
Expand Down

0 comments on commit efa8d08

Please sign in to comment.