Skip to content

Commit

Permalink
Improve VCF sample handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Jul 27, 2022
1 parent 98dacf5 commit 009a251
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 26 deletions.
145 changes: 129 additions & 16 deletions python/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,6 @@ def test_simple_infinite_sites_ploidy_2_reversed_samples(self):
assert ts.num_sites > 2
self.verify(ts)

def test_simple_infinite_sites_ploidy_2_even_samples(self):
ts = msprime.simulate(20, mutation_rate=1, random_seed=2)
samples = ts.samples()[0::2]
ts = tsutil.insert_individuals(ts, nodes=samples, ploidy=2)
assert ts.num_sites > 2
self.verify(ts)

def test_simple_jukes_cantor_random_ploidy(self):
ts = msprime.simulate(10, random_seed=2)
ts = tsutil.jukes_cantor(ts, num_sites=10, mu=1, seed=2)
Expand Down Expand Up @@ -349,21 +342,49 @@ def test_bad_ploidy(self):
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO, bad_ploidy)

def test_individuals_no_nodes(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts.dump_tables()
def test_individuals_no_nodes_default_args(self):
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts1.dump_tables()
tables.individuals.add_row()
ts = tables.tree_sequence()
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO())
ts2 = tables.tree_sequence()
assert ts1.as_vcf() == ts2.as_vcf()

def test_individuals_no_nodes_as_argument(self):
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts1.dump_tables()
tables.individuals.add_row()
ts2 = tables.tree_sequence()
with pytest.raises(ValueError, match="0 not associated with a node"):
ts2.as_vcf(individuals=[0])

def test_ploidy_with_individuals(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
ts = msprime.sim_ancestry(3, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO(), ploidy=2)

def test_empth_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(ValueError):
ts.as_vcf(individuals=[])

def test_mixed_sample_non_sample_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
tables = ts.dump_tables()
tables.individuals.add_row()
# Add a reference to an individual from a non-sample
individual = tables.nodes.individual
individual[-1] = 0
tables.nodes.individual = individual
ts = tables.tree_sequence()
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO(), ploidy=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(
ValueError, match="0 has nodes that are sample and non-sample"
):
ts.as_vcf()
# but it's OK if we run without the affected individual
assert len(ts.as_vcf(individuals=[1, 2])) > 0

def test_bad_individuals(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
Expand Down Expand Up @@ -703,3 +724,95 @@ def test_ok_with_pysam(self):

gts = [variants[1].samples[key]["GT"] for key in samples]
assert gts == [(0,), (1,), (None,)]


def drop_individuals(ts):
tables = ts.dump_tables()
individual = tables.nodes.individual
individual[:] = -1
tables.individuals.clear()
tables.nodes.individual = individual
return tables.tree_sequence()


class TestSampleOptions:
@tests.cached_example
def ts(self):
ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence
ts = tsutil.insert_branch_sites(ts)
tables = ts.dump_tables()
tables.individuals.add_row()
tables.individuals.add_row()
individual = tables.nodes.individual
# One diploid and one haploid, not in adjacent individuals
individual[0] = 0
individual[1] = 1
individual[2] = 0
tables.nodes.individual = individual
return tables.tree_sequence()

def test_no_individuals_defaults(self):
ts = drop_individuals(self.ts())
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1\ttsk_2
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1\t0\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0\t1\t0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0\t1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf()) == expected

def test_no_individuals_ploidy_3(self):
ts = drop_individuals(self.ts())
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|1|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(ploidy=3)) == expected

def test_defaults(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0\t1
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1\t0"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf()) == expected

def test_individual_0(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[0])) == expected

def test_individual_1(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[1])) == expected

def test_reversed(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[1, 0])) == expected
1 change: 0 additions & 1 deletion python/tests/tsutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ def insert_random_ploidy_individuals(
ind_id = tables.individuals.add_row(location=location, parents=parents)
individual[nodes] = ind_id
j += ploidy
j += rng.randint(0, 5) # skip a random number
tables.nodes.individual = individual
return tables.tree_sequence()

Expand Down
41 changes: 32 additions & 9 deletions python/tskit/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"""
import numpy as np

import tskit
from . import provenance


Expand Down Expand Up @@ -67,12 +68,7 @@ def __init__(
self.contig_id = contig_id
self.isolated_as_missing = isolated_as_missing

if individuals is None:
individuals = np.arange(tree_sequence.num_individuals, dtype=int)
self.individuals = individuals

self.__make_sample_mapping(ploidy)

self.__make_sample_mapping(ploidy, individuals)
if individual_names is None:
individual_names = [f"tsk_{j}" for j in range(self.num_individuals)]
self.individual_names = individual_names
Expand Down Expand Up @@ -115,13 +111,34 @@ def __init__(
sample_mask = np.array(sample_mask, dtype=bool)
self.sample_mask = lambda _: sample_mask

def __make_sample_mapping(self, ploidy):
def __make_sample_mapping(self, ploidy, individuals):
"""
Compute the sample IDs for each VCF individual and the template for
writing out genotypes.
"""
ts = self.tree_sequence
self.samples = None
self.individual_ploidies = []

if individuals is None:
# Find all sample nodes that reference individuals
self.individuals = np.unique(ts.nodes_individual[ts.samples()])
if len(self.individuals) == 1 and self.individuals[0] == tskit.NULL:
# No samples refer to individuals
self.individuals = []
else:
# np.unique sorts the argument, so if NULL (-1) is present it
# will be the first value.
if self.individuals[0] == tskit.NULL:
raise ValueError(
"Sample nodes must either all be associated with individuals "
"or not associated with any individuals"
)
else:
if len(individuals) == 0:
raise ValueError("List of sample individuals empty")
self.individuals = individuals

if len(self.individuals) > 0:
if ploidy is not None:
raise ValueError("Cannot specify ploidy when individuals present")
Expand All @@ -130,10 +147,16 @@ def __make_sample_mapping(self, ploidy):
if i < 0 or i >= self.tree_sequence.num_individuals:
raise ValueError("Invalid individual IDs provided.")
ind = self.tree_sequence.individual(i)
if len(ind.nodes) == 0:
raise ValueError(f"Individual {i} not associated with a node")
is_sample = {ts.node(u).is_sample() for u in ind.nodes}
if len(is_sample) != 1:
raise ValueError(
f"Individual {ind.id} has nodes that are sample and "
"non-samples"
)
self.samples.extend(ind.nodes)
self.individual_ploidies.append(len(ind.nodes))
if len(self.samples) == 0:
raise ValueError("The individuals do not map to any sampled nodes.")
else:
if ploidy is None:
ploidy = 1
Expand Down

0 comments on commit 009a251

Please sign in to comment.