Skip to content

Commit

Permalink
Improve VCF sample handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Jul 28, 2022
1 parent 8d89566 commit 5db1a39
Show file tree
Hide file tree
Showing 4 changed files with 266 additions and 49 deletions.
246 changes: 215 additions & 31 deletions python/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,6 @@ def test_simple_infinite_sites_ploidy_2_reversed_samples(self):
assert ts.num_sites > 2
self.verify(ts)

def test_simple_infinite_sites_ploidy_2_even_samples(self):
ts = msprime.simulate(20, mutation_rate=1, random_seed=2)
samples = ts.samples()[0::2]
ts = tsutil.insert_individuals(ts, nodes=samples, ploidy=2)
assert ts.num_sites > 2
self.verify(ts)

def test_simple_jukes_cantor_random_ploidy(self):
ts = msprime.simulate(10, random_seed=2)
ts = tsutil.jukes_cantor(ts, num_sites=10, mu=1, seed=2)
Expand Down Expand Up @@ -341,36 +334,91 @@ class TestInterface:

def test_bad_ploidy(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
for bad_ploidy in [-1, 0, 20]:
with pytest.raises(ValueError):
for bad_ploidy in [-1, 0]:
with pytest.raises(ValueError, match="Ploidy must be >= 1"):
ts.write_vcf(io.StringIO, bad_ploidy)
# Non divisible
for bad_ploidy in [3, 7]:
with pytest.raises(ValueError):
with pytest.raises(
ValueError, match="Sample size must be divisible by ploidy"
):
ts.write_vcf(io.StringIO, bad_ploidy)

def test_individuals_no_nodes(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
def test_individuals_no_nodes_default_args(self):
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts1.dump_tables()
tables.individuals.add_row()
ts2 = tables.tree_sequence()
assert ts1.as_vcf() == ts2.as_vcf()

def test_individuals_no_nodes_as_argument(self):
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts1.dump_tables()
tables.individuals.add_row()
ts2 = tables.tree_sequence()
with pytest.raises(ValueError, match="0 not associated with a node"):
ts2.as_vcf(individuals=[0])

def test_ploidy_with_sample_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"):
ts.write_vcf(io.StringIO(), ploidy=2)

def test_ploidy_with_no_node_individuals(self):
ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
tables = ts1.dump_tables()
tables.individuals.add_row()
ts2 = tables.tree_sequence()
with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"):
ts2.as_vcf(ploidy=2)

def test_empty_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(ValueError, match="List of sample individuals empty"):
ts.as_vcf(individuals=[])

def test_mixed_sample_non_sample_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
tables = ts.dump_tables()
tables.individuals.add_row()
# Add a reference to an individual from a non-sample
individual = tables.nodes.individual
individual[-1] = 0
tables.nodes.individual = individual
ts = tables.tree_sequence()
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO())

def test_ploidy_with_individuals(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(
ValueError, match="0 has nodes that are sample and non-sample"
):
ts.as_vcf()
# but it's OK if we run without the affected individual
assert len(ts.as_vcf(individuals=[1, 2])) > 0

def test_samples_with_and_without_individuals(self):
ts = tskit.Tree.generate_balanced(3).tree_sequence
tables = ts.dump_tables()
tables.individuals.add_row()
# Add a reference to an individual from one sample
individual = tables.nodes.individual
individual[0] = 0
tables.nodes.individual = individual
ts = tables.tree_sequence()
with pytest.raises(ValueError):
ts.write_vcf(io.StringIO(), ploidy=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(
ValueError, match="Sample nodes must either all be associated"
):
ts.as_vcf()
# But it's OK if explicitly specify that sample
assert len(ts.as_vcf(individuals=[0])) > 0

def test_bad_individuals(self):
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
ts = tsutil.insert_individuals(ts, ploidy=2)
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="Invalid individual IDs provided."):
ts.write_vcf(io.StringIO(), individuals=[0, -1])
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="Invalid individual IDs provided."):
ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals])

def test_ploidy_positional(self):
Expand All @@ -379,7 +427,7 @@ def test_ploidy_positional(self):

def test_only_ploidy_positional(self):
ts = msprime.simulate(2, mutation_rate=2, random_seed=1)
with pytest.raises(TypeError):
with pytest.raises(TypeError, match="positional arguments"):
assert ts.as_vcf(2, "chr2")


Expand All @@ -400,7 +448,9 @@ def test_many_alleles(self):
for j in range(9, 15):
tables.mutations.add_row(0, node=j, derived_state=str(j))
ts = tables.tree_sequence()
with pytest.raises(ValueError):
with pytest.raises(
ValueError, match="More than 9 alleles not currently supported"
):
ts.write_vcf(io.StringIO())


Expand Down Expand Up @@ -436,17 +486,33 @@ def test_bad_length_individuals(self):
ts = msprime.simulate(6, mutation_rate=2, random_seed=1)
assert ts.num_sites > 0
ts = tsutil.insert_individuals(ts, ploidy=2)
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(io.StringIO(), individual_names=[])
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(io.StringIO(), individual_names=["x" for _ in range(4)])
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(
io.StringIO(),
individuals=list(range(ts.num_individuals)),
individual_names=["x" for _ in range(ts.num_individuals - 1)],
)
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(
io.StringIO(),
individuals=list(range(ts.num_individuals - 1)),
Expand All @@ -456,18 +522,30 @@ def test_bad_length_individuals(self):
def test_bad_length_ploidy(self):
ts = msprime.simulate(6, mutation_rate=2, random_seed=1)
assert ts.num_sites > 0
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(io.StringIO(), ploidy=2, individual_names=[])
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match="individual_names must have length equal to"
" the number of individuals",
):
ts.write_vcf(
io.StringIO(), ploidy=2, individual_names=["x" for _ in range(4)]
)

def test_bad_type(self):
ts = msprime.simulate(2, mutation_rate=2, random_seed=1)
with pytest.raises(TypeError):
with pytest.raises(
TypeError, match="sequence item 0: expected str instance," " NoneType found"
):
ts.write_vcf(io.StringIO(), individual_names=[None, "b"])
with pytest.raises(TypeError):
with pytest.raises(
TypeError, match="sequence item 0: expected str instance," " bytes found"
):
ts.write_vcf(io.StringIO(), individual_names=[b"a", "b"])


Expand Down Expand Up @@ -703,3 +781,109 @@ def test_ok_with_pysam(self):

gts = [variants[1].samples[key]["GT"] for key in samples]
assert gts == [(0,), (1,), (None,)]


def drop_individuals(ts):
tables = ts.dump_tables()
individual = tables.nodes.individual
individual[:] = -1
tables.individuals.clear()
tables.nodes.individual = individual
return tables.tree_sequence()


class TestSampleOptions:
@tests.cached_example
def ts(self):
ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence
ts = tsutil.insert_branch_sites(ts)
tables = ts.dump_tables()
tables.individuals.add_row()
tables.individuals.add_row()
individual = tables.nodes.individual
# One diploid and one haploid, not in adjacent individuals
individual[0] = 0
individual[1] = 1
individual[2] = 0
tables.nodes.individual = individual
return tables.tree_sequence()

def test_no_individuals_defaults(self):
ts = drop_individuals(self.ts())
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1\ttsk_2
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1\t0\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0\t1\t0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0\t1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf()) == expected

def test_no_individuals_ploidy_3(self):
ts = drop_individuals(self.ts())
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|1|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(ploidy=3)) == expected

def test_defaults(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0\t1
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1\t0"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf()) == expected

def test_individual_0(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[0])) == expected

def test_individual_1(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[1])) == expected

def test_reversed(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1"""
expected = textwrap.dedent(s)
assert drop_header(ts.as_vcf(individuals=[1, 0])) == expected

def test_reversed_names(self):
ts = self.ts()
s = """\
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tA\tB
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0
1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1"""
expected = textwrap.dedent(s)
assert (
drop_header(ts.as_vcf(individuals=[1, 0], individual_names=["A", "B"]))
== expected
)
1 change: 0 additions & 1 deletion python/tests/tsutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ def insert_random_ploidy_individuals(
ind_id = tables.individuals.add_row(location=location, parents=parents)
individual[nodes] = ind_id
j += ploidy
j += rng.randint(0, 5) # skip a random number
tables.nodes.individual = individual
return tables.tree_sequence()

Expand Down
6 changes: 6 additions & 0 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -5994,6 +5994,12 @@ def write_vcf(
for duplicates in this array, or perform any checks to ensure that
the output VCF is well-formed.
.. note::
The default individual names (VCF sample IDs) are always of
the form ``tsk_0``, ``tsk_1``, ..., ``tsk_{N - 1}``, where
N is the number of individuals we output. These numbers
are **not** necessarily the individual IDs.
.. note::
Warning to ``plink`` users:
Expand Down
Loading

0 comments on commit 5db1a39

Please sign in to comment.