From 2d0d33e92b50529683d200be41886afc37d671ad Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 26 Jul 2022 14:22:38 +0100 Subject: [PATCH] Improve VCF sample handling Closes #2257 Closes #2446 Closes #2448 --- python/tests/test_vcf.py | 246 ++++++++++++++++++++++++++++++++++----- python/tests/tsutil.py | 1 - python/tskit/trees.py | 6 + python/tskit/vcf.py | 62 +++++++--- 4 files changed, 266 insertions(+), 49 deletions(-) diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 6934d12acb..0034b99b4b 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -208,13 +208,6 @@ def test_simple_infinite_sites_ploidy_2_reversed_samples(self): assert ts.num_sites > 2 self.verify(ts) - def test_simple_infinite_sites_ploidy_2_even_samples(self): - ts = msprime.simulate(20, mutation_rate=1, random_seed=2) - samples = ts.samples()[0::2] - ts = tsutil.insert_individuals(ts, nodes=samples, ploidy=2) - assert ts.num_sites > 2 - self.verify(ts) - def test_simple_jukes_cantor_random_ploidy(self): ts = msprime.simulate(10, random_seed=2) ts = tsutil.jukes_cantor(ts, num_sites=10, mu=1, seed=2) @@ -341,36 +334,91 @@ class TestInterface: def test_bad_ploidy(self): ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) - for bad_ploidy in [-1, 0, 20]: - with pytest.raises(ValueError): + for bad_ploidy in [-1, 0]: + with pytest.raises(ValueError, match="Ploidy must be >= 1"): ts.write_vcf(io.StringIO, bad_ploidy) # Non divisible for bad_ploidy in [3, 7]: - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="Sample size must be divisible by ploidy" + ): ts.write_vcf(io.StringIO, bad_ploidy) - def test_individuals_no_nodes(self): - ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) + def test_individuals_no_nodes_default_args(self): + ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2) + tables = ts1.dump_tables() + tables.individuals.add_row() + ts2 = tables.tree_sequence() + assert ts1.as_vcf() == ts2.as_vcf() + + def test_individuals_no_nodes_as_argument(self): + ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2) + tables = ts1.dump_tables() + tables.individuals.add_row() + ts2 = tables.tree_sequence() + with pytest.raises(ValueError, match="0 not associated with a node"): + ts2.as_vcf(individuals=[0]) + + def test_ploidy_with_sample_individuals(self): + ts = msprime.sim_ancestry(3, random_seed=2) + ts = tsutil.insert_branch_sites(ts) + with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"): + ts.write_vcf(io.StringIO(), ploidy=2) + + def test_ploidy_with_no_node_individuals(self): + ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2) + tables = ts1.dump_tables() + tables.individuals.add_row() + ts2 = tables.tree_sequence() + with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"): + ts2.as_vcf(ploidy=2) + + def test_empty_individuals(self): + ts = msprime.sim_ancestry(3, random_seed=2) + ts = tsutil.insert_branch_sites(ts) + with pytest.raises(ValueError, match="List of sample individuals empty"): + ts.as_vcf(individuals=[]) + + def test_mixed_sample_non_sample_individuals(self): + ts = msprime.sim_ancestry(3, random_seed=2) tables = ts.dump_tables() tables.individuals.add_row() + # Add a reference to an individual from a non-sample + individual = tables.nodes.individual + individual[-1] = 0 + tables.nodes.individual = individual ts = tables.tree_sequence() - with pytest.raises(ValueError): - ts.write_vcf(io.StringIO()) - - def test_ploidy_with_individuals(self): - ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) + ts = tsutil.insert_branch_sites(ts) + with pytest.raises( + ValueError, match="0 has nodes that are sample and non-sample" + ): + ts.as_vcf() + # but it's OK if we run without the affected individual + assert len(ts.as_vcf(individuals=[1, 2])) > 0 + + def test_samples_with_and_without_individuals(self): + ts = tskit.Tree.generate_balanced(3).tree_sequence tables = ts.dump_tables() tables.individuals.add_row() + # Add a reference to an individual from one sample + individual = tables.nodes.individual + individual[0] = 0 + tables.nodes.individual = individual ts = tables.tree_sequence() - with pytest.raises(ValueError): - ts.write_vcf(io.StringIO(), ploidy=2) + ts = tsutil.insert_branch_sites(ts) + with pytest.raises( + ValueError, match="Sample nodes must either all be associated" + ): + ts.as_vcf() + # But it's OK if explicitly specify that sample + assert len(ts.as_vcf(individuals=[0])) > 0 def test_bad_individuals(self): ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) ts = tsutil.insert_individuals(ts, ploidy=2) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid individual IDs provided."): ts.write_vcf(io.StringIO(), individuals=[0, -1]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid individual IDs provided."): ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals]) def test_ploidy_positional(self): @@ -379,7 +427,7 @@ def test_ploidy_positional(self): def test_only_ploidy_positional(self): ts = msprime.simulate(2, mutation_rate=2, random_seed=1) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="positional arguments"): assert ts.as_vcf(2, "chr2") @@ -400,7 +448,9 @@ def test_many_alleles(self): for j in range(9, 15): tables.mutations.add_row(0, node=j, derived_state=str(j)) ts = tables.tree_sequence() - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="More than 9 alleles not currently supported" + ): ts.write_vcf(io.StringIO()) @@ -436,17 +486,33 @@ def test_bad_length_individuals(self): ts = msprime.simulate(6, mutation_rate=2, random_seed=1) assert ts.num_sites > 0 ts = tsutil.insert_individuals(ts, ploidy=2) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf(io.StringIO(), individual_names=[]) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf(io.StringIO(), individual_names=["x" for _ in range(4)]) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf( io.StringIO(), individuals=list(range(ts.num_individuals)), individual_names=["x" for _ in range(ts.num_individuals - 1)], ) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf( io.StringIO(), individuals=list(range(ts.num_individuals - 1)), @@ -456,18 +522,30 @@ def test_bad_length_individuals(self): def test_bad_length_ploidy(self): ts = msprime.simulate(6, mutation_rate=2, random_seed=1) assert ts.num_sites > 0 - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf(io.StringIO(), ploidy=2, individual_names=[]) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="individual_names must have length equal to" + " the number of individuals", + ): ts.write_vcf( io.StringIO(), ploidy=2, individual_names=["x" for _ in range(4)] ) def test_bad_type(self): ts = msprime.simulate(2, mutation_rate=2, random_seed=1) - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="sequence item 0: expected str instance," " NoneType found" + ): ts.write_vcf(io.StringIO(), individual_names=[None, "b"]) - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="sequence item 0: expected str instance," " bytes found" + ): ts.write_vcf(io.StringIO(), individual_names=[b"a", "b"]) @@ -703,3 +781,109 @@ def test_ok_with_pysam(self): gts = [variants[1].samples[key]["GT"] for key in samples] assert gts == [(0,), (1,), (None,)] + + +def drop_individuals(ts): + tables = ts.dump_tables() + individual = tables.nodes.individual + individual[:] = -1 + tables.individuals.clear() + tables.nodes.individual = individual + return tables.tree_sequence() + + +class TestSampleOptions: + @tests.cached_example + def ts(self): + ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence + ts = tsutil.insert_branch_sites(ts) + tables = ts.dump_tables() + tables.individuals.add_row() + tables.individuals.add_row() + individual = tables.nodes.individual + # One diploid and one haploid, not in adjacent individuals + individual[0] = 0 + individual[1] = 1 + individual[2] = 0 + tables.nodes.individual = individual + return tables.tree_sequence() + + def test_no_individuals_defaults(self): + ts = drop_individuals(self.ts()) + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1\ttsk_2 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1\t0\t0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1\t1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0\t1\t0 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0\t1""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf()) == expected + + def test_no_individuals_ploidy_3(self): + ts = drop_individuals(self.ts()) + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0|0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1|1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|1|0 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|0|1""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf(ploidy=3)) == expected + + def test_defaults(self): + ts = self.ts() + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0\t0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1\t1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0\t1 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1\t0""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf()) == expected + + def test_individual_0(self): + ts = self.ts() + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf(individuals=[0])) == expected + + def test_individual_1(self): + ts = self.ts() + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf(individuals=[1])) == expected + + def test_reversed(self): + ts = self.ts() + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1 + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1""" + expected = textwrap.dedent(s) + assert drop_header(ts.as_vcf(individuals=[1, 0])) == expected + + def test_reversed_names(self): + ts = self.ts() + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tA\tB + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0 + 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1 + 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0 + 1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1""" + expected = textwrap.dedent(s) + assert ( + drop_header(ts.as_vcf(individuals=[1, 0], individual_names=["A", "B"])) + == expected + ) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 77a6863c13..4f2a928229 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -222,7 +222,6 @@ def insert_random_ploidy_individuals( ind_id = tables.individuals.add_row(location=location, parents=parents) individual[nodes] = ind_id j += ploidy - j += rng.randint(0, 5) # skip a random number tables.nodes.individual = individual return tables.tree_sequence() diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 9d5aa3805c..7a2cc01002 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5994,6 +5994,12 @@ def write_vcf( for duplicates in this array, or perform any checks to ensure that the output VCF is well-formed. + .. note:: + The default individual names (VCF sample IDs) are always of + the form ``tsk_0``, ``tsk_1``, ..., ``tsk_{N - 1}``, where + N is the number of individuals we output. These numbers + are **not** necessarily the individual IDs. + .. note:: Warning to ``plink`` users: diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py index fba646ac1c..2ee74b761c 100644 --- a/python/tskit/vcf.py +++ b/python/tskit/vcf.py @@ -25,6 +25,7 @@ """ import numpy as np +import tskit from . import provenance @@ -67,12 +68,7 @@ def __init__( self.contig_id = contig_id self.isolated_as_missing = isolated_as_missing - if individuals is None: - individuals = np.arange(tree_sequence.num_individuals, dtype=int) - self.individuals = individuals - - self.__make_sample_mapping(ploidy) - + self.__make_sample_mapping(ploidy, individuals) if individual_names is None: individual_names = [f"tsk_{j}" for j in range(self.num_individuals)] self.individual_names = individual_names @@ -115,35 +111,67 @@ def __init__( sample_mask = np.array(sample_mask, dtype=bool) self.sample_mask = lambda _: sample_mask - def __make_sample_mapping(self, ploidy): + def __make_sample_mapping(self, ploidy, individuals): """ Compute the sample IDs for each VCF individual and the template for writing out genotypes. """ + ts = self.tree_sequence self.samples = None self.individual_ploidies = [] - if len(self.individuals) > 0: - if ploidy is not None: - raise ValueError("Cannot specify ploidy when individuals present") + + # Cannot use "ploidy" when *any* individuals are present. + if ts.num_individuals > 0 and ploidy is not None: + raise ValueError( + "Cannot specify ploidy when individuals are present in tables " + ) + + if individuals is None: + # Find all sample nodes that reference individuals + individuals = np.unique(ts.nodes_individual[ts.samples()]) + if len(individuals) == 1 and individuals[0] == tskit.NULL: + # No samples refer to individuals + individuals = None + else: + # np.unique sorts the argument, so if NULL (-1) is present it + # will be the first value. + if individuals[0] == tskit.NULL: + raise ValueError( + "Sample nodes must either all be associated with individuals " + "or not associated with any individuals" + ) + else: + individuals = np.array(individuals, dtype=np.int32) + if len(individuals) == 0: + raise ValueError("List of sample individuals empty") + + if individuals is not None: self.samples = [] - for i in self.individuals: + # FIXME this could probably be done more efficiently. + for i in individuals: if i < 0 or i >= self.tree_sequence.num_individuals: raise ValueError("Invalid individual IDs provided.") ind = self.tree_sequence.individual(i) + if len(ind.nodes) == 0: + raise ValueError(f"Individual {i} not associated with a node") + is_sample = {ts.node(u).is_sample() for u in ind.nodes} + if len(is_sample) != 1: + raise ValueError( + f"Individual {ind.id} has nodes that are sample and " + "non-samples" + ) self.samples.extend(ind.nodes) self.individual_ploidies.append(len(ind.nodes)) - if len(self.samples) == 0: - raise ValueError("The individuals do not map to any sampled nodes.") else: if ploidy is None: ploidy = 1 if ploidy < 1: raise ValueError("Ploidy must be >= 1") - if self.tree_sequence.num_samples % ploidy != 0: + if ts.num_samples % ploidy != 0: raise ValueError("Sample size must be divisible by ploidy") - self.individual_ploidies = [ - ploidy for _ in range(self.tree_sequence.sample_size // ploidy) - ] + self.individual_ploidies = np.full( + ts.sample_size // ploidy, ploidy, dtype=np.int32 + ) self.num_individuals = len(self.individual_ploidies) def __write_header(self, output):