From 2d0d33e92b50529683d200be41886afc37d671ad Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Tue, 26 Jul 2022 14:22:38 +0100
Subject: [PATCH] Improve VCF sample handling

Closes #2257
Closes #2446
Closes #2448
---
 python/tests/test_vcf.py | 246 ++++++++++++++++++++++++++++++++++-----
 python/tests/tsutil.py   |   1 -
 python/tskit/trees.py    |   6 +
 python/tskit/vcf.py      |  62 +++++++---
 4 files changed, 266 insertions(+), 49 deletions(-)

diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py
index 6934d12acb..0034b99b4b 100644
--- a/python/tests/test_vcf.py
+++ b/python/tests/test_vcf.py
@@ -208,13 +208,6 @@ def test_simple_infinite_sites_ploidy_2_reversed_samples(self):
         assert ts.num_sites > 2
         self.verify(ts)
 
-    def test_simple_infinite_sites_ploidy_2_even_samples(self):
-        ts = msprime.simulate(20, mutation_rate=1, random_seed=2)
-        samples = ts.samples()[0::2]
-        ts = tsutil.insert_individuals(ts, nodes=samples, ploidy=2)
-        assert ts.num_sites > 2
-        self.verify(ts)
-
     def test_simple_jukes_cantor_random_ploidy(self):
         ts = msprime.simulate(10, random_seed=2)
         ts = tsutil.jukes_cantor(ts, num_sites=10, mu=1, seed=2)
@@ -341,36 +334,91 @@ class TestInterface:
 
     def test_bad_ploidy(self):
         ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
-        for bad_ploidy in [-1, 0, 20]:
-            with pytest.raises(ValueError):
+        for bad_ploidy in [-1, 0]:
+            with pytest.raises(ValueError, match="Ploidy must be >= 1"):
                 ts.write_vcf(io.StringIO, bad_ploidy)
         # Non divisible
         for bad_ploidy in [3, 7]:
-            with pytest.raises(ValueError):
+            with pytest.raises(
+                ValueError, match="Sample size must be divisible by ploidy"
+            ):
                 ts.write_vcf(io.StringIO, bad_ploidy)
 
-    def test_individuals_no_nodes(self):
-        ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
+    def test_individuals_no_nodes_default_args(self):
+        ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
+        tables = ts1.dump_tables()
+        tables.individuals.add_row()
+        ts2 = tables.tree_sequence()
+        assert ts1.as_vcf() == ts2.as_vcf()
+
+    def test_individuals_no_nodes_as_argument(self):
+        ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
+        tables = ts1.dump_tables()
+        tables.individuals.add_row()
+        ts2 = tables.tree_sequence()
+        with pytest.raises(ValueError, match="0 not associated with a node"):
+            ts2.as_vcf(individuals=[0])
+
+    def test_ploidy_with_sample_individuals(self):
+        ts = msprime.sim_ancestry(3, random_seed=2)
+        ts = tsutil.insert_branch_sites(ts)
+        with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"):
+            ts.write_vcf(io.StringIO(), ploidy=2)
+
+    def test_ploidy_with_no_node_individuals(self):
+        ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
+        tables = ts1.dump_tables()
+        tables.individuals.add_row()
+        ts2 = tables.tree_sequence()
+        with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"):
+            ts2.as_vcf(ploidy=2)
+
+    def test_empty_individuals(self):
+        ts = msprime.sim_ancestry(3, random_seed=2)
+        ts = tsutil.insert_branch_sites(ts)
+        with pytest.raises(ValueError, match="List of sample individuals empty"):
+            ts.as_vcf(individuals=[])
+
+    def test_mixed_sample_non_sample_individuals(self):
+        ts = msprime.sim_ancestry(3, random_seed=2)
         tables = ts.dump_tables()
         tables.individuals.add_row()
+        # Add a reference to an individual from a non-sample
+        individual = tables.nodes.individual
+        individual[-1] = 0
+        tables.nodes.individual = individual
         ts = tables.tree_sequence()
-        with pytest.raises(ValueError):
-            ts.write_vcf(io.StringIO())
-
-    def test_ploidy_with_individuals(self):
-        ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
+        ts = tsutil.insert_branch_sites(ts)
+        with pytest.raises(
+            ValueError, match="0 has nodes that are sample and non-sample"
+        ):
+            ts.as_vcf()
+        # but it's OK if we run without the affected individual
+        assert len(ts.as_vcf(individuals=[1, 2])) > 0
+
+    def test_samples_with_and_without_individuals(self):
+        ts = tskit.Tree.generate_balanced(3).tree_sequence
         tables = ts.dump_tables()
         tables.individuals.add_row()
+        # Add a reference to an individual from one sample
+        individual = tables.nodes.individual
+        individual[0] = 0
+        tables.nodes.individual = individual
         ts = tables.tree_sequence()
-        with pytest.raises(ValueError):
-            ts.write_vcf(io.StringIO(), ploidy=2)
+        ts = tsutil.insert_branch_sites(ts)
+        with pytest.raises(
+            ValueError, match="Sample nodes must either all be associated"
+        ):
+            ts.as_vcf()
+        # But it's OK if explicitly specify that sample
+        assert len(ts.as_vcf(individuals=[0])) > 0
 
     def test_bad_individuals(self):
         ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
         ts = tsutil.insert_individuals(ts, ploidy=2)
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match="Invalid individual IDs provided."):
             ts.write_vcf(io.StringIO(), individuals=[0, -1])
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match="Invalid individual IDs provided."):
             ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals])
 
     def test_ploidy_positional(self):
@@ -379,7 +427,7 @@ def test_ploidy_positional(self):
 
     def test_only_ploidy_positional(self):
         ts = msprime.simulate(2, mutation_rate=2, random_seed=1)
-        with pytest.raises(TypeError):
+        with pytest.raises(TypeError, match="positional arguments"):
             assert ts.as_vcf(2, "chr2")
 
 
@@ -400,7 +448,9 @@ def test_many_alleles(self):
         for j in range(9, 15):
             tables.mutations.add_row(0, node=j, derived_state=str(j))
             ts = tables.tree_sequence()
-            with pytest.raises(ValueError):
+            with pytest.raises(
+                ValueError, match="More than 9 alleles not currently supported"
+            ):
                 ts.write_vcf(io.StringIO())
 
 
@@ -436,17 +486,33 @@ def test_bad_length_individuals(self):
         ts = msprime.simulate(6, mutation_rate=2, random_seed=1)
         assert ts.num_sites > 0
         ts = tsutil.insert_individuals(ts, ploidy=2)
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(io.StringIO(), individual_names=[])
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(io.StringIO(), individual_names=["x" for _ in range(4)])
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(
                 io.StringIO(),
                 individuals=list(range(ts.num_individuals)),
                 individual_names=["x" for _ in range(ts.num_individuals - 1)],
             )
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(
                 io.StringIO(),
                 individuals=list(range(ts.num_individuals - 1)),
@@ -456,18 +522,30 @@ def test_bad_length_individuals(self):
     def test_bad_length_ploidy(self):
         ts = msprime.simulate(6, mutation_rate=2, random_seed=1)
         assert ts.num_sites > 0
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(io.StringIO(), ploidy=2, individual_names=[])
-        with pytest.raises(ValueError):
+        with pytest.raises(
+            ValueError,
+            match="individual_names must have length equal to"
+            " the number of individuals",
+        ):
             ts.write_vcf(
                 io.StringIO(), ploidy=2, individual_names=["x" for _ in range(4)]
             )
 
     def test_bad_type(self):
         ts = msprime.simulate(2, mutation_rate=2, random_seed=1)
-        with pytest.raises(TypeError):
+        with pytest.raises(
+            TypeError, match="sequence item 0: expected str instance," " NoneType found"
+        ):
             ts.write_vcf(io.StringIO(), individual_names=[None, "b"])
-        with pytest.raises(TypeError):
+        with pytest.raises(
+            TypeError, match="sequence item 0: expected str instance," " bytes found"
+        ):
             ts.write_vcf(io.StringIO(), individual_names=[b"a", "b"])
 
 
@@ -703,3 +781,109 @@ def test_ok_with_pysam(self):
 
             gts = [variants[1].samples[key]["GT"] for key in samples]
             assert gts == [(0,), (1,), (None,)]
+
+
+def drop_individuals(ts):
+    tables = ts.dump_tables()
+    individual = tables.nodes.individual
+    individual[:] = -1
+    tables.individuals.clear()
+    tables.nodes.individual = individual
+    return tables.tree_sequence()
+
+
+class TestSampleOptions:
+    @tests.cached_example
+    def ts(self):
+        ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence
+        ts = tsutil.insert_branch_sites(ts)
+        tables = ts.dump_tables()
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individual = tables.nodes.individual
+        # One diploid and one haploid, not in adjacent individuals
+        individual[0] = 0
+        individual[1] = 1
+        individual[2] = 0
+        tables.nodes.individual = individual
+        return tables.tree_sequence()
+
+    def test_no_individuals_defaults(self):
+        ts = drop_individuals(self.ts())
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1\ttsk_2
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1\t0\t0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1\t1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0\t1\t0
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0\t1"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf()) == expected
+
+    def test_no_individuals_ploidy_3(self):
+        ts = drop_individuals(self.ts())
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0|0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1|1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|1|0
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|0|1"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf(ploidy=3)) == expected
+
+    def test_defaults(self):
+        ts = self.ts()
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0\t0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1\t1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0\t1
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1\t0"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf()) == expected
+
+    def test_individual_0(self):
+        ts = self.ts()
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t0|1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t0|0
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0|1"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf(individuals=[0])) == expected
+
+    def test_individual_1(self):
+        ts = self.ts()
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf(individuals=[1])) == expected
+
+    def test_reversed(self):
+        ts = self.ts()
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1"""
+        expected = textwrap.dedent(s)
+        assert drop_header(ts.as_vcf(individuals=[1, 0])) == expected
+
+    def test_reversed_names(self):
+        ts = self.ts()
+        s = """\
+        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tA\tB
+        1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0
+        1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1
+        1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0
+        1\t6\t3\t0\t1\t.\tPASS\t.\tGT\t0\t0|1"""
+        expected = textwrap.dedent(s)
+        assert (
+            drop_header(ts.as_vcf(individuals=[1, 0], individual_names=["A", "B"]))
+            == expected
+        )
diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py
index 77a6863c13..4f2a928229 100644
--- a/python/tests/tsutil.py
+++ b/python/tests/tsutil.py
@@ -222,7 +222,6 @@ def insert_random_ploidy_individuals(
         ind_id = tables.individuals.add_row(location=location, parents=parents)
         individual[nodes] = ind_id
         j += ploidy
-        j += rng.randint(0, 5)  # skip a random number
     tables.nodes.individual = individual
     return tables.tree_sequence()
 
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 9d5aa3805c..7a2cc01002 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5994,6 +5994,12 @@ def write_vcf(
         for duplicates in this array, or perform any checks to ensure that
         the output VCF is well-formed.
 
+        .. note::
+            The default individual names (VCF sample IDs) are always of
+            the form ``tsk_0``, ``tsk_1``, ..., ``tsk_{N - 1}``, where
+            N is the number of individuals we output. These numbers
+            are **not** necessarily the individual IDs.
+
         .. note::
 
             Warning to ``plink`` users:
diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py
index fba646ac1c..2ee74b761c 100644
--- a/python/tskit/vcf.py
+++ b/python/tskit/vcf.py
@@ -25,6 +25,7 @@
 """
 import numpy as np
 
+import tskit
 from . import provenance
 
 
@@ -67,12 +68,7 @@ def __init__(
         self.contig_id = contig_id
         self.isolated_as_missing = isolated_as_missing
 
-        if individuals is None:
-            individuals = np.arange(tree_sequence.num_individuals, dtype=int)
-        self.individuals = individuals
-
-        self.__make_sample_mapping(ploidy)
-
+        self.__make_sample_mapping(ploidy, individuals)
         if individual_names is None:
             individual_names = [f"tsk_{j}" for j in range(self.num_individuals)]
         self.individual_names = individual_names
@@ -115,35 +111,67 @@ def __init__(
                 sample_mask = np.array(sample_mask, dtype=bool)
                 self.sample_mask = lambda _: sample_mask
 
-    def __make_sample_mapping(self, ploidy):
+    def __make_sample_mapping(self, ploidy, individuals):
         """
         Compute the sample IDs for each VCF individual and the template for
         writing out genotypes.
         """
+        ts = self.tree_sequence
         self.samples = None
         self.individual_ploidies = []
-        if len(self.individuals) > 0:
-            if ploidy is not None:
-                raise ValueError("Cannot specify ploidy when individuals present")
+
+        # Cannot use "ploidy" when *any* individuals are present.
+        if ts.num_individuals > 0 and ploidy is not None:
+            raise ValueError(
+                "Cannot specify ploidy when individuals are present in tables "
+            )
+
+        if individuals is None:
+            # Find all sample nodes that reference individuals
+            individuals = np.unique(ts.nodes_individual[ts.samples()])
+            if len(individuals) == 1 and individuals[0] == tskit.NULL:
+                # No samples refer to individuals
+                individuals = None
+            else:
+                # np.unique sorts the argument, so if NULL (-1) is present it
+                # will be the first value.
+                if individuals[0] == tskit.NULL:
+                    raise ValueError(
+                        "Sample nodes must either all be associated with individuals "
+                        "or not associated with any individuals"
+                    )
+        else:
+            individuals = np.array(individuals, dtype=np.int32)
+            if len(individuals) == 0:
+                raise ValueError("List of sample individuals empty")
+
+        if individuals is not None:
             self.samples = []
-            for i in self.individuals:
+            # FIXME this could probably be done more efficiently.
+            for i in individuals:
                 if i < 0 or i >= self.tree_sequence.num_individuals:
                     raise ValueError("Invalid individual IDs provided.")
                 ind = self.tree_sequence.individual(i)
+                if len(ind.nodes) == 0:
+                    raise ValueError(f"Individual {i} not associated with a node")
+                is_sample = {ts.node(u).is_sample() for u in ind.nodes}
+                if len(is_sample) != 1:
+                    raise ValueError(
+                        f"Individual {ind.id} has nodes that are sample and "
+                        "non-samples"
+                    )
                 self.samples.extend(ind.nodes)
                 self.individual_ploidies.append(len(ind.nodes))
-            if len(self.samples) == 0:
-                raise ValueError("The individuals do not map to any sampled nodes.")
         else:
             if ploidy is None:
                 ploidy = 1
             if ploidy < 1:
                 raise ValueError("Ploidy must be >= 1")
-            if self.tree_sequence.num_samples % ploidy != 0:
+            if ts.num_samples % ploidy != 0:
                 raise ValueError("Sample size must be divisible by ploidy")
-            self.individual_ploidies = [
-                ploidy for _ in range(self.tree_sequence.sample_size // ploidy)
-            ]
+            self.individual_ploidies = np.full(
+                ts.sample_size // ploidy, ploidy, dtype=np.int32
+            )
         self.num_individuals = len(self.individual_ploidies)
 
     def __write_header(self, output):