Merge pull request #1175 from nextstrain/filter-empty-results

filter: Add --empty-output-reporting={error,warn,silent} option
nextstrain · Mar 13, 2023 · 8d24a79 · 8d24a79
2 parents ec69172 + 361ee78
commit 8d24a79
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 15 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,10 @@
 
 ## __NEXT__
 
+### Features
+
+* filter: Add `--empty-output-reporting={error,warn,silent}` option to allow filter to produce empty outputs without raising an error. The default behavior is still to raise an error when filter produces an empty output, so users will have to explicitly pass the "warn" or "silent" value to bypass the error. [#1175][] (@joverlee521)
+
 ### Bug fixes
 
 * translate: Fix error handling when features cannot be read from reference sequence file. [#1168][] (@victorlin)
@@ -13,6 +17,7 @@
 [#1160]: https://github.com/nextstrain/augur/pull/1160
 [#1168]: https://github.com/nextstrain/augur/pull/1168
 [#1169]: https://github.com/nextstrain/augur/pull/1169
+[#1175]: https://github.com/nextstrain/augur/pull/1175
 
 ## 21.0.1 (17 February 2023)
 

diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -2,6 +2,7 @@
 Filter and subsample a sequence set.
 """
 from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
+from augur.types import EmptyOutputReportingMethod
 
 
 # Use sorted() for reproducible output
@@ -63,11 +64,17 @@ def register_arguments(parser):
     Since priorities represent relative values between strains, these values can be arbitrary.""")
     subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
 
-    output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
+    output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
     output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
     output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
     output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
     output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
+    output_group.add_argument(
+        '--empty-output-reporting',
+        type=EmptyOutputReportingMethod.argtype,
+        choices=list(EmptyOutputReportingMethod),
+        default=EmptyOutputReportingMethod.ERROR,
+        help="How should empty outputs be reported when no strains pass filtering and/or subsampling.")
 
     parser.set_defaults(probabilistic_sampling=True)
 

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -14,6 +14,7 @@
 from augur.io.sequences import read_sequences, write_sequences
 from augur.io.print import print_err
 from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf
+from augur.types import EmptyOutputReportingMethod
 from .io import cleanup_outputs, read_priority_scores
 from .include_exclude_rules import apply_filters, construct_filters
 from .subsample import PriorityQueue, TooManyGroupsError, calculate_sequences_per_group, create_queues_by_group, get_groups_for_subsampling
@@ -483,6 +484,14 @@ def run(args):
         print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt))
 
     if total_strains_passed == 0:
-        raise AugurError("All samples have been dropped! Check filter rules and metadata file format.")
+        empty_results_message = "All samples have been dropped! Check filter rules and metadata file format."
+        if args.empty_output_reporting is EmptyOutputReportingMethod.ERROR:
+            raise AugurError(empty_results_message)
+        elif args.empty_output_reporting is EmptyOutputReportingMethod.WARN:
+            print_err(f"WARNING: {empty_results_message}")
+        elif args.empty_output_reporting is EmptyOutputReportingMethod.SILENT:
+            pass
+        else:
+            raise ValueError(f"Encountered unhandled --empty-output-reporting method {args.empty_output_reporting!r}")
 
     print(f"{total_strains_passed} strains passed all filters")
diff --git a/augur/types.py b/augur/types.py
@@ -1,8 +1,44 @@
+from argparse import ArgumentTypeError
 import enum
 
 
+class ArgparseEnum(enum.Enum):
+    """
+    Intended to be used as a parent class for any enum representation of
+    string values to be used with argparse options.
+
+    Can be replaced by :py:class:`enum.StrEnum` once Augur's minimum supported
+    Python version is 3.11.
+    """
+    def __str__(self) -> str:
+        """
+        Stringify to the enum member's :py:attr:`.value` instead of the default.
+
+        This lets us use the enum's constructor and members with argparse's
+        ``type`` and ``choices`` parameters, respectively, without exposing the
+        enum class name to users.
+        """
+        return self.value
+
+    @classmethod
+    def argtype(cls, input_string):
+        """
+        Intended to be used as the argument type converter for argparse options
+        that use the enum values as inputs.
+
+        Raises a custom `argparse.ArgumentTypeError` so that the error
+        message can include a helpful list of all the valid enum values.
+        """
+        try:
+            return cls(input_string)
+        except ValueError as error:
+            choices = ", ".join(f"{str(x)!r}" for x in cls)
+            raise ArgumentTypeError(
+                f"invalid choice: {input_string!r} (choose from {choices})") from error
+
+
 @enum.unique
-class DataErrorMethod(enum.Enum):
+class DataErrorMethod(ArgparseEnum):
     """
     Enum representation of string values that represent how a data error should
     be handled.
@@ -14,21 +50,22 @@ class DataErrorMethod(enum.Enum):
 
 
 @enum.unique
-class ValidationMode(enum.Enum):
+class EmptyOutputReportingMethod(ArgparseEnum):
+    """
+    Enum representation of string values that represent how empty outputs should
+    be reported.
+    """
+    ERROR = 'error'
+    WARN  = 'warn'
+    SILENT  = 'silent'
+
+
+@enum.unique
+class ValidationMode(ArgparseEnum):
     """
     Enum representation of string values that represent how validation should
     be handled.
     """
     ERROR = 'error'
     WARN  = 'warn'
     SKIP  = 'skip'
-
-    def __str__(self) -> str:
-        """
-        Stringify to the enum member's :py:attr:`.value` instead of the default.
-
-        This let us use the enum's constructor and members with argparse's
-        ``type`` and ``choices`` parameters, respectively, without exposing the
-        enum class name to users.
-        """
-        return self.value
diff --git a/tests/functional/filter/cram/filter-empty-output-reporting.t b/tests/functional/filter/cram/filter-empty-output-reporting.t
@@ -0,0 +1,46 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Filter using `--exclude-all` to easily get an empty result.
+
+Test the default behavior for empty results is an error.
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --exclude-all \
+  >  --output-strains filtered_strains.txt > /dev/null
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l filtered_strains.txt
+  \s*0 .* (re)
+
+Repeat with the --empty-output-reporting=warn option.
+This should output a warning message but no error.
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --exclude-all \
+  >  --output-strains filtered_strains.txt \
+  >  --empty-output-reporting warn > /dev/null
+  WARNING: All samples have been dropped! Check filter rules and metadata file format.
+  $ wc -l filtered_strains.txt
+  \s*0 .* (re)
+
+Ignore empty results with the --empty-output-reporting=silent option.
+Make sure all 3 output types are empty, except the metadata output should still include the header.
+This should not output any messages to stderr.
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --sequences "$TESTDIR/../data/sequences.fasta" \
+  >  --exclude-all \
+  >  --output-sequences filtered_seqs.fasta \
+  >  --output-metadata filtered_metadata.tsv \
+  >  --output-strains filtered_strains.txt \
+  >  --empty-output-reporting silent > /dev/null
+  $ wc -l filtered_seqs.fasta
+  \s*0 .* (re)
+  $ diff <(head -n 1 filtered_metadata.tsv) <(head -n 1 "$TESTDIR/../data/metadata.tsv")
+  $ wc -l filtered_strains.txt
+  \s*0 .* (re)
diff --git a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
@@ -24,7 +24,7 @@ The query initially filters 3 strains from Colombia, one of which is added back
   \\t1 strains were added back because they were in .*include\.txt.* (re)
   9 strains passed all filters
 
-  $ (head -n1; sort -k 1,1) < filtered_log.tsv
+  $ head -n 1 filtered_log.tsv; tail -n +2 filtered_log.tsv | sort -k 1,1
   strain	filter	kwargs
   COL/FLR_00008/2015	filter_by_query	"[[""query"", ""country != 'Colombia'""]]"
   COL/FLR_00008/2015\tforce_include_strains\t"[[""include_file"", ""*/data/include.txt""]]" (esc) (glob)