Skip to content

Commit

Permalink
Merge pull request #1175 from nextstrain/filter-empty-results
Browse files Browse the repository at this point in the history
filter: Add --empty-results-reporting={error,warn,skip} option
  • Loading branch information
joverlee521 committed Mar 13, 2023
2 parents ec69172 + 361ee78 commit 8d24a79
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 15 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## __NEXT__

### Features

* filter: Add `--empty-output-reporting={error,warn,silent}` option to allow filter to produce empty outputs without raising an error. The default behavior is still to raise an error when filter produces an empty output, so users will have to explicitly pass the "warn" or "silent" value to bypass the error. [#1175][] (@joverlee521)

### Bug fixes

* translate: Fix error handling when features cannot be read from reference sequence file. [#1168][] (@victorlin)
Expand All @@ -13,6 +17,7 @@
[#1160]: https://github.com/nextstrain/augur/pull/1160
[#1168]: https://github.com/nextstrain/augur/pull/1168
[#1169]: https://github.com/nextstrain/augur/pull/1169
[#1175]: https://github.com/nextstrain/augur/pull/1175

## 21.0.1 (17 February 2023)

Expand Down
9 changes: 8 additions & 1 deletion augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Filter and subsample a sequence set.
"""
from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from augur.types import EmptyOutputReportingMethod


# Use sorted() for reproducible output
Expand Down Expand Up @@ -63,11 +64,17 @@ def register_arguments(parser):
Since priorities represent relative values between strains, these values can be arbitrary.""")
subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")

output_group = parser.add_argument_group("outputs", "possible representations of filtered data (at least one required)")
output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
output_group.add_argument(
'--empty-output-reporting',
type=EmptyOutputReportingMethod.argtype,
choices=list(EmptyOutputReportingMethod),
default=EmptyOutputReportingMethod.ERROR,
help="How should empty outputs be reported when no strains pass filtering and/or subsampling.")

parser.set_defaults(probabilistic_sampling=True)

Expand Down
11 changes: 10 additions & 1 deletion augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from augur.io.sequences import read_sequences, write_sequences
from augur.io.print import print_err
from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf
from augur.types import EmptyOutputReportingMethod
from .io import cleanup_outputs, read_priority_scores
from .include_exclude_rules import apply_filters, construct_filters
from .subsample import PriorityQueue, TooManyGroupsError, calculate_sequences_per_group, create_queues_by_group, get_groups_for_subsampling
Expand Down Expand Up @@ -483,6 +484,14 @@ def run(args):
print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt))

if total_strains_passed == 0:
raise AugurError("All samples have been dropped! Check filter rules and metadata file format.")
empty_results_message = "All samples have been dropped! Check filter rules and metadata file format."
if args.empty_output_reporting is EmptyOutputReportingMethod.ERROR:
raise AugurError(empty_results_message)
elif args.empty_output_reporting is EmptyOutputReportingMethod.WARN:
print_err(f"WARNING: {empty_results_message}")
elif args.empty_output_reporting is EmptyOutputReportingMethod.SILENT:
pass
else:
raise ValueError(f"Encountered unhandled --empty-output-reporting method {args.empty_output_reporting!r}")

print(f"{total_strains_passed} strains passed all filters")
61 changes: 49 additions & 12 deletions augur/types.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
from argparse import ArgumentTypeError
import enum


class ArgparseEnum(enum.Enum):
    """
    Base class for enums whose members wrap the string values accepted by
    an argparse option.

    :py:class:`enum.StrEnum` can take over this role once Augur's minimum
    supported Python version is 3.11.
    """

    def __str__(self) -> str:
        """
        Return the member's :py:attr:`.value` instead of the default enum
        string form.

        This lets the enum's constructor and members be passed to argparse's
        ``type`` and ``choices`` parameters, respectively, without exposing
        the enum class name to users.
        """
        member_value = self.value
        return member_value

    @classmethod
    def argtype(cls, input_string):
        """
        Convert *input_string* to the matching enum member; meant to be
        passed as the ``type`` converter of an argparse option whose inputs
        are the enum values.

        Raises a custom `argparse.ArgumentTypeError` when the input matches
        no member, so that the error message can list all the valid enum
        values.
        """
        try:
            member = cls(input_string)
        except ValueError as error:
            # Quote each valid value the same way the rejected input is quoted.
            quoted_values = [repr(str(choice)) for choice in cls]
            valid_values = ", ".join(quoted_values)
            message = f"invalid choice: {input_string!r} (choose from {valid_values})"
            raise ArgumentTypeError(message) from error
        return member


@enum.unique
class DataErrorMethod(enum.Enum):
class DataErrorMethod(ArgparseEnum):
"""
Enum representation of string values that represent how a data error should
be handled.
Expand All @@ -14,21 +50,22 @@ class DataErrorMethod(enum.Enum):


@enum.unique
class ValidationMode(enum.Enum):
class EmptyOutputReportingMethod(ArgparseEnum):
"""
Enum representation of string values that represent how empty outputs should
be reported.
"""
ERROR = 'error'
WARN = 'warn'
SILENT = 'silent'


@enum.unique
class ValidationMode(ArgparseEnum):
"""
Enum representation of string values that represent how validation should
be handled.
"""
ERROR = 'error'
WARN = 'warn'
SKIP = 'skip'

def __str__(self) -> str:
"""
Stringify to the enum member's :py:attr:`.value` instead of the default.
This let us use the enum's constructor and members with argparse's
``type`` and ``choices`` parameters, respectively, without exposing the
enum class name to users.
"""
return self.value
46 changes: 46 additions & 0 deletions tests/functional/filter/cram/filter-empty-output-reporting.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Setup

$ source "$TESTDIR"/_setup.sh

Filter using `--exclude-all` to easily get an empty result.

Test the default behavior for empty results is an error.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-all \
> --output-strains filtered_strains.txt > /dev/null
ERROR: All samples have been dropped! Check filter rules and metadata file format.
[2]
$ wc -l filtered_strains.txt
\s*0 .* (re)

Repeat with the --empty-output-reporting=warn option.
This should output a warning message but no error.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --exclude-all \
> --output-strains filtered_strains.txt \
> --empty-output-reporting warn > /dev/null
WARNING: All samples have been dropped! Check filter rules and metadata file format.
$ wc -l filtered_strains.txt
\s*0 .* (re)

Ignore empty results with the --empty-output-reporting=silent option.
Make sure all 3 output types are empty, except the metadata output should still include the header.
This should not output any messages to stderr.

$ ${AUGUR} filter \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --sequences "$TESTDIR/../data/sequences.fasta" \
> --exclude-all \
> --output-sequences filtered_seqs.fasta \
> --output-metadata filtered_metadata.tsv \
> --output-strains filtered_strains.txt \
> --empty-output-reporting silent > /dev/null
$ wc -l filtered_seqs.fasta
\s*0 .* (re)
$ diff <(head -n 1 filtered_metadata.tsv) <(head -n 1 "$TESTDIR/../data/metadata.tsv")
$ wc -l filtered_strains.txt
\s*0 .* (re)
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ The query initially filters 3 strains from Colombia, one of which is added back
\\t1 strains were added back because they were in .*include\.txt.* (re)
9 strains passed all filters

$ (head -n1; sort -k 1,1) < filtered_log.tsv
$ head -n 1 filtered_log.tsv; tail -n +2 filtered_log.tsv | sort -k 1,1
strain filter kwargs
COL/FLR_00008/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]"
COL/FLR_00008/2015\tforce_include_strains\t"[[""include_file"", ""*/data/include.txt""]]" (esc) (glob)
Expand Down

0 comments on commit 8d24a79

Please sign in to comment.