Skip to content

Commit

Permalink
apply suggestions from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Feb 7, 2023
1 parent 8902211 commit dcddf9b
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 21 deletions.
4 changes: 2 additions & 2 deletions src/sourmash/tax/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def metagenome(args):
rank=args.rank, by_query=True)

with FileOutputCSV(lineage_outfile) as out_fp:
tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, format_lineage=True, sep='\t')
tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep='\t')

# write summarized --> krona output tsv
if "krona" in args.output_format:
Expand All @@ -148,7 +148,7 @@ def metagenome(args):
tax_utils.write_human_summary(query_gather_results, out_fp, args.rank or "species")

# write summarized output csv
single_query_results = query_gather_results[0]#.summarized_lineage_results
single_query_results = query_gather_results[0]
if "csv_summary" in args.output_format:
summary_outfile, limit_float = make_outfile(args.output_base, "csv_summary", output_dir=args.output_dir)
with FileOutputCSV(summary_outfile) as out_fp:
Expand Down
13 changes: 3 additions & 10 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,12 +505,7 @@ def report_missing_and_skipped_identities(gather_results):

if ident_missed:
notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed.')
#notify(f'of {total_taxresults} gather results, missed lineage assignments for {total_n_missed)} results.')
notify(f'The following are missing from the taxonomy information: {", ".join(ident_missed)}')
# skipping not actually enabled? do we want to enable?
# if ident_skipped:
# notify(f'The following idents were skipped during taxonomic assignment, as requested: {", ".join(ident_skipped)}')
# notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_skipped} results were skipped.')


def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False):
Expand Down Expand Up @@ -585,7 +580,6 @@ def format_for_krona(query_gather_results, rank, *, classification=False):
unclassified_fraction = fraction
continue
else:
#lin_list = lin.display_lineage().split(';')
lin_list = lin.split(';')
krona_results.append((fraction, *lin_list))

Expand Down Expand Up @@ -658,7 +652,7 @@ def write_human_summary(query_gather_results, out_fp, display_rank, classificati
out_fp.write("{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD))


def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'):
def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'):
'''
takes in a lineage dictionary with sample counts (output of aggregate_by_lineage_at_rank)
and produces a tab-separated file with fractions for each sample.
Expand Down Expand Up @@ -1313,7 +1307,6 @@ class TaxResult:
Uses RankLineageInfo to store lineage information; this may need to be modified in the future.
"""
raw: GatherRow
# can we get rid of these / just choose default ident hacking/slashing for future?
keep_full_identifiers: bool = False
keep_identifier_versions: bool = False

Expand Down Expand Up @@ -1446,7 +1439,7 @@ def as_kreport_dict(self, query_info):
sD = {}
sD['num_bp_assigned'] = str(0)
# total percent containment, weighted to include abundance info
sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' #f'{proportion:.2f}'
sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp))
if self.lineage != RankLineageInfo():
this_rank = self.lineage.lowest_rank
Expand Down Expand Up @@ -1492,8 +1485,8 @@ def set_status(self, query_info, containment_threshold=None, ani_threshold=None)
if ani_threshold is not None: # if provided, just use ani thresh, don't use containment threshold
if self.query_ani_at_rank >= ani_threshold:
self.status = 'match'
# should we switch to using weighted here? I think yes, but this would be behavior change
elif containment_threshold is not None and self.fraction >= containment_threshold:
# FUTURE: switch to using weighted here
#elif containment_threshold is not None and self.f_weighted_at_rank >= containment_threshold:
self.status = 'match'

Expand Down
6 changes: 0 additions & 6 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -1420,7 +1420,6 @@ def test_genome_gather_from_file_below_threshold(runtmp):

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage" in c.last_result.out
# assert "test1,below_threshold,,0.000," in c.last_result.out
assert "test1,below_threshold,superkingdom,0.204," in c.last_result.out


Expand Down Expand Up @@ -1685,7 +1684,6 @@ def test_genome_missing_taxonomy_fail_threshold(runtmp):
print(c.last_result.out)
print(c.last_result.err)

# assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
assert "ident 'GCF_001881345' is not in the taxonomy database." in str(exc.value)
assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc.value)
assert c.last_result.status == -1
Expand All @@ -1712,7 +1710,6 @@ def test_genome_missing_taxonomy_fail_rank(runtmp):
print(c.last_result.out)
print(c.last_result.err)

# assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
assert "ident 'GCF_001881345' is not in the taxonomy database." in str(exc.value)
assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc.value)
assert c.last_result.status == -1
Expand Down Expand Up @@ -1932,7 +1929,6 @@ def test_genome_over100percent_error(runtmp):

assert runtmp.last_result.status == -1
assert "fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err
# assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err


def test_genome_ani_threshold_input_errors(runtmp):
Expand Down Expand Up @@ -2005,7 +2001,6 @@ def test_genome_ani_threshold(runtmp):
print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)
# assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err
assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92" in c.last_result.out


Expand Down Expand Up @@ -2050,7 +2045,6 @@ def test_genome_ani_lemonade_classify(runtmp):
'--ani', '0.8', '-F', 'human')

output = c.last_result.out
# assert 'MAG3_1 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output

# aaand classify to lineage_csv
Expand Down
5 changes: 2 additions & 3 deletions tests/test_tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,12 +786,11 @@ def test_write_lineage_sample_frac(runtmp):
def test_write_lineage_sample_frac_format_lineage(runtmp):
outfrac = runtmp.output('outfrac.csv')
sample_names = ['sample1', 'sample2']
# sk_lineage = lca_utils.make_lineage('a')
sk_lineage='a'
print(sk_lineage)
sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}}
with open(outfrac, 'w') as out_fp:
write_lineage_sample_frac(sample_names, sk_linD, out_fp, format_lineage=True)
write_lineage_sample_frac(sample_names, sk_linD, out_fp)

frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
print("csv_lines: ", frac_lines)
Expand All @@ -803,7 +802,7 @@ def test_write_lineage_sample_frac_format_lineage(runtmp):
print(phy2_lineage)
phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}}
with open(outfrac, 'w') as out_fp:
write_lineage_sample_frac(sample_names, phy_linD, out_fp, format_lineage=True)
write_lineage_sample_frac(sample_names, phy_linD, out_fp)

frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
print("csv_lines: ", frac_lines)
Expand Down

0 comments on commit dcddf9b

Please sign in to comment.