apply suggestions from code review

sourmash-bio · Feb 7, 2023 · dcddf9b · dcddf9b
1 parent 8902211
commit dcddf9b
Show file tree

Hide file tree

Showing 4 changed files with 7 additions and 21 deletions.
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -131,7 +131,7 @@ def metagenome(args):
                                                                       rank=args.rank, by_query=True)
 
         with FileOutputCSV(lineage_outfile) as out_fp:
-            tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, format_lineage=True, sep='\t')
+            tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep='\t')
 
     # write summarized --> krona output tsv
     if "krona" in args.output_format:
@@ -148,7 +148,7 @@ def metagenome(args):
             tax_utils.write_human_summary(query_gather_results, out_fp, args.rank or "species")
 
     # write summarized output csv
-    single_query_results = query_gather_results[0]#.summarized_lineage_results
+    single_query_results = query_gather_results[0]
     if "csv_summary" in args.output_format:
         summary_outfile, limit_float = make_outfile(args.output_base, "csv_summary", output_dir=args.output_dir)
         with FileOutputCSV(summary_outfile) as out_fp:

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -505,12 +505,7 @@ def report_missing_and_skipped_identities(gather_results):
 
     if ident_missed:
         notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed.')
-        #notify(f'of {total_taxresults} gather results, missed lineage assignments for {total_n_missed)} results.')
         notify(f'The following are missing from the taxonomy information: {", ".join(ident_missed)}')
-    # skipping not actually enabled? do we want to enable?
-    # if ident_skipped:
-    #     notify(f'The following idents were skipped during taxonomic assignment, as requested: {", ".join(ident_skipped)}')
-    #     notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_skipped} results were skipped.')
 
 
 def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False):
@@ -585,7 +580,6 @@ def format_for_krona(query_gather_results, rank, *, classification=False):
                 unclassified_fraction = fraction
                 continue
             else:
-                #lin_list = lin.display_lineage().split(';')
                 lin_list = lin.split(';')
                 krona_results.append((fraction, *lin_list))
 
@@ -658,7 +652,7 @@ def write_human_summary(query_gather_results, out_fp, display_rank, classificati
                 out_fp.write("{query_name:<15s}   {f_weighted_at_rank}     {query_ani_at_rank}  {lineage}\n".format(**rD))
 
 
-def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'):
+def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'):
     '''
     takes in a lineage dictionary with sample counts (output of aggregate_by_lineage_at_rank)
     and produces a tab-separated file with fractions for each sample.
@@ -1313,7 +1307,6 @@ class TaxResult:
     Uses RankLineageInfo to store lineage information; this may need to be modified in the future.            
     """
     raw: GatherRow
-    # can we get rid of these / just choose default ident hacking/slashing for future?
     keep_full_identifiers: bool = False
     keep_identifier_versions: bool = False
 
@@ -1446,7 +1439,7 @@ def as_kreport_dict(self, query_info):
         sD = {}
         sD['num_bp_assigned'] = str(0)
         # total percent containment, weighted to include abundance info
-        sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'  #f'{proportion:.2f}'
+        sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
         sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp))
         if self.lineage != RankLineageInfo():
             this_rank = self.lineage.lowest_rank
@@ -1492,8 +1485,8 @@ def set_status(self, query_info, containment_threshold=None, ani_threshold=None)
         if ani_threshold is not None:  # if provided, just use ani thresh, don't use containment threshold
             if self.query_ani_at_rank >= ani_threshold:
                 self.status = 'match'
-        # should we switch to using weighted here? I think yes, but this would be behavior change
         elif containment_threshold is not None and self.fraction >= containment_threshold:
+        # FUTURE: switch to using weighted here
         #elif containment_threshold is not None and self.f_weighted_at_rank >= containment_threshold:
             self.status = 'match'
 

diff --git a/tests/test_tax.py b/tests/test_tax.py
@@ -1420,7 +1420,6 @@ def test_genome_gather_from_file_below_threshold(runtmp):
 
     assert c.last_result.status == 0
     assert "query_name,status,rank,fraction,lineage" in c.last_result.out
-    # assert "test1,below_threshold,,0.000," in c.last_result.out
     assert "test1,below_threshold,superkingdom,0.204," in c.last_result.out
 
 
@@ -1685,7 +1684,6 @@ def test_genome_missing_taxonomy_fail_threshold(runtmp):
     print(c.last_result.out)
     print(c.last_result.err)
 
-    # assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
     assert "ident 'GCF_001881345' is not in the taxonomy database." in str(exc.value)
     assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc.value)
     assert c.last_result.status == -1
@@ -1712,7 +1710,6 @@ def test_genome_missing_taxonomy_fail_rank(runtmp):
     print(c.last_result.out)
     print(c.last_result.err)
 
-    # assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
     assert "ident 'GCF_001881345' is not in the taxonomy database." in str(exc.value)
     assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc.value)
     assert c.last_result.status == -1
@@ -1932,7 +1929,6 @@ def test_genome_over100percent_error(runtmp):
 
     assert runtmp.last_result.status == -1
     assert "fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err
-    # assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err
 
 
 def test_genome_ani_threshold_input_errors(runtmp):
@@ -2005,7 +2001,6 @@ def test_genome_ani_threshold(runtmp):
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
-    # assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err
     assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92" in c.last_result.out
 
 
@@ -2050,7 +2045,6 @@ def test_genome_ani_lemonade_classify(runtmp):
                    '--ani', '0.8', '-F', 'human')
 
     output = c.last_result.out
-    # assert 'MAG3_1            5.3%     91.0%  d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
     assert 'MAG3_1            match     5.3%     91.0%  d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
 
     # aaand classify to lineage_csv

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
@@ -786,12 +786,11 @@ def test_write_lineage_sample_frac(runtmp):
 def test_write_lineage_sample_frac_format_lineage(runtmp):
     outfrac = runtmp.output('outfrac.csv')
     sample_names = ['sample1', 'sample2']
-    # sk_lineage = lca_utils.make_lineage('a')
     sk_lineage='a'
     print(sk_lineage)
     sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}}
     with open(outfrac, 'w') as out_fp:
-        write_lineage_sample_frac(sample_names, sk_linD, out_fp, format_lineage=True)
+        write_lineage_sample_frac(sample_names, sk_linD, out_fp)
 
     frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
     print("csv_lines: ", frac_lines)
@@ -803,7 +802,7 @@ def test_write_lineage_sample_frac_format_lineage(runtmp):
     print(phy2_lineage)
     phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}}
     with open(outfrac, 'w') as out_fp:
-        write_lineage_sample_frac(sample_names, phy_linD, out_fp, format_lineage=True)
+        write_lineage_sample_frac(sample_names, phy_linD, out_fp)
 
     frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
     print("csv_lines: ", frac_lines)