diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 1e042a8b3c..90f6aa0000 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -11,10 +11,10 @@ import sourmash from ..sourmash_args import FileOutputCSV, FileOutput from sourmash.logging import set_quiet, error, notify, print_results -from sourmash.lca.lca_utils import display_lineage, zip_lineage +from sourmash.lca.lca_utils import zip_lineage from . import tax_utils -from .tax_utils import ClassInf, MultiLineageDB, GatherRow +from .tax_utils import MultiLineageDB, GatherRow usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 05c400b40a..295449afed 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -18,31 +18,21 @@ __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', - 'load_gather_results_old', 'check_and_load_gather_csvs_old', - 'find_match_lineage', 'summarize_gather_at', - 'find_missing_identities', 'make_krona_header', - 'aggregate_by_lineage_at_rank_old', 'format_for_krona_old', - 'write_krona_old', 'write_summary_old', 'write_classifications', + 'load_gather_results', 'check_and_load_gather_csvs' + 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' + 'format_for_krona', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', 'MultiLineageDB', 'RankLineageInfo'] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file -# CTB: these could probably usefully be converted into dataclasses. -QInfo = namedtuple("QInfo", "query_md5, query_filename, query_bp, query_hashes, total_weighted_hashes") -SumGathInf = namedtuple("SumGathInf", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank, total_weighted_hashes") -ClassInf = namedtuple("ClassInf", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") - -# Essential Gather column names that must be in gather_csv to allow `tax` summarization -EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') - RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (taxlist, display_lineage, pop_to_rank) +from sourmash.lca.lca_utils import (taxlist) class LineagePair(NamedTuple): rank: str @@ -494,154 +484,6 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon return query_results_list -def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], - keep_full_identifiers=False, - keep_identifier_versions=False): - lineage="" - match_ident = get_ident(match_ident, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions) - # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match - if match_ident in skip_idents: - return lineage - try: - lineage = tax_assign[match_ident] - except KeyError: - raise ValueError(f"ident {match_ident} is not in the taxonomy database.") - return lineage - - -def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], - 
keep_full_identifiers=False, - keep_identifier_versions=False, best_only=False, - seen_perfect=set(), - estimate_query_ani=False): - """ - Summarize gather results at specified taxonomic rank - """ - # init dictionaries - sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) - # store together w/ ^ instead? - sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) - sum_uniq_bp = defaultdict(lambda: defaultdict(float)) - query_info = {} - - set_ksize = False - ksize, scaled, query_nhashes = None, 0, None - - for row in gather_results: - # get essential gather info - if not set_ksize and "ksize" in row.keys(): - set_ksize = True - ksize = int(row['ksize']) - scaled = int(row['scaled']) - - query_name = row['query_name'] - f_unique_to_query = float(row['f_unique_to_query']) - f_uniq_weighted = float(row['f_unique_weighted']) - unique_intersect_bp = int(row['unique_intersect_bp']) - total_weighted_hashes = int(row.get('total_weighted_hashes', 0)) - query_md5 = row['query_md5'] - query_filename = row['query_filename'] - # get query_bp - if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? - if "query_nhashes" in row.keys(): - query_nhashes = int(row["query_nhashes"]) - if "query_bp" in row.keys(): - query_bp = int(row["query_bp"]) - else: - query_bp = unique_intersect_bp + int(row['remaining_bp']) - - # store query info - query_info[query_name] = QInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes=query_nhashes, total_weighted_hashes=total_weighted_hashes) - - if estimate_query_ani and (not ksize or not scaled): - if not set_ksize: - estimate_query_ani=False - notify("WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI...") - - match_ident = row['name'] - - # 100% match? are we looking at something in the database? - if f_unique_to_query >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank - ident = get_ident(match_ident, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) - seen_perfect.add(query_name) - notify(f'WARNING: 100% match! Is query "{query_name}" identical to its database match, {ident}?') - - # get lineage for match - lineage = find_match_lineage(match_ident, tax_assign, - skip_idents=skip_idents, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) - # ident was in skip_idents - if not lineage: - continue - - # summarize at rank! - lineage = pop_to_rank(lineage, rank) - assert lineage[-1].rank == rank, lineage[-1] - # record info - sum_uniq_to_query[query_name][lineage] += f_unique_to_query - sum_uniq_weighted[query_name][lineage] += f_uniq_weighted - sum_uniq_bp[query_name][lineage] += unique_intersect_bp - - # sort and store each as SumGathInf - sum_uniq_to_query_sorted = [] - for query_name, lineage_weights in sum_uniq_to_query.items(): - qInfo = query_info[query_name] - sumgather_items = list(lineage_weights.items()) - sumgather_items.sort(key = lambda x: -x[1]) - query_ani = None - if best_only: - lineage, fraction = sumgather_items[0] - if fraction > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") - elif fraction == 0: - continue - f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] - bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] - if estimate_query_ani: - query_ani = containment_to_distance(fraction, ksize, scaled, - n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani - sres = SumGathInf(query_name, rank, fraction, lineage, qInfo.query_md5, - qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled) - sum_uniq_to_query_sorted.append(sres) - else: - total_f_weighted= 0.0 - total_f_classified = 0.0 - total_bp_classified = 0 - for lineage, fraction in sumgather_items: - query_ani = None - if fraction > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif fraction == 0: - continue - total_f_classified += fraction - f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] - total_f_weighted += f_weighted_at_rank - bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage]) - total_bp_classified += bp_intersect_at_rank - if estimate_query_ani: - query_ani = containment_to_distance(fraction, ksize, scaled, - n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani - sres = SumGathInf(query_name, rank, fraction, lineage, query_md5, - query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled) - sum_uniq_to_query_sorted.append(sres) - - # record unclassified - lineage = () - query_ani = None - fraction = 1.0 - total_f_classified - if fraction > 0: - f_weighted_at_rank = 1.0 - total_f_weighted - bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SumGathInf(query_name, rank, fraction, lineage, query_md5, - query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes*scaled) - sum_uniq_to_query_sorted.append(sres) - - return sum_uniq_to_query_sorted, seen_perfect, estimate_query_ani - - def report_missing_and_skipped_identities(gather_results): """ Report match ids/accessions from gather results @@ -671,54 +513,6 @@ def report_missing_and_skipped_identities(gather_results): # notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_skipped} results were skipped.') -def find_missing_identities_old(gather_results, tax_assign): - """ - Identify match ids/accessions from gather results - that are not present in taxonomic assignments. 
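Both the removed `find_missing_identities_old()` and the retained reporting path normalize match names with `get_ident()` before the taxonomy lookup. A small sketch of the expected default behavior; the accession string is illustrative:

```python
from sourmash.tax.tax_utils import get_ident

# default: take the first whitespace-separated token, then strip the version suffix
get_ident("GCF_001881345.1 Escherichia coli")   # -> 'GCF_001881345'

# keep the accession version
get_ident("GCF_001881345.1 Escherichia coli", keep_identifier_versions=True)   # -> 'GCF_001881345.1'

# keep the full name untouched
get_ident("GCF_001881345.1 Escherichia coli", keep_full_identifiers=True)   # -> 'GCF_001881345.1 Escherichia coli'
```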
- """ - ident_missed= set() - for row in gather_results: - match_ident = row['name'] - match_ident = get_ident(match_ident) - if match_ident not in tax_assign: - ident_missed.add(match_ident) - - if ident_missed: - notify(f'of {len(gather_results)} gather results, missed {len(ident_missed)} lineage assignments.') - return ident_missed - - -# pass ranks; have ranks=[default_ranks] -def make_krona_header(min_rank, *, include_strain=False): - "make header for krona output" - header = ["fraction"] - tl = list(taxlist(include_strain=include_strain)) - try: - rank_index = tl.index(min_rank) - except ValueError: - raise ValueError(f"Rank {min_rank} not present in available ranks!") - return tuple(header + tl[:rank_index+1]) - - -def aggregate_by_lineage_at_rank_old(rank_results, *, by_query=False): - ''' - Aggregate list of rank SumGathInfs, - keeping query info or aggregating across queries. - ''' - lineage_summary = defaultdict(float) - if by_query: - lineage_summary = defaultdict(dict) - all_queries = [] - for res in rank_results: - if res.query_name not in all_queries: - all_queries.append(res.query_name) - if by_query: - lineage_summary[res.lineage][res.query_name] = res.fraction - else: - lineage_summary[res.lineage] += res.fraction - return lineage_summary, all_queries, len(all_queries) - - def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): ''' Aggregate list of summarized_lineage_results at rank, keeping @@ -753,43 +547,6 @@ def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): return lineage_summary, all_queries -def format_for_krona_old(rank, summarized_gather): - ''' - Aggregate list of SumGathInfs and format for krona output - ''' - num_queries=0 - for res_rank, rank_results in summarized_gather.items(): - if res_rank == rank: - lineage_summary, all_queries, num_queries = aggregate_by_lineage_at_rank_old(rank_results, by_query=False) - # if aggregating across queries divide fraction by the total number of queries - for lin, fraction in lineage_summary.items(): - # divide total fraction by total number of queries - lineage_summary[lin] = fraction/num_queries - - # sort by fraction - lin_items = list(lineage_summary.items()) - lin_items.sort(key = lambda x: -x[1]) - - # reformat lineage for krona_results printing - krona_results = [] - unclassified_fraction = 0 - for lin, fraction in lin_items: - # save unclassified fraction for the end - if lin == (): - unclassified_fraction = fraction - continue - lin_list = display_lineage(lin).split(';') - krona_results.append((fraction, *lin_list)) - - # handle unclassified - if unclassified_fraction: - len_unclassified_lin = len(krona_results[-1]) -1 - unclassifed_lin = ["unclassified"]*len_unclassified_lin - krona_results.append((unclassified_fraction, *unclassifed_lin)) - - return krona_results - - def format_for_krona(query_gather_results, rank, *, classification=False): ''' Aggregate and format for krona output. Single query recommended, but we don't want query headers. @@ -841,17 +598,6 @@ def format_for_krona(query_gather_results, rank, *, classification=False): return krona_results, header -def write_krona_old(rank, krona_results, out_fp, *, sep='\t'): - 'write krona output' - # CTB: do we want to optionally allow restriction to a specific rank - # & above? 
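The replacement `aggregate_by_lineage_at_rank()` consumes per-query `QueryTaxResult` objects plus a rank, rather than the pre-summarized rows the old version took. A hedged usage sketch; `query_results` is assumed to be a list of already-summarized `QueryTaxResult` objects (see the end-to-end sketch at the end of this diff):

```python
from sourmash.tax.tax_utils import aggregate_by_lineage_at_rank

# query_results: summarized QueryTaxResult objects, e.g. from check_and_load_gather_csvs()
lineage_summary, all_queries = aggregate_by_lineage_at_rank(query_results, "phylum",
                                                            by_query=True)
# with by_query=True, fractions are kept per query, roughly:
#   {lineage: {'queryA': 0.5, 'queryB': 0.7}, ...};  all_queries == ['queryA', 'queryB']
```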
- header = make_krona_header(rank) - tsv_output = csv.writer(out_fp, delimiter='\t') - tsv_output.writerow(header) - for res in krona_results: - tsv_output.writerow(res) - - def write_krona(header, krona_results, out_fp, *, sep='\t'): 'write krona output' # CTB: do we want to optionally allow restriction to a specific rank @@ -864,6 +610,7 @@ def write_krona(header, krona_results, out_fp, *, sep='\t'): for res in krona_results: tsv_output.writerow(res) + def write_output(header, results, out_fp, *, sep=',', write_header=True): """ write pre-generated results list of rows, with each @@ -876,25 +623,6 @@ def write_output(header, results, out_fp, *, sep=',', write_header=True): output.writerow(res) -def write_summary_old(summarized_gather, csv_fp, *, sep=',', limit_float_decimals=False): - ''' - Write taxonomy-summarized gather results for each rank. - ''' - header = SumGathInf._fields - w = csv.DictWriter(csv_fp, header, delimiter=sep) - w.writeheader() - for rank, rank_results in summarized_gather.items(): - for res in rank_results: - rD = res._asdict() - if limit_float_decimals: - rD['fraction'] = f'{res.fraction:.3f}' - rD['f_weighted_at_rank'] = f'{res.f_weighted_at_rank:.3f}' - rD['lineage'] = display_lineage(res.lineage) - if rD['lineage'] == "": - rD['lineage'] = "unclassified" - w.writerow(rD) - - def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals=False, classification=False): ''' Write taxonomy-summarized gather results for each rank. @@ -909,120 +637,6 @@ def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals w.writerow(res) -def write_kreport_old(summarized_gather, csv_fp, *, sep='\t'): - ''' - Write taxonomy-summarized gather results as kraken-style kreport. - - While this format typically records the percent of number of reads assigned to taxa, - we can create comparable output by reporting the percent of k-mers (percent containment) - and the total number of k-mers matched. - - standard reads-based `kreport` columns: - - `Percent Reads Contained in Taxon`: The cumulative percentage of reads for this taxon and all descendants. - - `Number of Reads Contained in Taxon`: The cumulative number of reads for this taxon and all descendants. - - `Number of Reads Assigned to Taxon`: The number of reads assigned directly to this taxon (not a cumulative count of all descendants). - - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. - - `NCBI Taxon ID`: Numerical ID from the NCBI taxonomy database. - - `Scientific Name`: The scientific name of the taxon. - - Example reads-based `kreport` with all columns: - ``` - 88.41 2138742 193618 K 2 Bacteria - 0.16 3852 818 P 201174 Actinobacteria - 0.13 3034 0 C 1760 Actinomycetia - 0.13 3034 45 O 85009 Propionibacteriales - 0.12 2989 1847 F 31957 Propionibacteriaceae - 0.05 1142 352 G 1912216 Cutibacterium - 0.03 790 790 S 1747 Cutibacterium acnes - ``` - - sourmash `kreport` caveats: - - `Percent k-mers Contained in Taxon`: weighted by k-mer abundance - - `Estimated bp Contained in Taxon`: NOT WEIGHTED BY ABUNDANCE - - `Number of Reads Assigned to Taxon` and `NCBI Taxon ID` will not be reported (blank entries). - - In the future, we may wish to report the NCBI taxid when we can (NCBI taxonomy only). - ''' - columns = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"] - w = csv.DictWriter(csv_fp, columns, delimiter=sep) - - # check - are we using v4.5.0 or later gather CSVs? 
- for rank, rank_results in summarized_gather.items(): - for res in rank_results: - if res.total_weighted_hashes == 0: - raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0") - - unclassified_written=False - for rank, rank_results in summarized_gather.items(): - rcode = RANKCODE[rank] - for res in rank_results: - # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. - # Here, we only need to report it once, since it will be the same fraction for all ranks - if not res.lineage: - rank_sciname = "unclassified" - rcode = "U" - # if we've already written the unclassified portion, skip and continue to next loop iteration - if unclassified_written: - continue - else: - unclassified_written=True - else: - rank_sciname = res.lineage[-1].name - kresD = {"rank_code": rcode, "ncbi_taxid": "", "sci_name": rank_sciname, "num_bp_assigned": 0} - # total percent containment, weighted to include abundance info - proportion = res.f_weighted_at_rank * 100 - kresD['percent_containment'] = f'{proportion:.2f}' - # weighted bp - kresD["num_bp_contained"] = int(res.f_weighted_at_rank * res.total_weighted_hashes) - if rank == 'species' or rank_sciname == "unclassified": - kresD["num_bp_assigned"] = kresD["num_bp_contained"] - w.writerow(kresD) - - -def write_human_summary_old(summarized_gather, out_fp, display_rank): - ''' - Write human-readable taxonomy-summarized gather results for a specific rank. - ''' - header = SumGathInf._fields - - found_ANI = False - results = [] - for rank, rank_results in summarized_gather.items(): - # only show results for a specified rank. - if rank == display_rank: - rank_results = list(rank_results) - rank_results.sort(key=lambda res: -res.f_weighted_at_rank) - - for res in rank_results: - rD = res._asdict() - rD['fraction'] = f'{res.fraction:.3f}' - rD['f_weighted_at_rank'] = f"{res.f_weighted_at_rank*100:>4.1f}%" - if rD['query_ani_at_rank'] is not None: - found_ANI = True - rD['query_ani_at_rank'] = f"{res.query_ani_at_rank*100:>3.1f}%" - else: - rD['query_ani_at_rank'] = '- ' - rD['lineage'] = display_lineage(res.lineage) - if rD['lineage'] == "": - rD['lineage'] = "unclassified" - - results.append(rD) - - - if found_ANI: - out_fp.write("sample name proportion cANI lineage\n") - out_fp.write("----------- ---------- ---- -------\n") - - for rD in results: - out_fp.write("{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) - else: - out_fp.write("sample name proportion lineage\n") - out_fp.write("----------- ---------- -------\n") - - for rD in results: - out_fp.write("{query_name:<15s} {f_weighted_at_rank} {lineage}\n".format(**rD)) - - def write_human_summary(query_gather_results, out_fp, display_rank, classification=False): ''' Write human-readable taxonomy-summarized gather results for a specific rank. @@ -1044,86 +658,6 @@ def write_human_summary(query_gather_results, out_fp, display_rank, classificati out_fp.write("{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) -def write_lineage_csv(summarized_gather, csv_fp): - ''' - Write a lineage-CSV format file suitable for use with sourmash tax ... -t. 
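`write_human_summary()` keeps the same human-readable table as the removed `write_human_summary_old()`, but takes the `QueryTaxResult` list and a display rank directly. A minimal sketch, again assuming `query_results` holds summarized `QueryTaxResult` objects:

```python
import sys
from sourmash.tax.tax_utils import write_human_summary

# query_results: summarized QueryTaxResult objects (see the end-to-end sketch below)
write_human_summary(query_results, sys.stdout, "phylum")
```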
- ''' - ranks = lca_utils.taxlist(include_strain=False) - header = ['ident', *ranks] - w = csv.DictWriter(csv_fp, header) - w.writeheader() - for rank, rank_results in summarized_gather.items(): - for res in rank_results: - d = {} - d[rank] = "" - for rank, name in res.lineage: - d[rank] = name - - d['ident'] = res.query_name - w.writerow(d) - - -def write_classifications(classifications, csv_fp, *, sep=',', limit_float_decimals=False): - ''' - Write taxonomy-classifed gather results. - ''' - header = ClassInf._fields - w = csv.DictWriter(csv_fp, header, delimiter=sep) - w.writeheader() - for rank, rank_results in classifications.items(): - for res in rank_results: - rD = res._asdict() - if limit_float_decimals: - rD['fraction'] = f'{res.fraction:.3f}' - rD['f_weighted_at_rank'] = f'{res.f_weighted_at_rank:.3f}' - rD['lineage'] = display_lineage(res.lineage) - # needed? - if rD['lineage'] == "": - rD['lineage'] = "unclassified" - w.writerow(rD) - - -def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): - ''' - Takes in one or more output csvs from `sourmash taxonomy summarize` - and combines the results into a nested dictionary with lineages - as the keys {lineage: {sample1: frac1, sample2: frac2}}. - Uses the file basename (minus .csv extension) as sample identifier. - - usage: - - linD, all_samples = combine_sumgather_by_lineage(["sample1.csv", "sample2.csv"], rank="genus") - - output: - - linD = {lin_a: {'sample1': 0.4, 'sample2': 0.17, 'sample3': 0.6} - lin_b: {'sample1': 0.0, 'sample2': 0.0, 'sample3': 0.1} - lin_c: {'sample1': 0.3, 'sample2': 0.4, 'sample3': 0.2} } - - all_samples = ['sample1','sample2','sample3'] - - ''' - if rank not in accept_ranks: - raise ValueError(f"Rank {rank} not available.") - - sgD = defaultdict(dict) - all_samples = [] - for g_csv in gather_csvs: - # collect lineage info for this sample - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp) - for row in r: - if row["rank"] == rank: - query_name = row["query_name"] - lin = row["lineage"] - frac = row["fraction"] - if query_name not in all_samples: - all_samples.append(query_name) - sgD[lin][query_name] = frac - fp.close() - return sgD, all_samples - - def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of aggregate_by_lineage_at_rank) @@ -1146,12 +680,7 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line w.writeheader() blank_row = {query_name: 0 for query_name in sample_names} unclassified_row = None - # print(lineage_dict.keys()) for lin, sampleinfo in sorted(lineage_dict.items()): - # if format_lineage: - # lin=lin.display_lineage(null_as_unclassified=True) - #lin = display_lineage(lin) - #add lineage and 0 placeholders row = {'lineage': lin} row.update(blank_row) @@ -1679,50 +1208,64 @@ def load(cls, locations, **kwargs): return tax_assign -# strategy from: https://subscription.packtpub.com/book/programming/9781800207455/10/ch10lvl1sec01/using-dataclasses-to-simplify-working-with-csv-files @dataclass -class GatherRow(): # all cols should match "gather_write_cols" in `search.py` - # essential columns - query_name: str - name: str # match_name - f_unique_weighted: float - f_unique_to_query: float - unique_intersect_bp: int - remaining_bp: int - query_md5: str - query_filename: str - # new essential cols: requires 4.4x - query_bp: int - ksize: int - scaled: int - - # 
non-essential - intersect_bp: int = None - f_orig_query: float = None - f_match: float = None - average_abund: float = None - median_abund: float = None - std_abund: float = None - filename: str = None - md5: str = None - f_match_orig: float = None - gather_result_rank: str = None - moltype: str = None - query_n_hashes: int = None - query_abundance: int = None - query_containment_ani: float = None - match_containment_ani: float = None - average_containment_ani: float = None - max_containment_ani: float = None - potential_false_negative: bool = None - n_unique_weighted_found: int = None - sum_weighted_found: int = None - total_weighted_hashes: int = None - - -@dataclass() -class QueryInfo(): - """Class for storing query information""" +class GatherRow: + """ + Class to facilitate safely reading in Gather CSVs. The fields here should be match those + in "gather_write_cols" in `search.py` + + To ensure all columns required for taxonomic summarization are present, this class + contains no defaults for these columns and thus will throw a TypeError if any of these + columns are missing in the passed gather input. All other fields have default None. + + Usage: + + with sourmash_args.FileInputCSV(gather_csv) as r: + for row in enumerate(r): + gatherRow = GatherRow(**row) + """ + + # essential columns + query_name: str + name: str # match_name + f_unique_weighted: float + f_unique_to_query: float + unique_intersect_bp: int + remaining_bp: int + query_md5: str + query_filename: str + # new essential cols: requires 4.4x + query_bp: int + ksize: int + scaled: int + + # non-essential + intersect_bp: int = None + f_orig_query: float = None + f_match: float = None + average_abund: float = None + median_abund: float = None + std_abund: float = None + filename: str = None + md5: str = None + f_match_orig: float = None + gather_result_rank: str = None + moltype: str = None + query_n_hashes: int = None + query_abundance: int = None + query_containment_ani: float = None + match_containment_ani: float = None + average_containment_ani: float = None + max_containment_ani: float = None + potential_false_negative: bool = None + n_unique_weighted_found: int = None + sum_weighted_found: int = None + total_weighted_hashes: int = None + + +@dataclass +class QueryInfo: + "Class for storing query information" query_name: str query_md5: str query_filename: str @@ -1745,7 +1288,30 @@ def total_weighted_bp(self): return self.total_weighted_hashes * self.scaled @dataclass -class TaxResult(): +class TaxResult: + """ + Class to store taxonomic result of a single row from a gather CSV, including accessible + query information (QueryInfo) and matched taxonomic lineage. TaxResult tracks whether + lineage matching has been attempted and whether the lineage matching failed + due to missing or skipped lineage identifiers. + + Initialize TaxResult using GatherRow, which ensures all required fields are present. + The QueryInfo in TaxResult is used to ensure only compatible gather results generated + from the same query are summarized during taxonomic summarization. + + Usage: + + with sourmash_args.FileInputCSV(gather_csv) as r: + for row in enumerate(r): + gatherRow = GatherRow(**row) + # initialize TaxResult + tax_res = TaxResult(raw=gatherRow) + + # get match lineage + tax_res.get_match_lineage(taxD=taxonomic_assignments) + + Uses RankLineageInfo to store lineage information; this may need to be modified in the future. + """ raw: GatherRow # can we get rid of these / just choose default ident hacking/slashing for future? 
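A compact, self-contained version of the per-row flow described in the `GatherRow` and `TaxResult` docstrings above. The CSV and taxonomy file names are illustrative; plain iteration (not `enumerate()`) is used so that `**row` receives the row dict itself:

```python
from sourmash import sourmash_args
from sourmash.tax.tax_utils import GatherRow, TaxResult, MultiLineageDB

tax_assign = MultiLineageDB.load(["test.taxonomy.csv"])   # ident -> lineage assignments

tax_results = []
with sourmash_args.FileInputCSV("sample1.gather.csv") as r:
    for row in r:                       # plain iteration over DictReader-style rows
        gather_row = GatherRow(**row)   # TypeError if an essential column is missing
        tax_res = TaxResult(raw=gather_row)
        tax_res.get_match_lineage(tax_assignments=tax_assign)
        tax_results.append(tax_res)
```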
keep_full_identifiers: bool = False @@ -1804,8 +1370,13 @@ def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_t raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") @dataclass -class SummarizedGatherResult(): -# """Class for storing summarized lineage information""" +class SummarizedGatherResult: + """ + Class for storing summarized lineage information. + Automatically checks for out-of-range values and estimates ANI. + + Methods included for returning formatted results for different outputs. + """ rank: str fraction: float lineage: RankLineageInfo @@ -1830,6 +1401,9 @@ def set_query_ani(self, query_info): def as_lineage_dict(self, query_info, ranks): + ''' + Format to dict for writing lineage-CSV file suitable for use with sourmash tax ... -t. + ''' lD = {} lD['ident'] = query_info.query_name for rank in ranks: @@ -1893,7 +1467,16 @@ def as_kreport_dict(self, query_info): @dataclass class ClassificationResult(SummarizedGatherResult): -# """Class for storing summarized lineage information""" + """ + Inherits from SummarizedGatherResult + + Class for storing query classification information. + Automatically checks for out-of-range values and estimates ANI. + Checks classification status according to provided containment and ANI thresholds. + + Methods included for returning formatted results for different outputs. + """ + "Class for storing query classification information" status: str = field(init=False) def __post_init__(self): @@ -1929,8 +1512,16 @@ def build_krona_result(self, rank=None): @dataclass -class QueryTaxResult(): - """Store all TaxResults for a query. Enable summarization.""" +class QueryTaxResult: + """ + Class for storing all TaxResults (gather results rows) for a query. + Checks query compatibility prior to adding a TaxResult. + Stores raw TaxResults and provides methods for summarizing up ranks + and reporting these summarized results as metagenome summaries or + genome classifications. + + Contains methods for formatting results for different outputs. + """ query_info: QueryInfo # initialize with QueryInfo dataclass def __post_init__(self): @@ -2182,6 +1773,51 @@ def make_full_summary(self, classification=False, limit_float=False): return header, results def make_kreport_results(self): + ''' + Format taxonomy-summarized gather results as kraken-style kreport. + + STANDARD KREPORT FORMAT: + - `Percent Reads Contained in Taxon`: The cumulative percentage of reads for this taxon and all descendants. + - `Number of Reads Contained in Taxon`: The cumulative number of reads for this taxon and all descendants. + - `Number of Reads Assigned to Taxon`: The number of reads assigned directly to this taxon (not a cumulative count of all descendants). + - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. + - `NCBI Taxon ID`: Numerical ID from the NCBI taxonomy database. + - `Scientific Name`: The scientific name of the taxon. 
+
+    Example reads-based `kreport` with all columns:
+    ```
+     88.41   2138742   193618   K   2         Bacteria
+     0.16    3852      818      P   201174    Actinobacteria
+     0.13    3034      0        C   1760      Actinomycetia
+     0.13    3034      45       O   85009     Propionibacteriales
+     0.12    2989      1847     F   31957     Propionibacteriaceae
+     0.05    1142      352      G   1912216   Cutibacterium
+     0.03    790       790      S   1747      Cutibacterium acnes
+    ```
+
+    SOURMASH KREPORT FORMAT:
+
+    To best represent the sequence dataset, please build sourmash signatures with abundance tracking
+    to enable utilization of sequence abundance information during sourmash gather and taxonomic summarization.
+
+    While this format typically records the percent and number of reads assigned to taxa,
+    we can create comparable output by reporting the percent of base pairs (percent containment)
+    and the total number of base pairs matched. Using sourmash default scaled values, these numbers
+    will be estimates from FracMinHash k-mer comparisons. If using sourmash scaled=1
+    (not recommended for most use cases), these results will be based on all k-mers.
+
+    `sourmash gather` assigns k-mers to individual genomes. Since the lowest kreport rank is
+    "species," we use the "Assigned to Taxon" column to report assignments summarized to species level.
+
+    - `Percent Contained in Taxon`: Percent of all base pairs contained by this taxon (weighted by abundance if tracked)
+    - `Estimated base pairs Contained in Taxon`: Number of base pairs contained by this taxon (weighted by abundance if tracked)
+    - `Estimated base pairs Assigned to Taxon`: Number of base pairs at species level (weighted by abundance if tracked)
+    - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
+    - `NCBI Taxon ID` will not be reported (blank entries).
+    - `Scientific Name`: The scientific name of the taxon.
+
+    In the future, we may wish to report the NCBI taxid when we can (NCBI taxonomy only).
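For reference, a small sketch of the arithmetic behind these columns, mirroring the calculation in the removed `write_kreport_old()`; the numbers are toy values:

```python
# percent containment is the abundance-weighted fraction x 100; "bp contained"
# is that fraction of the total weighted bp (total_weighted_hashes * scaled)
f_weighted_at_rank = 0.5
total_weighted_bp = 100

kreport_row = {
    "percent_containment": f"{f_weighted_at_rank * 100:.2f}",          # '50.00'
    "num_bp_contained": int(f_weighted_at_rank * total_weighted_bp),   # 50
    "num_bp_assigned": 0,   # filled in only at species level or for unclassified
    "rank_code": "P",       # from the RANKCODE mapping, e.g. phylum -> 'P'
    "ncbi_taxid": "",       # not reported
    "sci_name": "b",
}
```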
+ ''' self.check_summarization() header = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"] if self.query_info.total_weighted_hashes == 0: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index f83d09d291..0d36ffa661 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -11,36 +11,18 @@ import sourmash_tst_utils as utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, - summarize_gather_at, - write_summary_old, MultiLineageDB, collect_gather_csvs, check_and_load_gather_csvs, SummarizedGatherResult, ClassificationResult, QueryInfo, GatherRow, TaxResult, QueryTaxResult, BaseLineageInfo, RankLineageInfo, LineagePair, aggregate_by_lineage_at_rank, format_for_krona, - write_classifications, write_krona, - aggregate_by_lineage_at_rank_old, - make_krona_header, format_for_krona_old, write_krona_old, - combine_sumgather_csvs_by_lineage, write_lineage_sample_frac, - LineageDB, LineageDB_Sqlite, - SumGathInf, ClassInf, QInfo) - -# import lca utils as needed for now + write_krona, write_lineage_sample_frac, + LineageDB, LineageDB_Sqlite, MultiLineageDB) + +# import lca utils as needed from sourmash.lca import lca_utils # utility functions for testing -def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): - # make mini gather_results - min_header = ["query_name", "name", "match_ident", "f_unique_to_query", "query_md5", "query_filename", "f_unique_weighted", "unique_intersect_bp", "remaining_bp"] - if include_ksize_and_scaled: - min_header.extend(['ksize', 'scaled']) - gather_results = [] - for g_info in g_infolist: - inf = dict(zip(min_header, g_info)) - gather_results.append(inf) - return gather_results - - def make_mini_taxonomy(tax_info): #pass in list of tuples: (name, lineage) taxD = {} @@ -688,480 +670,6 @@ def test_load_taxonomy_csv_duplicate_force(runtmp): assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] -def test_summarize_gather_at_0(): - """test two matches, equal f_unique_to_query""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # run summarize_gather_at and check results! 
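The removed `make_mini_gather_results()` helper built bare dicts for `summarize_gather_at()`. A hypothetical equivalent for the object-based API, where the helper name and the default ksize/scaled values are illustrative and not part of this diff, might look like:

```python
from sourmash.tax.tax_utils import GatherRow, TaxResult

def make_mini_tax_result(query_name, match_name, f_weighted, f_unique, bp, remaining_bp):
    # fill the essential GatherRow columns; optional columns default to None
    row = dict(query_name=query_name, name=match_name,
               f_unique_weighted=f_weighted, f_unique_to_query=f_unique,
               unique_intersect_bp=bp, remaining_bp=remaining_bp,
               query_md5=f"{query_name}_md5", query_filename=f"{query_name}.sig",
               query_bp=int(bp) + int(remaining_bp), ksize=31, scaled=10)
    return TaxResult(raw=GatherRow(**row))

tax_res = make_mini_tax_result("queryA", "gA", "0.5", "0.5", "50", "50")
```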
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - - # superkingdom - assert len(sk_sum) == 1 - print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].query_name == "queryA" - assert sk_sum[0].query_md5 == "queryA_md5" - assert sk_sum[0].query_filename == "queryA.sig" - assert sk_sum[0].rank == 'superkingdom' - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 1.0 - assert sk_sum[0].f_weighted_at_rank == 1.0 - assert sk_sum[0].bp_match_at_rank == 100 - - # phylum - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) - print("phylum summarized gather: ", phy_sum[0]) - assert len(phy_sum) == 1 - assert phy_sum[0].query_name == "queryA" - assert phy_sum[0].query_md5 == "queryA_md5" - assert phy_sum[0].query_filename == "queryA.sig" - assert phy_sum[0].rank == 'phylum' - assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) - assert phy_sum[0].fraction == 1.0 - assert phy_sum[0].f_weighted_at_rank == 1.0 - assert phy_sum[0].bp_match_at_rank == 100 - # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) - assert len(cl_sum) == 2 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].query_name == "queryA" - assert cl_sum[0].query_md5 == "queryA_md5" - assert cl_sum[0].query_filename == "queryA.sig" - assert cl_sum[0].rank == 'class' - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) - assert cl_sum[0].fraction == 0.5 - assert cl_sum[0].f_weighted_at_rank == 0.5 - assert cl_sum[0].bp_match_at_rank == 50 - assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='d')) - assert cl_sum[1].fraction == 0.5 - assert cl_sum[1].f_weighted_at_rank == 0.5 - assert cl_sum[1].bp_match_at_rank == 50 - - -def test_summarize_gather_at_1(): - """test two matches, diff f_unique_to_query""" - # make mini gather_results - ksize=31 - scaled=10 - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90', ksize, scaled] - g_res = make_mini_gather_results([gA,gB], include_ksize_and_scaled=True) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - # run summarize_gather_at and check results! 
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) - - # superkingdom - assert len(sk_sum) == 2 - print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 0.7 - assert sk_sum[0].bp_match_at_rank == 70 - print("superkingdom summarized gather 1: ", sk_sum[1]) - assert sk_sum[1].lineage == () - assert round(sk_sum[1].fraction, 1) == 0.3 - assert sk_sum[1].bp_match_at_rank == 30 - assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 - assert sk_sum[1].query_ani_at_rank == None - - # phylum - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) - print("phylum summarized gather 0: ", phy_sum[0]) - print("phylum summarized gather 1: ", phy_sum[1]) - assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) - assert phy_sum[0].fraction == 0.7 - assert phy_sum[0].f_weighted_at_rank == 0.6 - assert phy_sum[0].bp_match_at_rank == 70 - assert phy_sum[1].lineage == () - assert round(phy_sum[1].fraction, 1) == 0.3 - assert phy_sum[1].bp_match_at_rank == 30 - assert phy_sum[0].query_ani_at_rank == None - assert phy_sum[1].query_ani_at_rank == None - # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) - assert len(cl_sum) == 3 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='c')) - assert cl_sum[0].fraction == 0.6 - assert cl_sum[0].f_weighted_at_rank == 0.5 - assert cl_sum[0].bp_match_at_rank == 60 - assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 - - assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='d')) - assert cl_sum[1].fraction == 0.1 - assert cl_sum[1].f_weighted_at_rank == 0.1 - assert cl_sum[1].bp_match_at_rank == 10 - assert cl_sum[1].query_ani_at_rank == 0.9284145445194744 - assert cl_sum[2].lineage == () - assert round(cl_sum[2].fraction, 1) == 0.3 - assert cl_sum[2].query_ani_at_rank == None - - -def test_summarize_gather_at_perfect_match(): - """test 100% gather match (f_unique_to_query == 1)""" - # make mini gather_results - gA = ["queryA", "gA","0.5","1.0", "queryA_md5", "queryA.sig", '0.5', '100', '0'] - gB = ["queryA", "gB","0.3","0.0", "queryA_md5", "queryA.sig", '0.5', '0', '100'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # run summarize_gather_at and check results! 
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - # superkingdom - assert len(sk_sum) == 1 - print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 1.0 - - -def test_summarize_gather_at_over100percent_f_unique_to_query(): - """gather matches that add up to >100% f_unique_to_query""" - # make mini gather_results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # run summarize_gather_at and check results! - with pytest.raises(ValueError) as exc: - sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) - - # phylum - with pytest.raises(ValueError) as exc: - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) - assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc) - - # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) - assert len(cl_sum) == 2 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='d')) - assert cl_sum[0].fraction == 0.6 - assert cl_sum[0].bp_match_at_rank == 60 - assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='c')) - assert cl_sum[1].fraction == 0.5 - assert cl_sum[1].bp_match_at_rank == 50 - - -def test_summarize_gather_at_missing_ignore(): - """test two matches, ignore missing taxonomy""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - taxD = make_mini_taxonomy([gA_tax]) - - # run summarize_gather_at and check results! 
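The missing-taxonomy behavior exercised by these removed tests now lives on `TaxResult.get_match_lineage()`. A hedged sketch using the same toy values; the row mirrors the mini gather rows above, and `make_mini_taxonomy()` is the small helper retained earlier in this test file:

```python
from sourmash.tax.tax_utils import GatherRow, TaxResult

row = dict(query_name="queryA", name="gB", f_unique_weighted="0.5",
           f_unique_to_query="0.5", unique_intersect_bp="50", remaining_bp="50",
           query_md5="queryA_md5", query_filename="queryA.sig",
           query_bp="100", ksize="31", scaled="10")
tax_res = TaxResult(raw=GatherRow(**row))

taxD = make_mini_taxonomy([("gA", "a;b;c")])   # 'gB' deliberately absent

# default: the result is flagged as missing and summarization simply skips it
tax_res.get_match_lineage(tax_assignments=taxD)

# opt in to a hard failure, mirroring --fail-on-missing-taxonomy:
# raises ValueError("Error: ident 'gB' is not in the taxonomy database. ...")
tax_res.get_match_lineage(tax_assignments=taxD, fail_on_missing_taxonomy=True)
```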
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) - # superkingdom - assert len(sk_sum) == 2 - print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 0.5 - assert sk_sum[0].bp_match_at_rank == 50 - assert sk_sum[1].lineage == () - assert sk_sum[1].fraction == 0.5 - assert sk_sum[1].bp_match_at_rank == 50 - - # phylum - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) - print("phylum summarized gather: ", phy_sum[0]) - assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) - assert phy_sum[0].fraction == 0.5 - assert phy_sum[0].bp_match_at_rank == 50 - assert phy_sum[1].lineage == () - assert phy_sum[1].fraction == 0.5 - assert phy_sum[1].bp_match_at_rank == 50 - # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) - assert len(cl_sum) == 2 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='c')) - assert cl_sum[0].fraction == 0.5 - assert cl_sum[0].bp_match_at_rank == 50 - assert cl_sum[1].lineage == () - assert cl_sum[1].fraction == 0.5 - assert cl_sum[1].bp_match_at_rank == 50 - - -def test_summarize_gather_at_missing_fail(): - """test two matches, fail on missing taxonomy""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - taxD = make_mini_taxonomy([gA_tax]) - - # run summarize_gather_at and check results! - with pytest.raises(ValueError) as exc: - sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - assert "ident gB is not in the taxonomy database." in str(exc.value) - - -def test_summarize_gather_at_best_only_0(): - """test two matches, diff f_unique_to_query""" - # make mini gather_results - ksize =31 - scaled=10 - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90', ksize, scaled] - g_res = make_mini_gather_results([gA,gB],include_ksize_and_scaled=True) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - # run summarize_gather_at and check results! 
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) - # superkingdom - assert len(sk_sum) == 1 - print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 0.7 - assert sk_sum[0].bp_match_at_rank == 70 - print("superk ANI:",sk_sum[0].query_ani_at_rank) - assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 - - # phylum - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) - print("phylum summarized gather: ", phy_sum[0]) - assert len(phy_sum) == 1 - assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) - assert phy_sum[0].fraction == 0.7 - assert phy_sum[0].bp_match_at_rank == 70 - print("phy ANI:",phy_sum[0].query_ani_at_rank) - assert phy_sum[0].query_ani_at_rank == 0.9885602934376099 - # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) - assert len(cl_sum) == 1 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='c')) - assert cl_sum[0].fraction == 0.6 - assert cl_sum[0].bp_match_at_rank == 60 - print("cl ANI:",cl_sum[0].query_ani_at_rank) - assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 - - -def test_summarize_gather_at_best_only_equal_choose_first(): - """test two matches, equal f_unique_to_query. best_only chooses first""" - # make mini gather_results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # run summarize_gather_at and check results! 
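The `query_ani_at_rank` values asserted in these removed tests come from converting containment at a rank into an ANI estimate, and a similar conversion underlies the new `SummarizedGatherResult` ANI estimation. A sketch, assuming the `sourmash.distance_utils` import path:

```python
from sourmash.distance_utils import containment_to_distance

# fraction=0.7 of a 100 bp query at ksize=31, scaled=10: the toy values used above
aniresult = containment_to_distance(0.7, 31, 10, sequence_len_bp=100)
print(aniresult.ani)   # ~0.9886 (the old tests assert 0.9885602934376099)
```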
- # class - cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True) - assert len(cl_sum) == 1 - print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b'), - LineagePair (rank='class', name='c')) - assert cl_sum[0].fraction == 0.5 - assert cl_sum[0].bp_match_at_rank == 50 - - -def test_write_summary_old_csv(runtmp): - """test summary csv write function""" - - sum_gather = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=1.0, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair (rank='superkingdom', name='a'),), - query_ani_at_rank=None, - total_weighted_hashes=0)], - 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=1.0, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b')), - query_ani_at_rank=None, - total_weighted_hashes=0)]} - - outs= runtmp.output("outsum.csv") - with open(outs, 'w') as out_fp: - write_summary_old(sum_gather, out_fp) - - sr = [x.rstrip().split(',') for x in open(outs, 'r')] - print("gather_summary_results_from_file: \n", sr) - assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] == sr[0] - assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100', '', '0'] == sr[1] - assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100','','0'] == sr[2] - - -def test_write_classification(runtmp): - """test classification csv write function""" - classif = ClassInf('queryA', 'match', 'phylum', 1.0, - (LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b')), - 'queryA_md5', 'queryA.sig', 1.0, 100, - query_ani_at_rank=None) - - classification = {'phylum': [classif]} - - outs= runtmp.output("outsum.csv") - with open(outs, 'w') as out_fp: - write_classifications(classification, out_fp) - - sr = [x.rstrip().split(',') for x in open(outs, 'r')] - print("gather_classification_results_from_file: \n", sr) - assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] - assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] - - -def test_make_krona_header_0(): - hd = make_krona_header("species") - print("header: ", hd) - assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species") - - -def test_make_krona_header_1(): - hd = make_krona_header("order") - print("header: ", hd) - assert hd == ("fraction", "superkingdom", "phylum", "class", "order") - - -def test_make_krona_header_strain(): - hd = make_krona_header("strain", include_strain=True) - print("header: ", hd) - assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain") - - -def test_make_krona_header_fail(): - with pytest.raises(ValueError) as exc: - make_krona_header("strain") - assert "Rank strain not present in available ranks" in str(exc.value) - - -def test_aggregate_by_lineage_at_rank_old_by_query(): - """test two queries, aggregate lineage at rank for each""" - # make gather results - gA = 
["queryA","gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '100', '100'] - gB = ["queryA","gB","0.3","0.4", "queryA_md5", "queryA.sig", '0.5', '60', '140'] - gC = ["queryB","gB","0.3","0.3", "queryB_md5", "queryB.sig", '0.5', '60', '140'] - g_res = make_mini_gather_results([gA,gB,gC]) - - # make mini taxonomy - gA_tax = ("gA", "a;b") - gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # aggregate by lineage at rank - sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - print("superkingdom summarized gather results:", sk_sum) - assert len(sk_sum) ==4 - assert sk_sum[0].query_name == "queryA" - assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[0].fraction == 0.9 - assert sk_sum[0].bp_match_at_rank == 160 - # check for unassigned for queryA - assert sk_sum[1].query_name == "queryA" - assert sk_sum[1].lineage == () - assert sk_sum[1].bp_match_at_rank == 40 - assert round(sk_sum[1].fraction,1) == 0.1 - # queryB - assert sk_sum[2].query_name == "queryB" - assert sk_sum[2].lineage == (LineagePair (rank='superkingdom', name='a'),) - assert sk_sum[2].fraction == 0.3 - assert sk_sum[2].bp_match_at_rank == 60 - # check for unassigned for queryA - assert sk_sum[3].query_name == "queryB" - assert sk_sum[3].lineage == () - assert sk_sum[3].fraction == 0.7 - assert sk_sum[3].bp_match_at_rank == 140 - sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank_old(sk_sum, by_query=True) - print("superkingdom lineage summary:", sk_lin_sum, '\n') - assert sk_lin_sum == {(LineagePair (rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}, - (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} - assert num_queries == 2 - assert query_names == ['queryA', 'queryB'] - - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) - print("phylum summary:", phy_sum, ']\n') - phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank_old(phy_sum, by_query=True) - print("phylum lineage summary:", phy_lin_sum, '\n') - assert phy_lin_sum == {(LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='b')): {'queryA': 0.5}, - (LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}, - (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} - assert num_queries == 2 - assert query_names == ['queryA', 'queryB'] - - -def test_format_for_krona_old_0(): - """test format for krona, equal matches""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # check krona format and check results! 
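The new krona path returns the header together with the rows, instead of rebuilding it from the rank as `write_krona_old()` did. A usage sketch, assuming `query_results` is a list of summarized `QueryTaxResult` objects and using the toy lineages from these tests:

```python
from sourmash.tax.tax_utils import format_for_krona, write_krona

# query_results: summarized QueryTaxResult objects (see the end-to-end sketch below)
krona_results, header = format_for_krona(query_results, "class")
# roughly: header        == ("fraction", "superkingdom", "phylum", "class")
#          krona_results == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')]

with open("sample1.krona.tsv", "w", newline="") as out_fp:
    write_krona(header, krona_results, out_fp)
```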
- sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res) - print("superkingdom summarized gather results:", sk_sum) - krona_res = format_for_krona_old("superkingdom", {"superkingdom": sk_sum}) - print("krona_res: ", krona_res) - assert krona_res == [(1.0, 'a')] - - phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res) - krona_res = format_for_krona_old("phylum", {"phylum": phy_sum}) - print("krona_res: ", krona_res) - assert krona_res == [(1.0, 'a', 'b')] - def test_format_for_krona_summarization(): """test format for krona""" # make gather results @@ -1255,129 +763,6 @@ def test_write_krona(runtmp): assert kr[2] == ["0.5", "a", "b", "d"] -def test_format_for_krona_old_1(): - """test format for krona at each rank""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # summarize with all ranks - sum_res = {} - #for rank in lca_utils.taxlist(include_strain=False): - for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res) - print('summarized gather: ', sum_res) - # check krona format - sk_krona = format_for_krona_old("superkingdom", sum_res) - print("sk_krona: ", sk_krona) - assert sk_krona == [(1.0, 'a')] - phy_krona = format_for_krona_old("phylum", sum_res) - print("phy_krona: ", phy_krona) - assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_for_krona_old("class", sum_res) - print("cl_krona: ", cl_krona) - assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] - - -def test_format_for_krona_old_best_only(): - """test two matches, equal f_unique_to_query""" - # make gather results - gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig", '0.5', '50', '50'] - g_res = make_mini_gather_results([gA,gB]) - - # make mini taxonomy - gA_tax = ("gA", "a;b;c") - gB_tax = ("gB", "a;b;d") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) - - # summarize with all ranks - sum_res = {} - #for rank in lca_utils.taxlist(include_strain=False): - for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res, best_only=True) - print('summarized gather: ', sum_res) - # check krona format - sk_krona = format_for_krona_old("superkingdom", sum_res) - print("sk_krona: ", sk_krona) - assert sk_krona == [(1.0, 'a')] - phy_krona = format_for_krona_old("phylum", sum_res) - print("phy_krona: ", phy_krona) - assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_for_krona_old("class", sum_res) - print("cl_krona: ", cl_krona) - assert cl_krona == [(0.5, 'a', 'b', 'c')] - - -def test_write_krona_old(runtmp): - """test two matches, equal f_unique_to_query""" - class_krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] - outk= runtmp.output("outkrona.tsv") - with open(outk, 'w') as out_fp: - write_krona_old("class", class_krona_results, out_fp) - - kr = [x.strip().split('\t') for x in open(outk, 'r')] - print("krona_results_from_file: \n", kr) - assert kr[0] == ["fraction", "superkingdom", "phylum", "class"] - assert kr[1] == ["0.5", "a", "b", "c"] - assert kr[2] == ["0.5", "a", "b", "d"] - - -def test_combine_sumgather_csvs_by_lineage(runtmp): - # some summarized gather dicts - sum_gather1 = 
{'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair (rank='superkingdom', name='a'),), - query_ani_at_rank=None, - total_weighted_hashes=0)], - 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b')), - query_ani_at_rank=None, - total_weighted_hashes=0)]} - sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, - query_md5='queryB_md5', query_filename='queryB.sig', - f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair (rank='superkingdom', name='a'),), - query_ani_at_rank=None, - total_weighted_hashes=0)], - 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, - query_md5='queryB_md5', query_filename='queryB.sig', - f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='c')), - query_ani_at_rank=None, - total_weighted_hashes=0)]} - - # write summarized gather results csvs - sg1= runtmp.output("sample1.csv") - with open(sg1, 'w') as out_fp: - write_summary_old(sum_gather1, out_fp) - - sg2= runtmp.output("sample2.csv") - with open(sg2, 'w') as out_fp: - write_summary_old(sum_gather2, out_fp) - - # test combine_summarized_gather_csvs_by_lineage_at_rank - linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") - print("lineage_dict", linD) - assert linD == {'a;b': {'queryA': '0.5'}, 'a;c': {'queryB': '0.7'}} - assert query_names == ['queryA', 'queryB'] - linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") - print("lineage dict: \n", linD) - assert linD, query_names == {'a': {'queryA': '0.5', 'queryB': '0.7'}} - assert query_names == ['queryA', 'queryB'] - - def test_write_lineage_sample_frac(runtmp): outfrac = runtmp.output('outfrac.csv') sample_names = ['sample1', 'sample2'] @@ -1413,10 +798,8 @@ def test_write_lineage_sample_frac_format_lineage(runtmp): assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] phy_lineage='a;b' - # phy_lineage = lca_utils.make_lineage('a;b') print(phy_lineage) phy2_lineage = 'a;c' - # phy2_lineage = lca_utils.make_lineage('a;c') print(phy2_lineage) phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}} with open(outfrac, 'w') as out_fp: @@ -1427,51 +810,6 @@ def test_write_lineage_sample_frac_format_lineage(runtmp): assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] -def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): - # some summarized gather dicts - sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair (rank='superkingdom', name='a'),), - query_ani_at_rank=None, - total_weighted_hashes=0)], - 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, - query_md5='queryA_md5', query_filename='queryA.sig', - f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='b')), - query_ani_at_rank=None, - 
total_weighted_hashes=0)]} - sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, - query_md5='queryB_md5', query_filename='queryB.sig', - f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair (rank='superkingdom', name='a'),), - query_ani_at_rank=None, - total_weighted_hashes=0)], - 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, - query_md5='queryB_md5', query_filename='queryB.sig', - f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair (rank='superkingdom', name='a'), - LineagePair (rank='phylum', name='c')), - query_ani_at_rank=None, - total_weighted_hashes=0)]} - - # write summarized gather results csvs - sg1= runtmp.output("sample1.csv") - with open(sg1, 'w') as out_fp: - write_summary_old(sum_gather1, out_fp) - - sg2= runtmp.output("sample2.csv") - with open(sg2, 'w') as out_fp: - write_summary_old(sum_gather2, out_fp) - - # test combine_summarized_gather_csvs_by_lineage_at_rank - with pytest.raises(ValueError) as exc: - linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") - print("ValueError: ", exc.value) - assert "Rank strain not available." in str(exc.value) - - def test_tax_multi_load_files(runtmp): # test loading various good and bad files taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -1707,6 +1045,7 @@ def test_BaseLineageInfo_init_lineage_str(): assert taxinf.lowest_rank == "C" assert taxinf.name_at_rank("A") == "a" + def test_BaseLineageInfo_init_lineage_str_comma_sep(): x = "a,b,c" ranks=["A", "B", "C"] @@ -1831,6 +1170,7 @@ def test_RankLineageInfo_init_lineage_str(): print(taxinf.lineage_str) assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] + def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] @@ -1885,6 +1225,7 @@ def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): print("lin2: ", lin2) assert lin1 == lin2 + def test_RankLineageInfo_init_lineage_str_1_truncate(): x = "a;b;c" taxinf = RankLineageInfo(lineage_str=x) @@ -1969,6 +1310,7 @@ def test_display_taxid_1(): print(taxinf) assert taxinf.display_taxid() == "1;2" + def test_display_taxid_2(): x = [ LineagePair('superkingdom', 'name1', 1), LineagePair(None, ''), LineagePair ('class', 'name2',2) ] taxinf = RankLineageInfo(lineage=x) @@ -2368,7 +1710,6 @@ def test_QueryTaxResult_summarize_up_ranks_2(): RankLineageInfo(lineage_str="a;b;d"): 10} - def test_QueryTaxResult_summarize_up_ranks_missing_lineage(): "basic functionality: summarize up ranks" taxD = make_mini_taxonomy([("gA", "a;b;c")]) @@ -2699,6 +2040,7 @@ def test_build_summarized_result_rank_fail_not_available_resummarize(): print(str(exc)) assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) + def test_aggregate_by_lineage_at_rank(): """test aggregate by lineage at rank""" # make mini taxonomy @@ -2773,7 +2115,6 @@ def test_build_classification_result_containment_threshold_fail(): assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc) - def test_build_classification_result_containment_threshold(): "basic functionality: build classification result using containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
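Putting the pieces together: a minimal end-to-end sketch of the refactored workflow, based on the signatures visible in this diff. The file paths are illustrative and `build_summarized_result()` is called with its defaults:

```python
from sourmash.tax import tax_utils

# load taxonomic assignments, then one QueryTaxResult per query in the gather CSVs
tax_assign = tax_utils.MultiLineageDB.load(["test.taxonomy.csv"])
query_results = tax_utils.check_and_load_gather_csvs(["sample1.gather.csv"], tax_assign)

# summarize each query up the ranks (method name taken from the new tests)
for q_res in query_results:
    q_res.build_summarized_result()

# write the per-rank summary CSV
with open("sample1.summarized.csv", "w", newline="") as out_fp:
    tax_utils.write_summary(query_results, out_fp, limit_float_decimals=True)
```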