From a1cd8b22a03733e34ed891239bd763d0974352d0 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 7 Mar 2022 19:38:12 -0500 Subject: [PATCH 01/17] update shasum --- scripts/fetchPufferfish.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index c5227453f..5fe95769a 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -27,7 +27,7 @@ SVER=salmon-v1.8.0 #SVER=develop #SVER=sketch-mode -EXPECTED_SHA256=63811834c41745f3f7530f5f3457f10993bcff604662b9b1b05c88fd8ef179d5 +EXPECTED_SHA256=2180da8163cf8f134d5c6b3d3bd6c18d3501be3526d49417e18b4e38acedd77c mkdir -p ${EXTERNAL_DIR} curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip From f4b2b2cf17fea6cc71c0a80309890149b413230a Mon Sep 17 00:00:00 2001 From: Jakub Kaczmarzyk Date: Wed, 27 Apr 2022 12:40:55 -0400 Subject: [PATCH 02/17] minimize image size by using multi-stage builds This commit uses multi-stage builds to minimize the Docker image size. The uncompressed size is down to 101 MB from 1.38 GB. The second stage of the build begins with a fresh ubuntu:18.04 image and copies in the compiled salmon binaries. It also installs one system library that is linked by one of the salmon shared libraries. 
--- docker/Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 53f069b8f..61f3ffcf2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,7 +1,7 @@ # image: COMBINE-lab/salmon # This dockerfile is based on the one created by # Titus Brown (available at https://github.com/ctb/2015-docker-building/blob/master/salmon/Dockerfile) -FROM ubuntu:18.04 +FROM ubuntu:18.04 as base MAINTAINER salmon.maintainer@gmail.com ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \ @@ -36,7 +36,7 @@ RUN curl -k -L https://github.com/COMBINE-lab/salmon/archive/v${SALMON_VERSION}. cd salmon-${SALMON_VERSION} && \ mkdir build && \ cd build && \ - cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && make && make install + cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local/salmon && make && make install # For dev version #RUN git clone https://github.com/COMBINE-lab/salmon.git && \ @@ -46,6 +46,11 @@ RUN curl -k -L https://github.com/COMBINE-lab/salmon/archive/v${SALMON_VERSION}. # cd build && \ # cmake .. 
-DCMAKE_INSTALL_PREFIX=/usr/local && make && make install +FROM ubuntu:18.04 +RUN apt-get update \ + && apt-get install -y --no-install-recommends libhwloc5 \ + && rm -rf /var/lib/apt/lists/* +COPY --from=base /usr/local/salmon/ /usr/local/ ENV PATH /home/salmon-${SALMON_VERSION}/bin:${PATH} ENV LD_LIBRARY_PATH "/usr/local/lib:${LD_LIBRARY_PATH}" From 78b5ebd27e890f6763c94d9794c4e045ea6ba99c Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 6 May 2022 17:24:20 -0400 Subject: [PATCH 03/17] check ofstream fail after close --- scripts/fetchPufferfish.sh | 4 ++-- src/SalmonAlevin.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index 5fe95769a..0d5df31d5 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -23,8 +23,8 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -SVER=salmon-v1.8.0 -#SVER=develop +#SVER=salmon-v1.8.0 +SVER=develop #SVER=sketch-mode EXPECTED_SHA256=2180da8163cf8f134d5c6b3d3bd6c18d3501be3526d49417e18b4e38acedd77c diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index e5f45d988..fb3bf2401 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -2223,7 +2223,7 @@ void processReadLibrary( * Selectively align the reads and write a PAM file out. 
*/ template -void do_sc_align(ReadExperimentT& experiment, +bool do_sc_align(ReadExperimentT& experiment, SalmonOpts& salmonOpts, MappingStatistics& mstats, uint32_t numQuantThreads, @@ -2455,8 +2455,24 @@ void do_sc_align(ReadExperimentT& experiment, rad_file.write(reinterpret_cast(&nc), sizeof(nc)); rad_file.close(); + + // we want to check if the RAD file stream was written to properly + // while we likely would have caught this earlier, it is possible the + // badbit may not be set until the stream actually flushes (perhaps even + // at close), so we check here one final time that the status of the + // stream is as expected. + // see : https://stackoverflow.com/questions/28342660/error-handling-in-stdofstream-while-writing-data + if ( !rad_file ) { + jointLog->critical("The RAD file stream had an invalid status after close; so some operation(s) may" + "have failed!\nA common cause for this is lack of output disk space.\n" + "Consequently, the output may be corrupted."); + jointLog->flush(); + return false; + } + jointLog->info("finished sc_align()"); jointLog->flush(); + return true; } /** @@ -2783,13 +2799,18 @@ int alevin_sc_align(AlevinOpts& aopt, alevin::types::AlevinCellBarcodeKmer::k(aopt.protocol.barcodeLength); } - do_sc_align(experiment, sopt, - mstats, sopt.numThreads, aopt); + bool mapping_ok = do_sc_align(experiment, sopt, + mstats, sopt.numThreads, aopt); // write meta-information about the run GZipWriter gzw(outputDirectory, jointLog); sopt.runStopTime = salmon::utils::getCurrentTimeAsString(); gzw.writeMetaFryMode(sopt, experiment, mstats); + + if ( !mapping_ok ) { + std::cerr << "Error : There was an error during mapping; please check the log for further details.\n"; + std::exit(1); + } } catch (po::error& e) { std::cerr << "Exception : [" << e.what() << "]. 
Exiting.\n"; std::exit(1); From cf4cc3d0e71c09416659e7c2e36b7628f38de6d6 Mon Sep 17 00:00:00 2001 From: Christopher Bottoms Date: Thu, 26 May 2022 14:04:02 -0500 Subject: [PATCH 04/17] Fixed minor typos --- doc/source/salmon.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst index 77eebadb9..48f65f048 100644 --- a/doc/source/salmon.rst +++ b/doc/source/salmon.rst @@ -10,8 +10,8 @@ alignments (in the form of a SAM/BAM file) to the transcripts rather than the raw reads. The **mapping**-based mode of Salmon runs in two phases; indexing and -quantification. The indexing step is independent of the reads, and only need to -be run one for a particular set of reference transcripts. The quantification +quantification. The indexing step is independent of the reads, and only needs to +be run once for a particular set of reference transcripts. The quantification step, obviously, is specific to the set of RNA-seq reads and is thus run more frequently. For a more complete description of all available options in Salmon, see below. @@ -24,7 +24,7 @@ see below. salmon. When salmon is run with selective alignment, it adopts a considerably more sensitive scheme that we have developed for finding the potential mapping loci of a read, and score potential mapping loci using - the chaining algorithm introdcued in minimap2 [#minimap2]_. It scores and + the chaining algorithm introduced in minimap2 [#minimap2]_. It scores and validates these mappings using the score-only, SIMD, dynamic programming algorithm of ksw2 [#ksw2]_. Finally, we recommend using selective alignment with a *decoy-aware* transcriptome, to mitigate potential @@ -90,7 +90,7 @@ set of alignments. For quasi-mapping-based Salmon, the story is somewhat different. Generally, performance continues to improve as more threads are made - available. This is because the determiniation of the potential mapping + available. 
This is because the determination of the potential mapping locations of each read is, generally, the slowest step in quasi-mapping-based quantification. Since this process is trivially parallelizable (and well-parallelized within Salmon), more @@ -140,9 +140,9 @@ This will build the mapping-based index, using an auxiliary k-mer hash over k-mers of length 31. While the mapping algorithms will make used of arbitrarily long matches between the query and reference, the `k` size selected here will act as the *minimum* acceptable length for a valid match. Thus, a smaller -value of `k` may slightly improve sensitivty. We find that a `k` of 31 seems +value of `k` may slightly improve sensitivity. We find that a `k` of 31 seems to work well for reads of 75bp or longer, but you might consider a smaller -`k` if you plan to deal with shorter reads. Also, a shoter value of `k` may +`k` if you plan to deal with shorter reads. Also, a shorter value of `k` may improve sensitivity even more when using selective alignment (enabled via the `--validateMappings` flag). So, if you are seeing a smaller mapping rate than you might expect, consider building the index with a slightly smaller `k`. @@ -415,7 +415,7 @@ distribution of the sequencing library. This value will affect the effective length correction, and hence the estimated effective lengths of the transcripts and the TPMs. The value passed to ``--fldSD`` will be used as the standard deviation of the assumed fragment length -distribution (which is modeled as a truncated Gaussan with a mean +distribution (which is modeled as a truncated Gaussian with a mean given by ``--fldMean``). @@ -529,7 +529,7 @@ have a prior count of 1 fragment, while a transcript of length 50000 will have a prior count of 0.5 fragments, etc. This behavior can be modified in two ways. First, the prior itself can be modified via Salmon's ``--vbPrior`` option. The argument to this option is the value you wish to place as the -*per-nucleotide* prior. 
Additonally, you can modify the behavior to use +*per-nucleotide* prior. Additionally, you can modify the behavior to use a *per-transcript* rather than a *per-nucleotide* prior by passing the flag ``--perTranscriptPrior`` to Salmon. In this case, whatever value is set by ``--vbPrior`` will be used as the transcript-level prior, so that the @@ -559,7 +559,7 @@ bootstraps allows us to assess technical variance in the main abundance estimate we produce. Such estimates can be useful for downstream (e.g. differential expression) tools that can make use of such uncertainty estimates. This option takes a positive integer that dictates the number of bootstrap samples to compute. -The more samples computed, the better the estimates of varaiance, but the +The more samples computed, the better the estimates of variance, but the more computation (and time) required. """"""""""""""""""""""""""""""" @@ -664,7 +664,7 @@ the length of the transcriptome --- though each evaluation itself is efficient and the process is highly parallelized. It is possible to speed this process up by a multiplicative factor by -considering only every *i*:sup:`th` fragment length, and interploating +considering only every *i*:sup:`th` fragment length, and interpolating the intermediate results. The ``--biasSpeedSamp`` option allows the user to set this sampling factor. Larger values speed up effective length correction, but may decrease the fidelity of bias modeling. @@ -683,7 +683,7 @@ map to the transcriptome. When mapping paired-end reads, the entire fragment (both ends of the pair) are identified by the name of the first read (i.e. the read appearing in the ``_1`` file). Each line of the unmapped reads file contains the name of the unmapped read followed by a simple flag -that designates *how* the read failed to map completely. If fragmetns are +that designates *how* the read failed to map completely. 
If fragments are aligned against a decoy-aware index, then fragments that are confidently assigned as decoys are written in this file followed by the ``d`` (decoy) flag. Apart from the decoy flag, for single-end @@ -694,7 +694,7 @@ reads, there are a number of different possibilities, outlined below: u = The entire pair was unmapped. No mappings were found for either the left or right read. m1 = Left orphan (mappings were found for the left (i.e. first) read, but not the right). - m2 = Right orphan (mappinds were found for the right read, but not the left). + m2 = Right orphan (mappings were found for the right read, but not the left). m12 = Left and right orphans. Both the left and right read mapped, but never to the same transcript. By reading through the file of unmapped reads and selecting the appropriate From dde4459b0588c9f4d1f0257da813b3625b6840d2 Mon Sep 17 00:00:00 2001 From: Christopher Bottoms Date: Thu, 26 May 2022 14:07:00 -0500 Subject: [PATCH 05/17] fasta -> FASTA --- doc/source/salmon.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst index 48f65f048..092c1e11c 100644 --- a/doc/source/salmon.rst +++ b/doc/source/salmon.rst @@ -243,7 +243,7 @@ mode, and a description of each, run ``salmon quant --help-alignment``. .. note:: Genomic vs. Transcriptomic alignments Salmon expects that the alignment files provided are with respect to the - transcripts given in the corresponding fasta file. That is, Salmon expects + transcripts given in the corresponding FASTA file. That is, Salmon expects that the reads have been aligned directly to the transcriptome (like RSEM, eXpress, etc.) rather than to the genome (as does, e.g. Cufflinks). 
If you have reads that have already been aligned to the genome, there are From 0fd4dbb39a833bb49bd0a51e6feedeb69b20a590 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 3 Jun 2022 13:30:01 -0400 Subject: [PATCH 06/17] fix: use proper code for decoy reads in unmapped names In single-end mode, *all* unmapped reads were being reported with the code 'u'. This fixes the output so the proper code is reported (e.g. 'd') for decoys. This addresses #748. --- src/SalmonQuantify.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index 357829f5f..579383f6f 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -1424,7 +1424,7 @@ void processReads( if (writeUnmapped and mapType != salmon::utils::MappingType::PAIRED_MAPPED) { // If we have no mappings --- then there's nothing to do - // unless we're outputting names for un-mapped reads + // unless we're outputting names for un-mapped / decoy-mapped reads unmappedNames << rp.first.name << ' ' << salmon::utils::str(mapType) << '\n'; } @@ -1854,8 +1854,9 @@ void processReads( if (writeUnmapped and mapType != salmon::utils::MappingType::SINGLE_MAPPED) { // If we have no mappings --- then there's nothing to do - // unless we're outputting names for un-mapped reads - unmappedNames << rp.name << " u\n"; + // unless we're outputting names for un-mapped / decoy mapped reads + unmappedNames << rp.name << ' ' << salmon::utils::str(mapType) + << '\n'; } validHits += jointAlignments.size(); From 7a1f0c63ae178fb11461b97a1ae0cc2e9c13086b Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 3 Jun 2022 15:44:50 -0400 Subject: [PATCH 07/17] fix: flush loggers even when exceptions are caught --- src/SalmonQuantify.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index 579383f6f..ecdc8ab7e 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -2385,6 +2385,15 @@ void
quantifyLibrary(ReadExperimentT& experiment, jointLog->info("Mapping rate = {}\%\n", experiment.effectiveMappingRate() * 100.0); jointLog->info("finished quantifyLibrary()"); + + // clean up loggers + { + if (salmonOpts.writeUnmappedNames) { + spdlog::logger* unmappedLogger = salmonOpts.unmappedLog.get(); + unmappedLogger->flush(); + } + } + } int salmonQuantify(int argc, const char* argv[], std::unique_ptr& salmonIndex) { @@ -2547,6 +2556,12 @@ transcript abundance from RNA-seq reads std::vector errors{"insufficient_assigned_fragments"}; sopt.runStopTime = salmon::utils::getCurrentTimeAsString(); gzw.writeEmptyMeta(sopt, experiment, errors); + sopt.jointLog->flush(); + if (sopt.writeUnmappedNames) { + spdlog::logger* unmappedLogger = sopt.unmappedLog.get(); + unmappedLogger->flush(); + } + spdlog::drop_all(); std::exit(1); } @@ -2755,6 +2770,10 @@ transcript abundance from RNA-seq reads // Write meta-information about the run gzw.writeMeta(sopt, experiment, mstats); + // at this point we can (and should) drop all loggers to force them to + // flush. 
+ spdlog::drop_all(); + } catch (po::error& e) { std::cerr << "(mapping-based mode) Exception : [" << e.what() << "].\n"; std::cerr << "Please be sure you are passing correct options, and that you are running in the intended mode.\n"; From 2109c8f66794c1002260be147d71e55660ae191d Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Wed, 8 Jun 2022 00:44:45 -0400 Subject: [PATCH 08/17] exclude libm from what's included when making a release --- scripts/make-release.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/make-release.sh b/scripts/make-release.sh index 0a4ce153a..7324e9cb8 100755 --- a/scripts/make-release.sh +++ b/scripts/make-release.sh @@ -56,6 +56,7 @@ rm ${DIR}/../RELEASES/${betaname}/lib/libc.so.6 rm ${DIR}/../RELEASES/${betaname}/lib/ld-linux-x86-64.so.2 rm ${DIR}/../RELEASES/${betaname}/lib/libdl.so.2 rm ${DIR}/../RELEASES/${betaname}/lib/libpthread*.so.* +rm ${DIR}/../RELEASES/${betaname}/lib/libm.so.6 # now make the tarball echo -e "Making the tarball\n" From dd9f361113c3dee702666bcd82d9a06c750f898a Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 13:10:35 -0400 Subject: [PATCH 09/17] feat: enable optional writing of quality values to SAM output This addresses the feature request made in #756.
--- include/SalmonDefaults.hpp | 1 + include/SalmonOpts.hpp | 1 + src/ProgramOptionsGenerator.cpp | 5 +++++ src/SalmonAlevin.cpp | 21 +++++++++++---------- src/SalmonQuantify.cpp | 10 ++++++++++ 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/include/SalmonDefaults.hpp b/include/SalmonDefaults.hpp index c68ce4660..18c6bfcea 100644 --- a/include/SalmonDefaults.hpp +++ b/include/SalmonDefaults.hpp @@ -14,6 +14,7 @@ namespace defaults { constexpr const bool posBiasCorrect{false}; constexpr const uint32_t numThreads{8}; constexpr const double incompatPrior{0.0}; + constexpr const bool writeQualities{false}; constexpr const char quasiMappingDefaultFile[] = ""; constexpr const char quasiMappingImplicitFile[] = "-"; constexpr const bool metaMode{false}; diff --git a/include/SalmonOpts.hpp b/include/SalmonOpts.hpp index 3d0be1989..c357eae1a 100644 --- a/include/SalmonOpts.hpp +++ b/include/SalmonOpts.hpp @@ -202,6 +202,7 @@ struct SalmonOpts { uint32_t eelCountCutoff{50}; // For writing quasi-mappings + bool writeQualities; std::string qmFileName; std::ofstream qmFile; std::unique_ptr qmStream{nullptr}; diff --git a/src/ProgramOptionsGenerator.cpp b/src/ProgramOptionsGenerator.cpp index e7f4da381..84682c277 100644 --- a/src/ProgramOptionsGenerator.cpp +++ b/src/ProgramOptionsGenerator.cpp @@ -269,6 +269,11 @@ namespace salmon { "format. By default, output will be directed to " "stdout, but an alternative file name can be " "provided instead.") + ("writeQualities", po::bool_switch(&(sopt.writeQualities))->default_value(salmon::defaults::writeQualities), + "This flag only has meaning if mappings are being written (with --writeMappings/-z). " + "If this flag is provided, then the output SAM file will contain quality strings as well as " + "read sequences. Note that this can greatly increase the size of the output file." 
+ ) ("hitFilterPolicy", po::value(&sopt.hitFilterPolicyStr)->default_value(salmon::defaults::hitFilterPolicyStr), "[selective-alignment mode only] : Determines the policy by which hits are filtered in selective alignment. Filtering hits after " diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index fb3bf2401..a2aa03c93 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -455,6 +455,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + if (salmonOpts.writeQualities) { + salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file."); + } pufferfish::util::QueryCache qc; bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; @@ -965,11 +968,6 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re bw << num_reads_in_chunk; bw << num_reads_in_chunk; } - /* - if (writeQuasimappings) { - writeAlignmentsToStream(rp, formatter, jointAlignments, sstream, true, true, extraBAMtags); - } - */ validHits += accepted_hits.size(); localNumAssignedFragments += (accepted_hits.size() > 0); @@ -1104,6 +1102,9 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + if (salmonOpts.writeQualities) { + salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file."); + } pufferfish::util::QueryCache qc; bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; @@ -1424,11 +1425,6 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea bw << num_reads_in_chunk; bw << num_reads_in_chunk; } - /* - if (writeQuasimappings) { - writeAlignmentsToStream(rp, formatter, jointAlignments, sstream, true, true, extraBAMtags); - } - */ validHits += jointAlignments.size(); localNumAssignedFragments += 
(jointAlignments.size() > 0); @@ -1575,6 +1571,11 @@ void processReadsQuasi( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + if (salmonOpts.writeQualities) { + formatter.enable_qualities(); + } else { + formatter.disable_qualities(); + } pufferfish::util::QueryCache qc; bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index ecdc8ab7e..9824d3934 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -841,6 +841,11 @@ void processReads( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + if (salmonOpts.writeQualities) { + formatter.enable_qualities(); + } else { + formatter.disable_qualities(); + } pufferfish::util::QueryCache qc; bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; @@ -1605,6 +1610,11 @@ void processReads( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + if (salmonOpts.writeQualities) { + formatter.enable_qualities(); + } else { + formatter.disable_qualities(); + } pufferfish::util::QueryCache qc; bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; From 014c5804d3d5adf97d4880ab651125574c46a891 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 13:28:27 -0400 Subject: [PATCH 10/17] remove extra blank line --- src/SalmonQuantify.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index 9824d3934..c3a6fbd99 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -1937,7 +1937,6 @@ void processReads( mstats.numFragmentsFiltered += numFragsDropped; mstats.numDecoyFragments += numDecoyFrags; } - /// DONE QUASI From 10fe80488768232fb14e7e849982f27dcd94e073 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 14:40:19 -0400 Subject: [PATCH 11/17] extra blank --- src/SalmonQuantify.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/SalmonQuantify.cpp 
b/src/SalmonQuantify.cpp index c3a6fbd99..d17cb94af 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -997,7 +997,6 @@ void processReads( } */ - bool haveOrphans = false; MergeResult mergeRes{MergeResult::HAD_NONE}; // Consider a read as too short if both ends are too short From 5a3a0b1deaea2736b56e7b7efcc1c35d5e7f5ab9 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 16:31:53 -0400 Subject: [PATCH 12/17] update docker image used for drone ci --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index a965508f1..a86a06e1b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,6 +1,6 @@ pipeline: setup: - image: hbb:salmon_build_new + image: combinelab/hbb_salmon_build:latest commands: - echo "Starting build" - ./.drone/build.sh From 7ecb8358cb425e76f434f2a9ea7041faa43d93d0 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 16:49:10 -0400 Subject: [PATCH 13/17] update image for all steps --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index a86a06e1b..ff1797cec 100644 --- a/.drone.yml +++ b/.drone.yml @@ -5,7 +5,7 @@ - echo "Starting build" - ./.drone/build.sh test_indexing: - image: hbb:salmon_build_new + image: combinelab/hbb_salmon_build:latest commands: - echo "[Testing quant]" - ./.drone/test_quant.sh @@ -13,7 +13,7 @@ - /mnt/scratch6/avi/data:/mnt/data - /mnt/scratch6/salmon_ci:/mnt/ci_res copy_build: - image: hbb:salmon_build_new + image: combinelab/hbb_salmon_build:latest commands: - echo "[Packaging binary]" - ./.drone/copy_build.sh From db72fd31d4e4fe170761591f15a58024cb249ab2 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 21 Jun 2022 17:08:06 -0400 Subject: [PATCH 14/17] no notify gitter anymore --- .drone.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index ff1797cec..05627a6b7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,7 +20,3 @@ volumes: - 
/mnt/scratch6/avi/data:/mnt/data - /mnt/scratch6/salmon_ci:/mnt/ci_res - notify_gitter: - image: plugins/gitter - commands: - - echo "[Notifying gitter]" From 0657c4e2c65c27fc9347478f09f0681417780d69 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Wed, 22 Jun 2022 10:10:20 -0400 Subject: [PATCH 15/17] style: clang-format SalmonQuantify.cpp --- src/SalmonQuantify.cpp | 1402 +++++++++++++++++++++------------------- 1 file changed, 731 insertions(+), 671 deletions(-) diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index d17cb94af..88f7a1227 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -90,10 +90,10 @@ #include "SalmonDefaults.hpp" #include "SalmonExceptions.hpp" #include "SalmonIndex.hpp" +#include "SalmonMappingUtils.hpp" #include "SalmonMath.hpp" #include "SalmonUtils.hpp" #include "Transcript.hpp" -#include "SalmonMappingUtils.hpp" #include "AlignmentGroup.hpp" #include "BiasParams.hpp" @@ -114,17 +114,17 @@ //#include "HitManager.hpp" #include "SalmonOpts.hpp" //#include "SingleAlignmentFormatter.hpp" -#include "tsl/hopscotch_map.h" #include "edlib.h" +#include "tsl/hopscotch_map.h" -#include "pufferfish/Util.hpp" -#include "pufferfish/MemCollector.hpp" #include "pufferfish/MemChainer.hpp" -#include "pufferfish/SAMWriter.hpp" +#include "pufferfish/MemCollector.hpp" #include "pufferfish/PuffAligner.hpp" +#include "pufferfish/SAMWriter.hpp" +#include "pufferfish/SelectiveAlignmentUtils.hpp" +#include "pufferfish/Util.hpp" #include "pufferfish/ksw2pp/KSW2Aligner.hpp" #include "pufferfish/metro/metrohash64.h" -#include "pufferfish/SelectiveAlignmentUtils.hpp" /****** QUASI MAPPING DECLARATIONS *********/ using MateStatus = pufferfish::util::MateStatus; @@ -160,22 +160,20 @@ using AlnGroupQueue = oneapi::tbb::concurrent_queue*>; using ReadExperimentT = ReadExperiment>; template -void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc, - uint64_t firstTimestepOfRound, ReadLibrary& readLib, - const SalmonOpts& 
salmonOpts, - AlnGroupVecRange batchHits, - std::vector& transcripts, - ClusterForest& clusterForest, - FragmentLengthDistribution& fragLengthDist, - BiasParams& observedBiasParams, - /** - * NOTE : test new el model in future - * EffectiveLengthStats& obsEffLens, - */ - std::atomic& numAssignedFragments, - std::default_random_engine& randEng, bool initialRound, - std::atomic& burnedIn, double& maxZeroFrac, - distribution_utils::LogCMFCache& logCMFCache) { +void processMiniBatch( + ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc, + uint64_t firstTimestepOfRound, ReadLibrary& readLib, + const SalmonOpts& salmonOpts, AlnGroupVecRange batchHits, + std::vector& transcripts, ClusterForest& clusterForest, + FragmentLengthDistribution& fragLengthDist, BiasParams& observedBiasParams, + /** + * NOTE : test new el model in future + * EffectiveLengthStats& obsEffLens, + */ + std::atomic& numAssignedFragments, + std::default_random_engine& randEng, bool initialRound, + std::atomic& burnedIn, double& maxZeroFrac, + distribution_utils::LogCMFCache& logCMFCache) { using salmon::math::LOG_0; using salmon::math::LOG_1; @@ -200,7 +198,7 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc std::vector& fragStartDists = readExp.fragmentStartPositionDistributions(); - //auto& biasModel = readExp.sequenceBiasModel(); + // auto& biasModel = readExp.sequenceBiasModel(); auto& observedGCMass = observedBiasParams.observedGCMass; auto& obsFwd = observedBiasParams.massFwd; auto& obsRC = observedBiasParams.massRC; @@ -247,7 +245,8 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc double startingCumulativeMass = fmCalc.cumulativeLogMassAt(firstTimestepOfRound); - auto isUnexpectedOrphan = [](AlnT& aln, LibraryFormat expectedLibFormat) -> bool { + auto isUnexpectedOrphan = [](AlnT& aln, + LibraryFormat expectedLibFormat) -> bool { return (expectedLibFormat.type == ReadType::PAIRED_END and aln.mateStatus != 
MateStatus::PAIRED_END_PAIRED); }; @@ -257,9 +256,13 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc } const size_t maxCacheLen{salmonOpts.fragLenDistMax}; - // Caches to avoid fld updates _within_ the set of alignments of a fragment - auto fetchPMF = [&fragLengthDist](size_t l) -> double { return fragLengthDist.pmf(l); }; - auto fetchCMF = [&fragLengthDist](size_t l) -> double { return fragLengthDist.cmf(l); }; + // Caches to avoid fld updates _within_ the set of alignments of a fragment + auto fetchPMF = [&fragLengthDist](size_t l) -> double { + return fragLengthDist.pmf(l); + }; + auto fetchCMF = [&fragLengthDist](size_t l) -> double { + return fragLengthDist.cmf(l); + }; distribution_utils::IndexedVersionedCache pmfCache(maxCacheLen); distribution_utils::IndexedVersionedCache cmfCache(maxCacheLen); @@ -365,10 +368,15 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc // if we are modeling fragment probabilities for single-end mappings // and this is either a single-end library or an orphan. - if (modelSingleFragProb and useFragLengthDist and (singleEndLib or isUnexpectedOrphan(aln, expectedLibraryFormat))) { - // get the probability for a fragment of "ambiguous" length --- i.e. only the maximum length is bounded but - // the fragment length is not completely characterized. - logFragProb = logCMFCache.getAmbigFragLengthProb(aln.fwd, aln.pos, aln.readLen, transcript.CompleteLength, burnedIn.load()); + if (modelSingleFragProb and useFragLengthDist and + (singleEndLib or + isUnexpectedOrphan(aln, expectedLibraryFormat))) { + // get the probability for a fragment of "ambiguous" length --- i.e. + // only the maximum length is bounded but the fragment length is not + // completely characterized. 
+ logFragProb = logCMFCache.getAmbigFragLengthProb( + aln.fwd, aln.pos, aln.readLen, transcript.CompleteLength, + burnedIn.load()); } else if (isUnexpectedOrphan(aln, expectedLibraryFormat)) { // If we are expecting a paired-end library, and this is an orphan, // then logFragProb should be small @@ -626,13 +634,13 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc int32_t posFW = aln.fwd ? aln.pos : aln.matePos; int32_t posRC = aln.fwd ? aln.matePos : aln.pos; posFW = posFW < 0 ? 0 : posFW; - posFW = posFW >= static_cast(transcript.RefLength) ? - static_cast(transcript.RefLength) - 1 - : posFW; + posFW = posFW >= static_cast(transcript.RefLength) + ? static_cast(transcript.RefLength) - 1 + : posFW; posRC = posRC < 0 ? 0 : posRC; - posRC = posRC >= static_cast(transcript.RefLength) ? - static_cast(transcript.RefLength) - 1 - : posRC; + posRC = posRC >= static_cast(transcript.RefLength) + ? static_cast(transcript.RefLength) - 1 + : posRC; observedPosBiasFwd[lengthClassIndex].addMass( posFW, transcript.RefLength, aln.logProb); observedPosBiasRC[lengthClassIndex].addMass( @@ -644,8 +652,9 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc case MateStatus::SINGLE_END: { int32_t pos = aln.pos; pos = pos < 0 ? 0 : pos; - pos = pos >= static_cast(transcript.RefLength) ? - static_cast(transcript.RefLength) - 1 : pos; + pos = pos >= static_cast(transcript.RefLength) + ? 
static_cast(transcript.RefLength) - 1 + : pos; if (aln.fwd) { observedPosBiasFwd[lengthClassIndex].addMass( pos, transcript.RefLength, aln.logProb); @@ -664,7 +673,8 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc int32_t start = std::min(aln.pos, aln.matePos); int32_t stop = start + aln.fragLen - 1; // WITH CONTEXT - if (start >= 0 and stop < static_cast(transcript.RefLength)) { + if (start >= 0 and + stop < static_cast(transcript.RefLength)) { bool valid{false}; auto desc = transcript.gcDesc(start, stop, valid); if (valid) { @@ -684,7 +694,8 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc int32_t start = aln.fwd ? aln.pos : std::max(0, aln.pos - cmean); int32_t stop = start + cmean; // WITH CONTEXT - if (start >= 0 and stop < static_cast(transcript.RefLength)) { + if (start >= 0 and + stop < static_cast(transcript.RefLength)) { bool valid{false}; auto desc = transcript.gcDesc(start, stop, valid); if (valid) { @@ -701,7 +712,6 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc if (fragLength > 0) { fragLengthDist.addVal(fragLength, logForgettingMass); } - } } // end normalize @@ -720,7 +730,7 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc logForgettingMass, updateCounts); } - for(size_t i=0; i < libTypeCounts.size(); ++i) { + for (size_t i = 0; i < libTypeCounts.size(); ++i) { libTypeCounts[i] += (libTypeCountsPerFrag[i] > 0); } } // end read group @@ -745,7 +755,6 @@ void processMiniBatch(ReadExperimentT& readExp, ForgettingMassCalculator& fmCalc } } - /// START QUASI template void processReads( @@ -761,19 +770,17 @@ void processReads( * NOTE : test new el model in future * EffectiveLengthStats& obsEffLengths, **/ - SalmonOpts& salmonOpts, - std::mutex& iomutex, bool initialRound, std::atomic& burnedIn, - volatile bool& /*writeToCache*/, - MappingStatistics& mstats, - size_t /*threadID*/) { + SalmonOpts& salmonOpts, 
std::mutex& iomutex, bool initialRound, + std::atomic& burnedIn, volatile bool& /*writeToCache*/, + MappingStatistics& mstats, size_t /*threadID*/) { uint64_t count_fwd = 0, count_bwd = 0; - // Seed with a real random value, if available - #if defined(__linux) && defined(__GLIBCXX__) && __GLIBCXX__ >= 20200128 - std::random_device rd("/dev/urandom"); - #else - std::random_device rd; - #endif // defined(__GLIBCXX__) && __GLIBCXX__ >= 2020012 +// Seed with a real random value, if available +#if defined(__linux) && defined(__GLIBCXX__) && __GLIBCXX__ >= 20200128 + std::random_device rd("/dev/urandom"); +#else + std::random_device rd; +#endif // defined(__GLIBCXX__) && __GLIBCXX__ >= 2020012 // Create a random uniform distribution std::default_random_engine eng(rd()); @@ -806,17 +813,16 @@ void processReads( .seqBiasModelRC; // readExp.readBias(salmon::utils::Direction::REVERSE_COMPLEMENT); // k-mers for sequence bias context - //Mer leftMer; - //Mer rightMer; + // Mer leftMer; + // Mer rightMer; SBMer leftMer; SBMer rightMer; - uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep(); size_t minK = qidx->k(); size_t locRead{0}; - //uint64_t localUpperBoundHits{0}; + // uint64_t localUpperBoundHits{0}; size_t rangeSize{0}; uint64_t localNumAssignedFragments{0}; bool consistentHits = salmonOpts.consistentHits; @@ -833,11 +839,17 @@ void processReads( ksw2pp::KSW2Aligner aligner; pufferfish::util::AlignmentConfig aconf; pufferfish::util::MappingConstraintPolicy mpol; - bool initOK = salmon::mapping_utils::initMapperSettings(salmonOpts, memCollector, aligner, aconf, mpol); - PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), aconf, aligner); - - pufferfish::util::CachedVectorMap, std::hash> leftHits; - pufferfish::util::CachedVectorMap, std::hash> rightHits; + bool initOK = salmon::mapping_utils::initMapperSettings( + salmonOpts, memCollector, aligner, aconf, mpol); + PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), + 
aconf, aligner); + + pufferfish::util::CachedVectorMap< + size_t, std::vector, std::hash> + leftHits; + pufferfish::util::CachedVectorMap< + size_t, std::vector, std::hash> + rightHits; std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); @@ -898,15 +910,15 @@ void processReads( spdlog::drop_all(); std::exit(1); } - + LibraryFormat expectedLibraryFormat = rl.format(); bool tryAlign{salmonOpts.validateMappings}; for (size_t i = 0; i < rangeSize; ++i) { // For all the reads in this batch auto& rp = rg[i]; - readLen = static_cast(rp.first.seq.length()); - mateLen = static_cast(rp.second.seq.length()); + readLen = static_cast(rp.first.seq.length()); + mateLen = static_cast(rp.second.seq.length()); totLen = readLen + mateLen; // -- start resetting local variables @@ -934,43 +946,40 @@ void processReads( bool tooShortLeft = (readLenLeft < minK); bool tooShortRight = (readLenRight < minK); - bool lh = tooShortLeft ? false : - memCollector(rp.first.seq, - qc, - true, // isLeft - false // verbose - ); - bool rh = tooShortRight ? false : - memCollector(rp.second.seq, - qc, - false, // isLeft - false // verbose - ); - memCollector.findChains(rp.first.seq, - leftHits, - salmonOpts.fragLenDistMax, + bool lh = tooShortLeft ? false + : memCollector(rp.first.seq, qc, + true, // isLeft + false // verbose + ); + bool rh = tooShortRight ? 
false + : memCollector(rp.second.seq, qc, + false, // isLeft + false // verbose + ); + memCollector.findChains(rp.first.seq, leftHits, salmonOpts.fragLenDistMax, MateStatus::PAIRED_END_LEFT, useChainingHeuristic, // heuristic chaining - true, // isLeft - false // verbose - ); - memCollector.findChains(rp.second.seq, - rightHits, + true, // isLeft + false // verbose + ); + memCollector.findChains(rp.second.seq, rightHits, salmonOpts.fragLenDistMax, MateStatus::PAIRED_END_RIGHT, - useChainingHeuristic, // heuristic chaining - false, // isLeft - false // verbose - ); + useChainingHeuristic, // heuristic chaining + false, // isLeft + false // verbose + ); - hctr.numMappedAtLeastAKmer += (leftHits.size() > 0 || rightHits.size() > 0) ? 1 : 0; + hctr.numMappedAtLeastAKmer += + (leftHits.size() > 0 || rightHits.size() > 0) ? 1 : 0; /* salmonOpts.jointLog->info("\n\n mappings for left end \n\n"); for (auto&& h : leftHits) { salmonOpts.jointLog->info("hit to : {}", qidx->refName( h.first )); for (auto&& mc : *h.second) { - salmonOpts.jointLog->info("\t ori : {}, pos : {}, score : {}", (mc.isFw ? "fw" : "rc"), mc.getTrFirstHitPos(), mc.score); + salmonOpts.jointLog->info("\t ori : {}, pos : {}, score : {}", + (mc.isFw ? "fw" : "rc"), mc.getTrFirstHitPos(), mc.score); } } @@ -978,7 +987,8 @@ void processReads( for (auto&& h : rightHits) { salmonOpts.jointLog->info("hit to : {}", qidx->refName( h.first )); for (auto&& mc : *h.second) { - salmonOpts.jointLog->info("\t ori : {}, pos : {}, score : {}", (mc.isFw ? "fw" : "rc"), mc.getTrFirstHitPos(), mc.score); + salmonOpts.jointLog->info("\t ori : {}, pos : {}, score : {}", + (mc.isFw ? 
"fw" : "rc"), mc.getTrFirstHitPos(), mc.score); } } */ @@ -992,8 +1002,8 @@ void processReads( leftHits.end() ); rightHits.erase( std::remove_if(rightHits.begin(), rightHits.end(), [&transcripts](QuasiAlignment& a) { - return a.tid >= transcripts.size(); }), - rightHits.end() ); + return a.tid >= transcripts.size(); + }), rightHits.end() ); } */ @@ -1009,12 +1019,9 @@ void processReads( // // bool noDiscordant = false; - mergeRes = pufferfish::util::joinReadsAndFilter(leftHits, rightHits, jointHits, - salmonOpts.fragLenDistMax, - totLen, - memCollector.getConsensusFraction(), - firstDecoyIndex, - mpol, hctr); + mergeRes = pufferfish::util::joinReadsAndFilter( + leftHits, rightHits, jointHits, salmonOpts.fragLenDistMax, totLen, + memCollector.getConsensusFraction(), firstDecoyIndex, mpol, hctr); tooManyHits = jointHits.size() > salmonOpts.maxReadOccs; @@ -1023,13 +1030,17 @@ void processReads( // so (IU, ISF, ISR). If the library type is different // we should either raise a warning / error, or implement // library-type generic recovery. 
- bool mergeStatusOR = (mergeRes == pufferfish::util::MergeResult::HAD_EMPTY_INTERSECTION or - mergeRes == pufferfish::util::MergeResult::HAD_ONLY_LEFT or - mergeRes == pufferfish::util::MergeResult::HAD_ONLY_RIGHT); + bool mergeStatusOR = + (mergeRes == + pufferfish::util::MergeResult::HAD_EMPTY_INTERSECTION or + mergeRes == pufferfish::util::MergeResult::HAD_ONLY_LEFT or + mergeRes == pufferfish::util::MergeResult::HAD_ONLY_RIGHT); haveOrphans = mergeStatusOR; - if ( mergeStatusOR and salmonOpts.recoverOrphans and !tooManyHits ) { + if (mergeStatusOR and salmonOpts.recoverOrphans and !tooManyHits) { // TODO NOTE : do futher testing - bool recoveredAny = selective_alignment::utils::recoverOrphans(rp.first.seq, rp.second.seq, recoveredHits, jointHits, puffaligner, false); + bool recoveredAny = selective_alignment::utils::recoverOrphans( + rp.first.seq, rp.second.seq, recoveredHits, jointHits, + puffaligner, false); numOrphansRescued += recoveredAny ? 1 : 0; // if we recovered a mate, then we have no orphans. haveOrphans = !recoveredAny; @@ -1041,20 +1052,20 @@ void processReads( upperBoundHits += (jointHits.size() > 0); } - /* - salmonOpts.jointLog->info("\n\n mappings for joined ends \n\n"); - for (auto&& h : jointHits) { - salmonOpts.jointLog->info("hit to : {}", qidx->refName( h.tid )); - salmonOpts.jointLog->info("\t lpos : {}, rpos : {}, score : {}", h.leftClust->getTrFirstHitPos(), - h.rightClust->getTrFirstHitPos(), h.coverage()); - } - */ - + /* + salmonOpts.jointLog->info("\n\n mappings for joined ends \n\n"); + for (auto&& h : jointHits) { + salmonOpts.jointLog->info("hit to : {}", qidx->refName( h.tid )); + salmonOpts.jointLog->info("\t lpos : {}, rpos : {}, score : {}", + h.leftClust->getTrFirstHitPos(), h.rightClust->getTrFirstHitPos(), + h.coverage()); + } + */ - // FIXME: This clears the alignment group, but that contains nothing - // at this point. 
We should either check only once we are at the alignment - // phase (and therefore filter nothing based on pre-alignment hits), or - // clear the jointHits at this point. + // FIXME: This clears the alignment group, but that contains nothing + // at this point. We should either check only once we are at the + // alignment phase (and therefore filter nothing based on pre-alignment + // hits), or clear the jointHits at this point. // If the read mapped to > maxReadOccs places, discard it if (tooManyHits) { @@ -1064,36 +1075,42 @@ void processReads( // NOTE : Under our new definition of orphans, alignments of read ends // can be orphans even if the other read end aligns to the same reference. - // It only matters that the alignments were not paired. Thus, it is possible - // below to have the same reference appear on the left and right side of an orphan link. + // It only matters that the alignments were not paired. Thus, it is + // possible below to have the same reference appear on the left and right + // side of an orphan link. if (writeOrphanLinks) { // We have orphans if either // 1) there are *no* hits or // 2) there are hits for *both* the left and right reads, but not to the // same txp salmonOpts.jointLog->info("{} :: {} ", rp.first.name, rp.second.name); - if (haveOrphans and mergeRes == pufferfish::util::MergeResult::HAD_EMPTY_INTERSECTION) { + if (haveOrphans and + mergeRes == pufferfish::util::MergeResult::HAD_EMPTY_INTERSECTION) { auto it = jointHits.begin(); - // NOTE : if we have orphans from both left and right (and hence HAD_EMPTY_INTERSECTION) - // then joinReadsAndFilter will put all of the left orphans first, followed by right orphans. 
- while (it != jointHits.end() and it->isLeftAvailable() ) { - orphanLinks << qidx->getRefId(it->tid) << ',' << it->leftClust->firstRefPos() << "\t"; + // NOTE : if we have orphans from both left and right (and hence + // HAD_EMPTY_INTERSECTION) then joinReadsAndFilter will put all of the + // left orphans first, followed by right orphans. + while (it != jointHits.end() and it->isLeftAvailable()) { + orphanLinks << qidx->getRefId(it->tid) << ',' + << it->leftClust->firstRefPos() << "\t"; ++it; } orphanLinks << ":"; - while (it != jointHits.end() and it->isRightAvailable() ) { - orphanLinks << qidx->getRefId(it->tid) << ',' << it->rightClust->firstRefPos() << "\t"; + while (it != jointHits.end() and it->isRightAvailable()) { + orphanLinks << qidx->getRefId(it->tid) << ',' + << it->rightClust->firstRefPos() << "\t"; ++it; } orphanLinks << "\n"; } } - //salmonOpts.jointLog->info("num hits before alignment = {:n}", jointHits.size()); - // If we have mappings, then process them. + // salmonOpts.jointLog->info("num hits before alignment = {:n}", + // jointHits.size()); + // If we have mappings, then process them. if (!jointHits.empty()) { - bool isPaired = jointHits.front().mateStatus == - MateStatus::PAIRED_END_PAIRED; + bool isPaired = + jointHits.front().mateStatus == MateStatus::PAIRED_END_PAIRED; /* if (isPaired) { mapType = salmon::utils::MappingType::PAIRED_MAPPED; @@ -1115,114 +1132,134 @@ void processReads( puffaligner.clear(); puffaligner.getScoreStatus().reset(); msi.clear(jointHits.size()); - + size_t idx{0}; bool is_multimapping = (jointHits.size() > 1); - for (auto &&jointHit : jointHits) { - auto hitScore = puffaligner.calculateAlignments(rp.first.seq, rp.second.seq, jointHit, hctr, is_multimapping, false); + for (auto&& jointHit : jointHits) { + auto hitScore = puffaligner.calculateAlignments( + rp.first.seq, rp.second.seq, jointHit, hctr, is_multimapping, + false); bool validScore = (hitScore != invalidScore); numMappingsDropped += validScore ? 
0 : 1; auto tid = qidx->getRefId(jointHit.tid); - // ----- compatibility determination - // This code is to determine if a given PE mapping is _compatible_ or not with - // the expected library format. This is to remove stochasticity in mapping. - // Specifically, if there are equally highest-scoring alignments to a target - // we will always prefer the one that is compatible. + // ----- compatibility determination + // This code is to determine if a given PE mapping is _compatible_ + // or not with the expected library format. This is to remove + // stochasticity in mapping. Specifically, if there are equally + // highest-scoring alignments to a target we will always prefer the + // one that is compatible. - bool isUnstranded = expectedLibraryFormat.strandedness == ReadStrandedness::U; + bool isUnstranded = + expectedLibraryFormat.strandedness == ReadStrandedness::U; bool isOrphan = jointHit.isOrphan(); - + // if the protocol is unstranded: // (1) an orphan is always compatible - // (2) a paired-end mapping is compatible if the ends are on separate strands - bool isCompat = isUnstranded ? (isOrphan ? true : (jointHit.leftClust->isFw != jointHit.rightClust->isFw)) : false; + // (2) a paired-end mapping is compatible if the ends are on + // separate strands + bool isCompat = isUnstranded + ? (isOrphan ? 
true + : (jointHit.leftClust->isFw != + jointHit.rightClust->isFw)) + : false; // if the mapping hasn't been determined to be compatible yet if (!isCompat) { - // if this is an orphan mapping + // if this is an orphan mapping if (isOrphan) { bool isLeft = jointHit.isLeftAvailable(); // ISF // if the expectation is ISF, then this read is compatible if - // (1) we observed the left read and it is forward + // (1) we observed the left read and it is forward // (2) we observed the right read and it is not foward // ISR - // if the expectation is ISR, then this read is compatible if + // if the expectation is ISR, then this read is compatible if // (1) we observed the left read and it is not forward // (2) we observed the right read and it is foward - isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::SA) ? - ((isLeft and jointHit.leftClust->isFw) or (!isLeft and !jointHit.rightClust->isFw)) : - ((expectedLibraryFormat.strandedness == ReadStrandedness::AS) ? - ((isLeft and !jointHit.leftClust->isFw) or (!isLeft and jointHit.rightClust->isFw)) : false); + isCompat = + (expectedLibraryFormat.strandedness == ReadStrandedness::SA) + ? ((isLeft and jointHit.leftClust->isFw) or + (!isLeft and !jointHit.rightClust->isFw)) + : ((expectedLibraryFormat.strandedness == + ReadStrandedness::AS) + ? ((isLeft and !jointHit.leftClust->isFw) or + (!isLeft and jointHit.rightClust->isFw)) + : false); } else { bool leftIsFw = jointHit.leftClust->isFw; bool rightIsFw = jointHit.rightClust->isFw; // paired-end paired - isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::SA) ? - (leftIsFw and !rightIsFw) : - ((expectedLibraryFormat.strandedness == ReadStrandedness::AS) ? - (!leftIsFw and rightIsFw) : false); + isCompat = + (expectedLibraryFormat.strandedness == ReadStrandedness::SA) + ? (leftIsFw and !rightIsFw) + : ((expectedLibraryFormat.strandedness == + ReadStrandedness::AS) + ? 
(!leftIsFw and rightIsFw) + : false); } } - // ----- end compatibility determination + // ----- end compatibility determination // alternative compat /** - LibraryFormat lf(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U); - MateStatus ms = isOrphan ? - (jointHit.isLeftAvailable() ? MateStatus::PAIRED_END_LEFT : MateStatus::PAIRED_END_RIGHT) : + LibraryFormat lf(ReadType::SINGLE_END, ReadOrientation::NONE, + ReadStrandedness::U); MateStatus ms = isOrphan ? + (jointHit.isLeftAvailable() ? + MateStatus::PAIRED_END_LEFT : MateStatus::PAIRED_END_RIGHT) : MateStatus::PAIRED_END_PAIRED; switch (ms) { - case MateStatus::PAIRED_END_LEFT: + case MateStatus::PAIRED_END_LEFT: case MateStatus::PAIRED_END_RIGHT: { - lf = salmon::utils::hitType(jointHit.orphanClust()->getTrFirstHitPos(), jointHit.orphanClust()->isFw); - } break; - case MateStatus::PAIRED_END_PAIRED: { - uint32_t end1Pos = (jointHit.leftClust->isFw) ? jointHit.leftClust->getTrFirstHitPos() : - jointHit.leftClust->getTrFirstHitPos() + jointHit.leftClust->readLen; - uint32_t end2Pos = (jointHit.rightClust->isFw) ? jointHit.rightClust->getTrFirstHitPos() : - jointHit.rightClust->getTrFirstHitPos() + jointHit.rightClust->readLen; - bool canDovetail = false; lf = - salmon::utils::hitType(end1Pos, jointHit.leftClust->isFw, jointHit.leftClust->readLen, end2Pos, - jointHit.rightClust->isFw, jointHit.rightClust->readLen, canDovetail); - } break; - case MateStatus::SINGLE_END: { + salmon::utils::hitType(jointHit.orphanClust()->getTrFirstHitPos(), + jointHit.orphanClust()->isFw); } break; case + MateStatus::PAIRED_END_PAIRED: { uint32_t end1Pos = + (jointHit.leftClust->isFw) ? jointHit.leftClust->getTrFirstHitPos() + : jointHit.leftClust->getTrFirstHitPos() + + jointHit.leftClust->readLen; uint32_t end2Pos = + (jointHit.rightClust->isFw) ? 
+ jointHit.rightClust->getTrFirstHitPos() : + jointHit.rightClust->getTrFirstHitPos() + + jointHit.rightClust->readLen; bool canDovetail = false; lf = + salmon::utils::hitType(end1Pos, jointHit.leftClust->isFw, + jointHit.leftClust->readLen, end2Pos, jointHit.rightClust->isFw, + jointHit.rightClust->readLen, canDovetail); } break; case + MateStatus::SINGLE_END: { // do nothing } break; default: break; } - auto p = isOrphan ? jointHit.orphanClust()->getTrFirstHitPos() : jointHit.leftClust->getTrFirstHitPos(); - auto fw = isOrphan ? jointHit.orphanClust()->isFw : jointHit.leftClust->isFw; - bool isCompatRef = salmon::utils::isCompatible(lf, expectedLibraryFormat, static_cast(p), fw, ms); + auto p = isOrphan ? jointHit.orphanClust()->getTrFirstHitPos() : + jointHit.leftClust->getTrFirstHitPos(); auto fw = isOrphan ? + jointHit.orphanClust()->isFw : jointHit.leftClust->isFw; bool + isCompatRef = salmon::utils::isCompatible(lf, expectedLibraryFormat, + static_cast(p), fw, ms); if (isCompat != isCompatRef) { - std::cerr << "\n\n\nERROR: simple implemntation says compatiable is [" << isCompat << "], but ref. implementation says [" << isCompatRef << "]\n\n"; + std::cerr << "\n\n\nERROR: simple implemntation says compatiable + is [" << isCompat << "], but ref. 
implementation says [" << + isCompatRef << "]\n\n"; } **/ // end of alternative compat - salmon::mapping_utils::updateRefMappings(tid, hitScore, isCompat, idx, transcripts, invalidScore, msi); + salmon::mapping_utils::updateRefMappings( + tid, hitScore, isCompat, idx, transcripts, invalidScore, msi); ++idx; } bool bestHitDecoy = msi.haveOnlyDecoyMappings(); if (msi.bestScore > invalidScore and !bestHitDecoy) { - salmon::mapping_utils::filterAndCollectAlignments(jointHits, - readLen, - mateLen, - false, // true for single-end false otherwise - tryAlign, - hardFilter, - salmonOpts.scoreExp, - salmonOpts.minAlnProb, - msi, - jointAlignments); + salmon::mapping_utils::filterAndCollectAlignments( + jointHits, readLen, mateLen, + false, // true for single-end false otherwise + tryAlign, hardFilter, salmonOpts.scoreExp, + salmonOpts.minAlnProb, msi, jointAlignments); // if we have alignments if (!jointAlignments.empty()) { // chose the mapType based on the mate status @@ -1245,59 +1282,56 @@ void processReads( } else { // if we had decoy hits, our type is decoy, otherwise it's unmapped - mapType = bestHitDecoy ? salmon::utils::MappingType::DECOY : salmon::utils::MappingType::UNMAPPED; + mapType = bestHitDecoy ? salmon::utils::MappingType::DECOY + : salmon::utils::MappingType::UNMAPPED; numDecoyFrags += bestHitDecoy ? 
1 : 0; ++numFragsDropped; - // TODO: Create alignment objects for the decoys so that we can write - // decoy alignments to file - + // TODO: Create alignment objects for the decoys so that we can + // write decoy alignments to file + if (bestHitDecoy) { - salmon::mapping_utils::filterAndCollectAlignmentsDecoy(jointHits, - readLen, - mateLen, - false, // true for single-end false otherwise - tryAlign, - hardFilter, - salmonOpts.scoreExp, - salmonOpts.minAlnProb, - msi, - jointAlignments); + salmon::mapping_utils::filterAndCollectAlignmentsDecoy( + jointHits, readLen, mateLen, + false, // true for single-end false otherwise + tryAlign, hardFilter, salmonOpts.scoreExp, + salmonOpts.minAlnProb, msi, jointAlignments); } else { jointAlignmentGroup.clearAlignments(); } - - //jointAlignmentGroup.clearAlignments(); + + // jointAlignmentGroup.clearAlignments(); } } else if (isPaired and noDovetail) { - salmonOpts.jointLog->critical("This code path is not yet implemented!"); + salmonOpts.jointLog->critical( + "This code path is not yet implemented!"); jointAlignments.erase( - std::remove_if(jointAlignments.begin(), jointAlignments.end(), - [](const QuasiAlignment& h) -> bool { - if (h.fwd != h.mateIsFwd) { - if (h.fwd and (h.pos > h.matePos)) { - return true; - } else if (h.mateIsFwd and (h.matePos > h.pos)) { - return true; - } - } - return false; - }), - jointAlignments.end()); + std::remove_if(jointAlignments.begin(), jointAlignments.end(), + [](const QuasiAlignment& h) -> bool { + if (h.fwd != h.mateIsFwd) { + if (h.fwd and (h.pos > h.matePos)) { + return true; + } else if (h.mateIsFwd and + (h.matePos > h.pos)) { + return true; + } + } + return false; + }), + jointAlignments.end()); } if (writeQuasimappings) { - writeAlignmentsToStream(rp, formatter, - jointAlignments, - sstream, + writeAlignmentsToStream(rp, formatter, jointAlignments, sstream, true, // write orphans - true // transcript ID's already decoded (taking care of short refs) - ); + true // transcript ID's 
already decoded + // (taking care of short refs) + ); } // We've kept decoy aignments around to this point so that we can - // potentially write these alignments to the SAM file. However, if - // we got to this point and only have decoy mappings, then clear the - // mappings here because none of the procesing below is relevant for + // potentially write these alignments to the SAM file. However, if + // we got to this point and only have decoy mappings, then clear the + // mappings here because none of the procesing below is relevant for // decoys. if (mapType == salmon::utils::MappingType::DECOY) { jointAlignmentGroup.clearAlignments(); @@ -1338,8 +1372,10 @@ void processReads( bool success = false; if ((dir1 != dir2) and // Shouldn't be from the same strand - (startPos1 > 0 and startPos1 < static_cast(t.RefLength)) and - (startPos2 > 0 and startPos2 < static_cast(t.RefLength))) { + (startPos1 > 0 and + startPos1 < static_cast(t.RefLength)) and + (startPos2 > 0 and + startPos2 < static_cast(t.RefLength))) { const char* txpStart = t.Sequence(); const char* txpEnd = txpStart + t.RefLength; @@ -1361,17 +1397,18 @@ void processReads( if ((startPos1 >= readBias1.contextBefore(read1RC) and startPos1 + readBias1.contextAfter(read1RC) < - static_cast(t.RefLength)) and + static_cast(t.RefLength)) and (startPos2 >= readBias2.contextBefore(read2RC) and - startPos2 + readBias2.contextAfter(read2RC) < static_cast(t.RefLength))) { + startPos2 + readBias2.contextAfter(read2RC) < + static_cast(t.RefLength))) { int32_t fwPos = (h.fwd) ? startPos1 : startPos2; int32_t rcPos = (h.fwd) ? 
startPos2 : startPos1; if (fwPos < rcPos) { leftMer.fromChars(txpStart + startPos1 - - readBias1.contextBefore(read1RC)); + readBias1.contextBefore(read1RC)); rightMer.fromChars(txpStart + startPos2 - - readBias2.contextBefore(read2RC)); + readBias2.contextBefore(read2RC)); if (read1RC) { leftMer.rc(); } else { @@ -1428,7 +1465,7 @@ void processReads( if (writeUnmapped and mapType != salmon::utils::MappingType::PAIRED_MAPPED) { // If we have no mappings --- then there's nothing to do - // unless we're outputting names for un-mapped / decoy-mapped reads + // unless we're outputting names for un-mapped / decoy-mapped reads unmappedNames << rp.first.name << ' ' << salmon::utils::str(mapType) << '\n'; } @@ -1489,7 +1526,8 @@ void processReads( } prevObservedFrags = numObservedFragments; - AlnGroupVecRange hitLists = {structureVec.begin(), structureVec.begin()+rangeSize}; + AlnGroupVecRange hitLists = { + structureVec.begin(), structureVec.begin() + rangeSize}; /* if (cachingMappings) { @@ -1504,7 +1542,8 @@ void processReads( * NOTE : test new el model in future * obsEffLengths, */ - numAssignedFragments, eng, initialRound, burnedIn, maxZeroFrac, logCMFCache); + numAssignedFragments, eng, initialRound, burnedIn, maxZeroFrac, + logCMFCache); } if (maxZeroFrac > 0.0) { @@ -1522,7 +1561,8 @@ void processReads( salmonOpts.jointLog->info("Number of orphans rescued in this thread {}", numOrphansRescued); */ - //salmonOpts.jointLog->info("Score filtering dropped {} total mappings", numDropped); + // salmonOpts.jointLog->info("Score filtering dropped {} total mappings", + // numDropped); readExp.updateShortFrags(shortFragStats); } @@ -1544,56 +1584,52 @@ void processReads( * NOTE : test new el model in future * EffectiveLengthStats& obsEffLengths, **/ - SalmonOpts& salmonOpts, - std::mutex& iomutex, bool initialRound, std::atomic& burnedIn, - volatile bool& /*writeToCache*/, - MappingStatistics& mstats, - size_t /*threadID*/) { - - uint64_t count_fwd = 0, count_bwd = 0; - // 
Seed with a real random value, if available - #if defined(__linux) && defined(__GLIBCXX__) && __GLIBCXX__ >= 20200128 - std::random_device rd("/dev/urandom"); - #else - std::random_device rd; - #endif // defined(__GLIBCXX__) && __GLIBCXX__ >= 2020012 - - // Create a random uniform distribution - std::default_random_engine eng(rd()); - - uint64_t prevObservedFrags{1}; - uint64_t leftHitCount{0}; - uint64_t hitListCount{0}; - salmon::utils::ShortFragStats shortFragStats; - bool tooShort{false}; - double maxZeroFrac{0.0}; - // true below because in this function, we have a single-end library - distribution_utils::LogCMFCache logCMFCache(&fragLengthDist, true); - - // Write unmapped reads - fmt::MemoryWriter unmappedNames; - bool writeUnmapped = salmonOpts.writeUnmappedNames; - spdlog::logger* unmappedLogger = - (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr; - - auto& readBiasFW = observedBiasParams.seqBiasModelFW; - auto& readBiasRC = observedBiasParams.seqBiasModelRC; - //Mer context; - SBMer context; - - uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep(); - size_t minK = qidx->k(); - - size_t locRead{0}; - //uint64_t localUpperBoundHits{0}; - size_t rangeSize{0}; - - bool tooManyHits{false}; - size_t readLen{0}; - bool consistentHits{salmonOpts.consistentHits}; - bool quiet{salmonOpts.quiet}; + SalmonOpts& salmonOpts, std::mutex& iomutex, bool initialRound, + std::atomic& burnedIn, volatile bool& /*writeToCache*/, + MappingStatistics& mstats, size_t /*threadID*/) { + + uint64_t count_fwd = 0, count_bwd = 0; + // Seed with a real random value, if available +#if defined(__linux) && defined(__GLIBCXX__) && __GLIBCXX__ >= 20200128 + std::random_device rd("/dev/urandom"); +#else + std::random_device rd; +#endif // defined(__GLIBCXX__) && __GLIBCXX__ >= 2020012 + + // Create a random uniform distribution + std::default_random_engine eng(rd()); + uint64_t prevObservedFrags{1}; + uint64_t leftHitCount{0}; + uint64_t hitListCount{0}; + 
salmon::utils::ShortFragStats shortFragStats; + bool tooShort{false}; + double maxZeroFrac{0.0}; + // true below because in this function, we have a single-end library + distribution_utils::LogCMFCache logCMFCache(&fragLengthDist, true); + + // Write unmapped reads + fmt::MemoryWriter unmappedNames; + bool writeUnmapped = salmonOpts.writeUnmappedNames; + spdlog::logger* unmappedLogger = + (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr; + auto& readBiasFW = observedBiasParams.seqBiasModelFW; + auto& readBiasRC = observedBiasParams.seqBiasModelRC; + // Mer context; + SBMer context; + + uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep(); + size_t minK = qidx->k(); + + size_t locRead{0}; + // uint64_t localUpperBoundHits{0}; + size_t rangeSize{0}; + + bool tooManyHits{false}; + size_t readLen{0}; + bool consistentHits{salmonOpts.consistentHits}; + bool quiet{salmonOpts.quiet}; //******* Setting up pufferfish mapping constexpr const int32_t invalidScore = std::numeric_limits::min(); @@ -1602,10 +1638,14 @@ void processReads( ksw2pp::KSW2Aligner aligner; pufferfish::util::AlignmentConfig aconf; pufferfish::util::MappingConstraintPolicy mpol; - bool initOK = salmon::mapping_utils::initMapperSettings(salmonOpts, memCollector, aligner, aconf, mpol); - PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), aconf, aligner); - - pufferfish::util::CachedVectorMap, std::hash> hits; + bool initOK = salmon::mapping_utils::initMapperSettings( + salmonOpts, memCollector, aligner, aconf, mpol); + PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), + aconf, aligner); + + pufferfish::util::CachedVectorMap< + size_t, std::vector, std::hash> + hits; std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); @@ -1623,322 +1663,320 @@ void processReads( size_t numOrphansRescued{0}; //******* - bool hardFilter = salmonOpts.hardFilter; pufferfish::util::HitCounters hctr; salmon::utils::MappingType 
mapType{salmon::utils::MappingType::UNMAPPED}; - fmt::MemoryWriter sstream; - auto* qmLog = salmonOpts.qmLog.get(); - bool writeQuasimappings = (qmLog != nullptr); - - std::string rc1; rc1.reserve(300); - - size_t numMappingsDropped{0}; - size_t numFragsDropped{0}; - size_t numDecoyFrags{0}; - const double decoyThreshold = salmonOpts.decoyThreshold; - - salmon::mapping_utils::MappingScoreInfo msi(decoyThreshold); - // we only collect detailed decoy information if we will be - // writing output to SAM. - msi.collect_decoys(writeQuasimappings); - - //std::vector alnCache; alnCache.reserve(15); - AlnCacheMap alnCache; alnCache.reserve(16); - - auto rg = parser->getReadGroup(); - while (parser->refill(rg)) { - rangeSize = rg.size(); - if (rangeSize > structureVec.size()) { - salmonOpts.jointLog->error("rangeSize = {}, but structureVec.size() = {} " - "--- this shouldn't happen.\n" - "Please report this bug on GitHub", - rangeSize, structureVec.size()); - salmonOpts.jointLog->flush(); - spdlog::drop_all(); - std::exit(1); - } - - LibraryFormat expectedLibraryFormat = rl.format(); - - bool tryAlign{salmonOpts.validateMappings}; - for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch - auto& rp = rg[i]; - readLen = rp.seq.length(); - tooShort = (readLen < minK); - auto& jointHitGroup = structureVec[i]; - jointHitGroup.clearAlignments(); - auto& jointAlignments = jointHitGroup.alignments(); - - mapType = salmon::utils::MappingType::UNMAPPED; - hits.clear(); - jointHits.clear(); - memCollector.clear(); - jointAlignments.clear(); - tooManyHits = false; - - bool lh = tooShort ? 
false : - memCollector(rp.seq, - qc, - true, // isLeft - false // verbose - ); - - memCollector.findChains(rp.seq, - hits, - salmonOpts.fragLenDistMax, - MateStatus::SINGLE_END, - useChainingHeuristic, // heuristic chaining - true, // isLeft - false // verbose - ); - // TODO : PF_INTEGRATION -/* - if (!tryAlign) { - jointHits.erase( std::remove_if(jointHits.begin(), jointHits.end(), - [&transcripts](QuasiAlignment& a) { - return a.tid >= transcripts.size(); }), - jointHits.end() ); - } -*/ - - // If the fragment was too short, record it - if (tooShort) { - ++shortFragStats.numTooShort; - shortFragStats.shortest = std::min(shortFragStats.shortest, readLen); - } else { - pufferfish::util::joinReadsAndFilterSingle(hits, jointHits, - readLen, - memCollector.getConsensusFraction()); - hctr.peHits += jointHits.size(); - - if (initialRound) { - upperBoundHits += (jointHits.size() > 0); - } - - // FIXME: This clears the alignment group, but that contains nothing - // at this point. We should either check only once we are at the alignment - // phase (and therefore filter nothing based on pre-alignment hits), or - // clear the jointHits at this point. - - // If the read mapped to > maxReadOccs places, discard it - tooManyHits = jointHits.size() > salmonOpts.maxReadOccs; - if (tooManyHits) { - jointHitGroup.clearAlignments(); - } - - } - - - if (tryAlign and !jointHits.empty()) { - - // clear the aligner for this read - puffaligner.clear(); - puffaligner.getScoreStatus().reset(); - msi.clear(jointHits.size()); - - size_t idx{0}; - bool is_multimapping = (jointHits.size() > 1); - - for (auto &&jointHit : jointHits) { - auto hitScore = puffaligner.calculateAlignments(rp.seq, jointHit, hctr, is_multimapping, false); - bool validScore = (hitScore != invalidScore); - numMappingsDropped += validScore ? 
0 : 1; - auto tid = qidx->getRefId(jointHit.tid); - - bool isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::U) or - (jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::S)) or - (!jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::A)); - - salmon::mapping_utils::updateRefMappings( - tid, hitScore, isCompat, idx, transcripts, invalidScore, msi); - ++idx; - } - - bool bestHitDecoy = msi.haveOnlyDecoyMappings(); - if (msi.bestScore > invalidScore and !bestHitDecoy) { - salmon::mapping_utils::filterAndCollectAlignments(jointHits, - readLen, - readLen, - true, // true for single-end false otherwise - tryAlign, - hardFilter, - salmonOpts.scoreExp, - salmonOpts.minAlnProb, - msi, - jointAlignments); - // if we have any alignments, then they are - // just single mapped. - if (!jointAlignments.empty()) { - mapType = salmon::utils::MappingType::SINGLE_MAPPED; - } - } else { - // if we had decoy hits, our type is decoy, otherwise it's unmapped - mapType = (bestHitDecoy) ? salmon::utils::MappingType::DECOY : salmon::utils::MappingType::UNMAPPED; - numDecoyFrags += bestHitDecoy ? 1 : 0; - ++numFragsDropped; - if (bestHitDecoy) { - salmon::mapping_utils::filterAndCollectAlignmentsDecoy( - jointHits, readLen, readLen, - true, // true for single-end false otherwise - tryAlign, hardFilter, salmonOpts.scoreExp, - salmonOpts.minAlnProb, msi, - jointAlignments); - } else { - jointHitGroup.clearAlignments(); - } - } - } - - if (writeQuasimappings) { - writeAlignmentsToStreamSingle(rp, formatter, jointAlignments, sstream, false, true); - } - - // We've kept decoy aignments around to this point so that we can - // potentially write these alignments to the SAM file. However, if - // we got to this point and only have decoy mappings, then clear the - // mappings here because none of the procesing below is relevant for - // decoys. 
- if (mapType == salmon::utils::MappingType::DECOY) { - jointHitGroup.clearAlignments(); - } - - bool needBiasSample = salmonOpts.biasCorrect; - - std::uniform_int_distribution<> dis(0, jointAlignments.size()); - // Randomly select a hit from which to draw the bias sample. - int32_t hitSamp{dis(eng)}; - int32_t hn{0}; - - // ---- Collect bias samples ------ // - for (auto& h : jointAlignments) { - - int32_t pos = static_cast(h.pos); - - // If bias correction is turned on, and we haven't sampled a mapping - // for this read yet, and we haven't collected the required number of - // samples overall. - if (needBiasSample and salmonOpts.numBiasSamples > 0 and hn == hitSamp) { - // the "start" position is the leftmost position if - // we hit the forward strand, and the leftmost - // position + the read length if we hit the reverse complement - int32_t startPos = h.fwd ? pos : pos + h.readLen; - - auto& t = transcripts[h.tid]; - if (startPos > 0 and startPos < static_cast(t.RefLength)) { - auto& readBias = (h.fwd) ? readBiasFW : readBiasRC; - const char* txpStart = t.Sequence(); - - bool success{false}; - // If the context exists around the read, add it to the observed - // read start sequences. - if (startPos >= readBias.contextBefore(!h.fwd) and - startPos + readBias.contextAfter(!h.fwd) < static_cast(t.RefLength)) { - context.fromChars(txpStart + startPos - - readBias.contextBefore(!h.fwd)); - if (!h.fwd) { - context.rc(); - } - success = readBias.addSequence(context, 1.0); - } + fmt::MemoryWriter sstream; + auto* qmLog = salmonOpts.qmLog.get(); + bool writeQuasimappings = (qmLog != nullptr); + + std::string rc1; + rc1.reserve(300); + + size_t numMappingsDropped{0}; + size_t numFragsDropped{0}; + size_t numDecoyFrags{0}; + const double decoyThreshold = salmonOpts.decoyThreshold; + + salmon::mapping_utils::MappingScoreInfo msi(decoyThreshold); + // we only collect detailed decoy information if we will be + // writing output to SAM. 
+ msi.collect_decoys(writeQuasimappings); + + // std::vector alnCache; alnCache.reserve(15); + AlnCacheMap alnCache; + alnCache.reserve(16); - if (success) { - salmonOpts.numBiasSamples -= 1; - needBiasSample = false; + auto rg = parser->getReadGroup(); + while (parser->refill(rg)) { + rangeSize = rg.size(); + if (rangeSize > structureVec.size()) { + salmonOpts.jointLog->error("rangeSize = {}, but structureVec.size() = {} " + "--- this shouldn't happen.\n" + "Please report this bug on GitHub", + rangeSize, structureVec.size()); + salmonOpts.jointLog->flush(); + spdlog::drop_all(); + std::exit(1); + } + + LibraryFormat expectedLibraryFormat = rl.format(); + + bool tryAlign{salmonOpts.validateMappings}; + for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch + auto& rp = rg[i]; + readLen = rp.seq.length(); + tooShort = (readLen < minK); + auto& jointHitGroup = structureVec[i]; + jointHitGroup.clearAlignments(); + auto& jointAlignments = jointHitGroup.alignments(); + + mapType = salmon::utils::MappingType::UNMAPPED; + hits.clear(); + jointHits.clear(); + memCollector.clear(); + jointAlignments.clear(); + tooManyHits = false; + + bool lh = tooShort ? 
false + : memCollector(rp.seq, qc, + true, // isLeft + false // verbose + ); + + memCollector.findChains(rp.seq, hits, salmonOpts.fragLenDistMax, + MateStatus::SINGLE_END, + useChainingHeuristic, // heuristic chaining + true, // isLeft + false // verbose + ); + // TODO : PF_INTEGRATION + /* + if (!tryAlign) { + jointHits.erase( std::remove_if(jointHits.begin(), + jointHits.end(), + [&transcripts](QuasiAlignment& a) + { return a.tid >= transcripts.size(); }), jointHits.end() ); } - } - } - // ---- Collect bias samples ------ // - - switch (h.mateStatus) { - case MateStatus::SINGLE_END: { - h.format = salmon::utils::hitType(h.pos, h.fwd); - } break; - default: - break; - } - } - - if (writeUnmapped and mapType != salmon::utils::MappingType::SINGLE_MAPPED) { - // If we have no mappings --- then there's nothing to do - // unless we're outputting names for un-mapped / decoy mapped reads - unmappedNames << rp.name << ' ' << salmon::utils::str(mapType) - << '\n'; - } - - validHits += jointAlignments.size(); - locRead++; - ++numObservedFragments; - if (!quiet and numObservedFragments % 500000 == 0) { - iomutex.lock(); - const char RESET_COLOR[] = "\x1b[0m"; - char green[] = "\x1b[30m"; - green[3] = '0' + static_cast(fmt::GREEN); - char red[] = "\x1b[30m"; - red[3] = '0' + static_cast(fmt::RED); - if (initialRound) { - fmt::print(stderr, "\033[A\r\r{}processed{} {:n} {}fragments{}\n", - green, red, numObservedFragments, green, RESET_COLOR); - fmt::print(stderr, "hits: {:n}; hits per frag: {}", validHits, - validHits / static_cast(prevObservedFrags)); - } else { - fmt::print(stderr, "\r\r{}processed{} {:n} {}fragments{}", green, red, - numObservedFragments, green, RESET_COLOR); - } - iomutex.unlock(); - } - - } // end for i < j->nb_filled - - if (writeUnmapped) { - std::string outStr(unmappedNames.str()); - // Get rid of last newline - if (!outStr.empty()) { - outStr.pop_back(); - unmappedLogger->info(std::move(outStr)); - } - unmappedNames.clear(); - } - - if 
(writeQuasimappings) { - std::string outStr(sstream.str()); - // Get rid of last newline - if (!outStr.empty()) { - outStr.pop_back(); - qmLog->info(std::move(outStr)); - } - sstream.clear(); - } - - prevObservedFrags = numObservedFragments; - AlnGroupVecRange hitLists = {structureVec.begin(), structureVec.begin()+rangeSize}; - /*boost::make_iterator_range( - structurevec.begin(), structurevec.begin() + rangesize);*/ - processMiniBatch( - readExp, fmCalc, firstTimestepOfRound, rl, salmonOpts, hitLists, - transcripts, clusterForest, fragLengthDist, observedBiasParams, - /** - * NOTE : test new el model in future - * obsEffLengths, - **/ - numAssignedFragments, eng, initialRound, burnedIn, maxZeroFrac, logCMFCache); - } - readExp.updateShortFrags(shortFragStats); - - if (maxZeroFrac > 0.0) { - salmonOpts.jointLog->info("Thread saw mini-batch with a maximum of " - "{0:.2f}\% zero probability fragments", - maxZeroFrac); - } - mstats.numMappingsFiltered += numMappingsDropped; - mstats.numFragmentsFiltered += numFragsDropped; - mstats.numDecoyFragments += numDecoyFrags; + */ + + // If the fragment was too short, record it + if (tooShort) { + ++shortFragStats.numTooShort; + shortFragStats.shortest = std::min(shortFragStats.shortest, readLen); + } else { + pufferfish::util::joinReadsAndFilterSingle( + hits, jointHits, readLen, memCollector.getConsensusFraction()); + hctr.peHits += jointHits.size(); + + if (initialRound) { + upperBoundHits += (jointHits.size() > 0); + } + + // FIXME: This clears the alignment group, but that contains nothing + // at this point. We should either check only once we are at the + // alignment phase (and therefore filter nothing based on pre-alignment + // hits), or clear the jointHits at this point. 
+ + // If the read mapped to > maxReadOccs places, discard it + tooManyHits = jointHits.size() > salmonOpts.maxReadOccs; + if (tooManyHits) { + jointHitGroup.clearAlignments(); + } + } + + if (tryAlign and !jointHits.empty()) { + + // clear the aligner for this read + puffaligner.clear(); + puffaligner.getScoreStatus().reset(); + msi.clear(jointHits.size()); + + size_t idx{0}; + bool is_multimapping = (jointHits.size() > 1); + + for (auto&& jointHit : jointHits) { + auto hitScore = puffaligner.calculateAlignments( + rp.seq, jointHit, hctr, is_multimapping, false); + bool validScore = (hitScore != invalidScore); + numMappingsDropped += validScore ? 0 : 1; + auto tid = qidx->getRefId(jointHit.tid); + + bool isCompat = + (expectedLibraryFormat.strandedness == ReadStrandedness::U) or + (jointHit.orphanClust()->isFw and + (expectedLibraryFormat.strandedness == ReadStrandedness::S)) or + (!jointHit.orphanClust()->isFw and + (expectedLibraryFormat.strandedness == ReadStrandedness::A)); + + salmon::mapping_utils::updateRefMappings( + tid, hitScore, isCompat, idx, transcripts, invalidScore, msi); + ++idx; + } + + bool bestHitDecoy = msi.haveOnlyDecoyMappings(); + if (msi.bestScore > invalidScore and !bestHitDecoy) { + salmon::mapping_utils::filterAndCollectAlignments( + jointHits, readLen, readLen, + true, // true for single-end false otherwise + tryAlign, hardFilter, salmonOpts.scoreExp, salmonOpts.minAlnProb, + msi, jointAlignments); + // if we have any alignments, then they are + // just single mapped. + if (!jointAlignments.empty()) { + mapType = salmon::utils::MappingType::SINGLE_MAPPED; + } + } else { + // if we had decoy hits, our type is decoy, otherwise it's unmapped + mapType = (bestHitDecoy) ? salmon::utils::MappingType::DECOY + : salmon::utils::MappingType::UNMAPPED; + numDecoyFrags += bestHitDecoy ? 
1 : 0; + ++numFragsDropped; + if (bestHitDecoy) { + salmon::mapping_utils::filterAndCollectAlignmentsDecoy( + jointHits, readLen, readLen, + true, // true for single-end false otherwise + tryAlign, hardFilter, salmonOpts.scoreExp, + salmonOpts.minAlnProb, msi, jointAlignments); + } else { + jointHitGroup.clearAlignments(); + } + } + } + + if (writeQuasimappings) { + writeAlignmentsToStreamSingle(rp, formatter, jointAlignments, sstream, + false, true); + } + + // We've kept decoy aignments around to this point so that we can + // potentially write these alignments to the SAM file. However, if + // we got to this point and only have decoy mappings, then clear the + // mappings here because none of the procesing below is relevant for + // decoys. + if (mapType == salmon::utils::MappingType::DECOY) { + jointHitGroup.clearAlignments(); + } + + bool needBiasSample = salmonOpts.biasCorrect; + + std::uniform_int_distribution<> dis(0, jointAlignments.size()); + // Randomly select a hit from which to draw the bias sample. + int32_t hitSamp{dis(eng)}; + int32_t hn{0}; + + // ---- Collect bias samples ------ // + for (auto& h : jointAlignments) { + + int32_t pos = static_cast(h.pos); + + // If bias correction is turned on, and we haven't sampled a mapping + // for this read yet, and we haven't collected the required number of + // samples overall. + if (needBiasSample and salmonOpts.numBiasSamples > 0 and + hn == hitSamp) { + // the "start" position is the leftmost position if + // we hit the forward strand, and the leftmost + // position + the read length if we hit the reverse complement + int32_t startPos = h.fwd ? pos : pos + h.readLen; + + auto& t = transcripts[h.tid]; + if (startPos > 0 and startPos < static_cast(t.RefLength)) { + auto& readBias = (h.fwd) ? readBiasFW : readBiasRC; + const char* txpStart = t.Sequence(); + + bool success{false}; + // If the context exists around the read, add it to the observed + // read start sequences. 
+ if (startPos >= readBias.contextBefore(!h.fwd) and + startPos + readBias.contextAfter(!h.fwd) < + static_cast(t.RefLength)) { + context.fromChars(txpStart + startPos - + readBias.contextBefore(!h.fwd)); + if (!h.fwd) { + context.rc(); + } + success = readBias.addSequence(context, 1.0); + } + + if (success) { + salmonOpts.numBiasSamples -= 1; + needBiasSample = false; + } + } + } + // ---- Collect bias samples ------ // + + switch (h.mateStatus) { + case MateStatus::SINGLE_END: { + h.format = salmon::utils::hitType(h.pos, h.fwd); + } break; + default: + break; + } + } + + if (writeUnmapped and + mapType != salmon::utils::MappingType::SINGLE_MAPPED) { + // If we have no mappings --- then there's nothing to do + // unless we're outputting names for un-mapped / decoy mapped reads + unmappedNames << rp.name << ' ' << salmon::utils::str(mapType) << '\n'; + } + + validHits += jointAlignments.size(); + locRead++; + ++numObservedFragments; + if (!quiet and numObservedFragments % 500000 == 0) { + iomutex.lock(); + const char RESET_COLOR[] = "\x1b[0m"; + char green[] = "\x1b[30m"; + green[3] = '0' + static_cast(fmt::GREEN); + char red[] = "\x1b[30m"; + red[3] = '0' + static_cast(fmt::RED); + if (initialRound) { + fmt::print(stderr, "\033[A\r\r{}processed{} {:n} {}fragments{}\n", + green, red, numObservedFragments, green, RESET_COLOR); + fmt::print(stderr, "hits: {:n}; hits per frag: {}", validHits, + validHits / static_cast(prevObservedFrags)); + } else { + fmt::print(stderr, "\r\r{}processed{} {:n} {}fragments{}", green, red, + numObservedFragments, green, RESET_COLOR); + } + iomutex.unlock(); + } + + } // end for i < j->nb_filled + + if (writeUnmapped) { + std::string outStr(unmappedNames.str()); + // Get rid of last newline + if (!outStr.empty()) { + outStr.pop_back(); + unmappedLogger->info(std::move(outStr)); + } + unmappedNames.clear(); + } + + if (writeQuasimappings) { + std::string outStr(sstream.str()); + // Get rid of last newline + if (!outStr.empty()) { + 
outStr.pop_back(); + qmLog->info(std::move(outStr)); + } + sstream.clear(); + } + + prevObservedFrags = numObservedFragments; + AlnGroupVecRange hitLists = { + structureVec.begin(), structureVec.begin() + rangeSize}; + /*boost::make_iterator_range( + structurevec.begin(), structurevec.begin() + rangesize);*/ + processMiniBatch( + readExp, fmCalc, firstTimestepOfRound, rl, salmonOpts, hitLists, + transcripts, clusterForest, fragLengthDist, observedBiasParams, + /** + * NOTE : test new el model in future + * obsEffLengths, + **/ + numAssignedFragments, eng, initialRound, burnedIn, maxZeroFrac, + logCMFCache); + } + readExp.updateShortFrags(shortFragStats); + + if (maxZeroFrac > 0.0) { + salmonOpts.jointLog->info("Thread saw mini-batch with a maximum of " + "{0:.2f}\% zero probability fragments", + maxZeroFrac); + } + mstats.numMappingsFiltered += numMappingsDropped; + mstats.numFragmentsFiltered += numFragsDropped; + mstats.numDecoyFragments += numDecoyFrags; } /// DONE QUASI - template void processReadLibrary( ReadExperimentT& readExp, ReadLibrary& rl, SalmonIndex* sidx, @@ -1950,10 +1988,10 @@ void processReadLibrary( std::atomic& upperBoundHits, // upper bound on # of mapped frags bool initialRound, std::atomic& burnedIn, ForgettingMassCalculator& fmCalc, - FragmentLengthDistribution& fragLengthDist, - SalmonOpts& salmonOpts, + FragmentLengthDistribution& fragLengthDist, SalmonOpts& salmonOpts, std::mutex& iomutex, size_t numThreads, - std::vector>& structureVec, volatile bool& writeToCache, MappingStatistics& mstats) { + std::vector>& structureVec, volatile bool& writeToCache, + MappingStatistics& mstats) { std::vector threads; @@ -1981,9 +2019,9 @@ void processReadLibrary( }; /** C++14 version **/ std::unique_ptr pairedParserPtr( - nullptr, parserPtrDeleter); + nullptr, parserPtrDeleter); std::unique_ptr singleParserPtr( - nullptr, parserPtrDeleter); + nullptr, parserPtrDeleter); /** sequence-specific and GC-fragment bias vectors --- each thread gets it's * own 
**/ @@ -1996,19 +2034,18 @@ void processReadLibrary( * std::vector observedEffectiveLengths(numThreads, *EffectiveLengthStats(numTxp)); **/ - // NOTE : When we can support C++14, we can replace the entire ProcessFunctor class above with this - // generic lambda. + // NOTE : When we can support C++14, we can replace the entire ProcessFunctor + // class above with this generic lambda. auto processFunctor = [&](size_t i, auto* parserPtr, auto* index) { if (salmonOpts.qmFileName != "" and i == 0) { writeSAMHeader(*index, salmonOpts.qmLog); } auto threadFun = [&, i, parserPtr, index]() -> void { processReads(parserPtr, readExp, rl, structureVec[i], - numObservedFragments, numAssignedFragments, numValidHits, - upperBoundHits, index, transcripts, - fmCalc, clusterForest, fragLengthDist, observedBiasParams[i], - salmonOpts, iomutex, initialRound, - burnedIn, writeToCache, mstats, i); + numObservedFragments, numAssignedFragments, numValidHits, + upperBoundHits, index, transcripts, fmCalc, clusterForest, + fragLengthDist, observedBiasParams[i], salmonOpts, iomutex, + initialRound, burnedIn, writeToCache, mstats, i); }; threads.emplace_back(threadFun); }; @@ -2058,31 +2095,35 @@ void processReadLibrary( fmt::MemoryWriter infostr; infostr << "This version of salmon does not support RapMap-based indexing."; throw std::invalid_argument(infostr.str()); - } - break; + } break; case SalmonIndexType::PUFF: { bool isSparse = sidx->isSparse(); for (size_t i = 0; i < numThreads; ++i) { // NOTE: we *must* capture i by value here, b/c it can (sometimes, does) // change value before the lambda below is evaluated --- crazy! 
if (isSparse) { - if (isPairedEnd) {processFunctor(i, pairedParserPtr.get(), sidx->puffSparseIndex());} - else if (isSingleEnd) {processFunctor(i, singleParserPtr.get(), sidx->puffSparseIndex());} + if (isPairedEnd) { + processFunctor(i, pairedParserPtr.get(), sidx->puffSparseIndex()); + } else if (isSingleEnd) { + processFunctor(i, singleParserPtr.get(), sidx->puffSparseIndex()); + } } else { // dense index - if (isPairedEnd) {processFunctor(i, pairedParserPtr.get(), sidx->puffIndex());} - else if (isSingleEnd) {processFunctor(i, singleParserPtr.get(), sidx->puffIndex());} + if (isPairedEnd) { + processFunctor(i, pairedParserPtr.get(), sidx->puffIndex()); + } else if (isSingleEnd) { + processFunctor(i, singleParserPtr.get(), sidx->puffIndex()); + } } } - } - break; + } break; } // end switch for (auto& t : threads) { t.join(); } - // At this point, if we were using decoy transcripts, we don't need them anymore and can get - // rid of them. + // At this point, if we were using decoy transcripts, we don't need them + // anymore and can get rid of them. 
readExp.dropDecoyTranscripts(); // If we don't have a sufficient number of assigned fragments, then @@ -2091,7 +2132,7 @@ void processReadLibrary( readExp.setNumObservedFragments(numObservedFragments); readExp.numAssignedFragmentsAtomic().store(numAssignedFragments); double mappingRate = numAssignedFragments.load() / - static_cast(numObservedFragments.load()); + static_cast(numObservedFragments.load()); readExp.setEffectiveMappingRate(mappingRate); throw InsufficientAssignedFragments(numAssignedFragments.load(), salmonOpts.minRequiredFrags); @@ -2124,8 +2165,7 @@ void processReadLibrary( auto& gcm = gcp.observedGCMass; globalGCMass.combineCounts(gcm); - auto& fw = - readExp.readBiasModelObserved(salmon::utils::Direction::FORWARD); + auto& fw = readExp.readBiasModelObserved(salmon::utils::Direction::FORWARD); auto& rc = readExp.readBiasModelObserved( salmon::utils::Direction::REVERSE_COMPLEMENT); @@ -2135,8 +2175,8 @@ void processReadLibrary( rc.combineCounts(rcloc); /** - * positional biases - **/ + * positional biases + **/ auto& posBiasesFW = readExp.posBias(salmon::utils::Direction::FORWARD); auto& posBiasesRC = readExp.posBias(salmon::utils::Direction::REVERSE_COMPLEMENT); @@ -2185,10 +2225,8 @@ void processReadLibrary( * */ template -void quantifyLibrary(ReadExperimentT& experiment, - SalmonOpts& salmonOpts, - MappingStatistics& mstats, - uint32_t numQuantThreads) { +void quantifyLibrary(ReadExperimentT& experiment, SalmonOpts& salmonOpts, + MappingStatistics& mstats, uint32_t numQuantThreads) { bool burnedIn = (salmonOpts.numBurninFrags == 0); uint64_t numRequiredFragments = salmonOpts.numRequiredFragments; @@ -2271,12 +2309,10 @@ void quantifyLibrary(ReadExperimentT& experiment, FragmentLengthDistribution& fragLengthDist, std::atomic& numAssignedFragments, size_t numQuantThreads, std::atomic& burnedIn) -> void { - processReadLibrary(experiment, rl, sidx, transcripts, clusterForest, numObservedFragments, totalAssignedFragments, upperBoundHits, initialRound, 
burnedIn, fmCalc, - fragLengthDist, salmonOpts, - ioMutex, + fragLengthDist, salmonOpts, ioMutex, numQuantThreads, groupVec, writeToCache, mstats); numAssignedFragments = totalAssignedFragments - prevNumAssignedFragments; @@ -2349,15 +2385,26 @@ void quantifyLibrary(ReadExperimentT& experiment, } if (salmonOpts.recoverOrphans) { - salmonOpts.jointLog->info("Number of orphans recovered using orphan rescue : {:n}", mstats.numOrphansRescued.load()); + salmonOpts.jointLog->info( + "Number of orphans recovered using orphan rescue : {:n}", + mstats.numOrphansRescued.load()); } if (salmonOpts.validateMappings) { - salmonOpts.jointLog->info("Number of mappings discarded because of alignment score : {:n}", mstats.numMappingsFiltered.load()); - salmonOpts.jointLog->info("Number of fragments entirely discarded because of alignment score : {:n}", mstats.numFragmentsFiltered.load()); - salmonOpts.jointLog->info("Number of fragments discarded because they are best-mapped to decoys : {:n}", mstats.numDecoyFragments.load()); + salmonOpts.jointLog->info( + "Number of mappings discarded because of alignment score : {:n}", + mstats.numMappingsFiltered.load()); + salmonOpts.jointLog->info("Number of fragments entirely discarded because " + "of alignment score : {:n}", + mstats.numFragmentsFiltered.load()); + salmonOpts.jointLog->info("Number of fragments discarded because they are " + "best-mapped to decoys : {:n}", + mstats.numDecoyFragments.load()); } if (!salmonOpts.allowDovetail) { - salmonOpts.jointLog->info("Number of fragments discarded because they have only dovetail (discordant) mappings to valid targets : {:n}", mstats.numDovetails.load()); + salmonOpts.jointLog->info( + "Number of fragments discarded because they have only dovetail " + "(discordant) mappings to valid targets : {:n}", + mstats.numDovetails.load()); } // If we didn't achieve burnin, then at least compute effective @@ -2386,7 +2433,7 @@ void quantifyLibrary(ReadExperimentT& experiment, 
prevNumObservedFragments); experiment.setUpperBoundHits(upperBoundHits.load()); double mappingRate = totalAssignedFragments.load() / - static_cast(numObservedFragments.load()); + static_cast(numObservedFragments.load()); experiment.setEffectiveMappingRate(mappingRate); } @@ -2401,13 +2448,13 @@ void quantifyLibrary(ReadExperimentT& experiment, unmappedLogger->flush(); } } - } -int salmonQuantify(int argc, const char* argv[], std::unique_ptr& salmonIndex) { +int salmonQuantify(int argc, const char* argv[], + std::unique_ptr& salmonIndex) { using std::cerr; - using std::vector; using std::string; + using std::vector; namespace bfs = boost::filesystem; namespace po = boost::program_options; @@ -2428,7 +2475,13 @@ int salmonQuantify(int argc, const char* argv[], std::unique_ptr& s auto deprecatedOpt = pogen.getDeprecatedOptions(sopt); po::options_description all("salmon quant options"); - all.add(inputOpt).add(basicOpt).add(mapSpecOpt).add(advancedOpt).add(testingOpt).add(hiddenOpt).add(deprecatedOpt); + all.add(inputOpt) + .add(basicOpt) + .add(mapSpecOpt) + .add(advancedOpt) + .add(testingOpt) + .add(hiddenOpt) + .add(deprecatedOpt); po::options_description visible("salmon quant options"); visible.add(inputOpt).add(basicOpt).add(mapSpecOpt).add(advancedOpt); @@ -2460,7 +2513,8 @@ transcript abundance from RNA-seq reads } std::stringstream commentStream; - commentStream << "### salmon (selective-alignment-based) v" << salmon::version << "\n"; + commentStream << "### salmon (selective-alignment-based) v" + << salmon::version << "\n"; commentStream << "### [ program ] => salmon \n"; commentStream << "### [ command ] => quant \n"; for (auto& opt : orderedOptions.options) { @@ -2495,7 +2549,7 @@ transcript abundance from RNA-seq reads // ==== Library format processing === vector readLibraries = - salmon::utils::extractReadLibraries(orderedOptions); + salmon::utils::extractReadLibraries(orderedOptions); if (readLibraries.size() == 0) { jointLog->error( @@ -2508,7 +2562,7 
@@ transcript abundance from RNA-seq reads } // ==== END: Library format processing === - if(!salmonIndex) { + if (!salmonIndex) { salmonIndex = checkLoadIndex(indexDirectory, sopt.jointLog); } @@ -2517,7 +2571,8 @@ transcript abundance from RNA-seq reads // This will be the class in charge of maintaining our // rich equivalence classes - experiment.equivalenceClassBuilder().setMaxResizeThreads(sopt.maxHashResizeThreads); + experiment.equivalenceClassBuilder().setMaxResizeThreads( + sopt.maxHashResizeThreads); experiment.equivalenceClassBuilder().start(); auto indexType = experiment.getIndex()->indexType(); @@ -2531,7 +2586,8 @@ transcript abundance from RNA-seq reads } break; case SalmonIndexType::QUASI: { fmt::MemoryWriter infostr; - infostr << "This version of salmon does not support RapMap-based indexing."; + infostr + << "This version of salmon does not support RapMap-based indexing."; throw std::invalid_argument(infostr.str()); } break; case SalmonIndexType::PUFF: { @@ -2551,8 +2607,8 @@ transcript abundance from RNA-seq reads sopt.allowOrphans = !sopt.discardOrphansQuasi; sopt.useQuasi = true; - quantifyLibrary(experiment, - sopt, mstats, sopt.numThreads); + quantifyLibrary(experiment, sopt, mstats, + sopt.numThreads); } break; } } catch (const InsufficientAssignedFragments& iaf) { @@ -2593,9 +2649,9 @@ transcript abundance from RNA-seq reads if (!optSuccess) { jointLog->error( - "The optimization algorithm failed. This is likely the result of " - "bad input (or a bug). If you cannot track down the cause, please " - "report this issue on GitHub."); + "The optimization algorithm failed. This is likely the result of " + "bad input (or a bug). 
If you cannot track down the cause, please " + "report this issue on GitHub."); return 1; } jointLog->info("Finished optimizer"); @@ -2612,14 +2668,14 @@ transcript abundance from RNA-seq reads gzw.setSamplingPath(sopt); // The function we'll use as a callback to write samples std::function&)> bsWriter = - [&gzw](const std::vector& alphas) -> bool { - return gzw.writeBootstrap(alphas, true); - }; + [&gzw](const std::vector& alphas) -> bool { + return gzw.writeBootstrap(alphas, true); + }; bool sampleSuccess = - // sampler.sampleMultipleChains(experiment, sopt, bsWriter, - // sopt.numGibbsSamples); - sampler.sample(experiment, sopt, bsWriter, sopt.numGibbsSamples); + // sampler.sampleMultipleChains(experiment, sopt, bsWriter, + // sopt.numGibbsSamples); + sampler.sample(experiment, sopt, bsWriter, sopt.numGibbsSamples); if (!sampleSuccess) { jointLog->error("Encountered error during Gibbs sampling.\n" "This should not happen.\n" @@ -2631,13 +2687,13 @@ transcript abundance from RNA-seq reads gzw.setSamplingPath(sopt); // The function we'll use as a callback to write samples std::function&)> bsWriter = - [&gzw](const std::vector& alphas) -> bool { - return gzw.writeBootstrap(alphas); - }; + [&gzw](const std::vector& alphas) -> bool { + return gzw.writeBootstrap(alphas); + }; jointLog->info("Starting Bootstrapping"); bool bootstrapSuccess = - optimizer.gatherBootstraps(experiment, sopt, bsWriter, 0.01, 10000); + optimizer.gatherBootstraps(experiment, sopt, bsWriter, 0.01, 10000); jointLog->info("Finished Bootstrapping"); if (!bootstrapSuccess) { jointLog->error("Encountered error during bootstrapping.\n" @@ -2647,7 +2703,8 @@ transcript abundance from RNA-seq reads } } - /** If the user requested gene-level abundances, then compute those now **/ + /** If the user requested gene-level abundances, then compute those now + * **/ if (vm.count("geneMap")) { try { salmon::utils::generateGeneLevelEstimates(sopt.geneMapPath, @@ -2661,10 +2718,10 @@ transcript abundance from 
RNA-seq reads } } else if (sopt.dumpEqWeights) { // sopt.skipQuant == true jointLog->info("Finalizing combined weights for equivalence classes."); - // if we are skipping the quantification, and we are dumping equivalence class weights, - // then fill in the combinedWeights of the equivalence classes so that `--dumpEqWeights` makes sense. - auto& eqVec = - experiment.equivalenceClassBuilder().eqVec(); + // if we are skipping the quantification, and we are dumping equivalence + // class weights, then fill in the combinedWeights of the equivalence + // classes so that `--dumpEqWeights` makes sense. + auto& eqVec = experiment.equivalenceClassBuilder().eqVec(); bool noRichEq = sopt.noRichEqClasses; bool useEffectiveLengths = !sopt.noEffectiveLengthCorrection; std::vector& transcripts = experiment.transcripts(); @@ -2673,11 +2730,11 @@ transcript abundance from RNA-seq reads for (size_t i = 0; i < transcripts.size(); ++i) { auto& txp = transcripts[i]; effLens(i) = useEffectiveLengths - ? std::exp(txp.getCachedLogEffectiveLength()) - : txp.RefLength; + ? std::exp(txp.getCachedLogEffectiveLength()) + : txp.RefLength; } - for (size_t eqID = 0; eqID < eqVec.size(); ++eqID){ + for (size_t eqID = 0; eqID < eqVec.size(); ++eqID) { // The vector entry auto& kv = eqVec[eqID]; // The label of the equivalence class @@ -2704,7 +2761,8 @@ transcript abundance from RNA-seq reads auto probStartPos = 1.0 / el; // combined weight - double wt = sopt.eqClassMode ? v.weights[i] : v.count * v.weights[i] * probStartPos; + double wt = sopt.eqClassMode ? 
v.weights[i] + : v.count * v.weights[i] * probStartPos; v.combinedWeights.push_back(wt); wsum += wt; } @@ -2733,7 +2791,7 @@ transcript abundance from RNA-seq reads bfs::path distFileName = sopt.paramsDirectory / "flenDist.txt"; { std::unique_ptr distOut( - std::fopen(distFileName.c_str(), "w"), std::fclose); + std::fopen(distFileName.c_str(), "w"), std::fclose); fmt::print(distOut.get(), "{}\n", experiment.fragmentLengthDistribution()->toString()); } @@ -2778,14 +2836,16 @@ transcript abundance from RNA-seq reads // Write meta-information about the run gzw.writeMeta(sopt, experiment, mstats); - // at this point we can (and should) drop all loggers to force them to + // at this point we can (and should) drop all loggers to force them to // flush. spdlog::drop_all(); - + } catch (po::error& e) { std::cerr << "(mapping-based mode) Exception : [" << e.what() << "].\n"; - std::cerr << "Please be sure you are passing correct options, and that you are running in the intended mode.\n"; - std::cerr << "alignment-based mode is detected and enabled via the \'-a\' flag. Exiting.\n"; + std::cerr << "Please be sure you are passing correct options, and that you " + "are running in the intended mode.\n"; + std::cerr << "alignment-based mode is detected and enabled via the \'-a\' " + "flag. Exiting.\n"; std::exit(1); } catch (const spdlog::spdlog_ex& ex) { std::cerr << "logger failed with : [" << ex.what() << "]. Exiting.\n"; From ddf24430fc4508aaaf0cb5ace124cce8cb58ad7a Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Wed, 22 Jun 2022 11:43:47 -0400 Subject: [PATCH 16/17] guard: check record type If the user passed the `--writeQualities` flag and we are writing the mappings to an output file, check to make sure the records actually contain quality values (i.e. the input is not in FASTA format). If they do not, then print a warning and disable writing of the quality scores. 
--- src/SalmonAlevin.cpp | 36 ++++++++++++++++++++++-- src/SalmonQuantify.cpp | 64 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index a2aa03c93..f17dd0699 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -1571,9 +1571,19 @@ void processReadsQuasi( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + + // Says if we should check that quality values exist + // in the case the user requested to `--writeQualities`, + // because they may have accidentially passed in a FASTA + // file. + bool check_qualities = true; if (salmonOpts.writeQualities) { formatter.enable_qualities(); } else { + // we don't have to worry about + // checking qualities because + // we aren't writing them. + check_qualities = false; formatter.disable_qualities(); } pufferfish::util::QueryCache qc; @@ -1590,6 +1600,9 @@ void processReadsQuasi( fmt::MemoryWriter sstream; auto* qmLog = salmonOpts.qmLog.get(); bool writeQuasimappings = (qmLog != nullptr); + // if we aren't writing output at all, don't bother + // checking for quality scores either. + if (!writeQuasimappings) { check_qualities = false; } ////////////////////// // NOTE: validation mapping based new parameters @@ -1641,6 +1654,25 @@ void processReadsQuasi( extraBAMtags.reserve(reserveSize); } + // if we need to disable writing quality values + // because the user passed in a FASTA file, do that + // check here. + if (check_qualities and (rangeSize > 0)) { + auto& rp = rg[0]; + // a valid FASTQ record can't have an + // empty quality string, so then we will + // treat this as a FASTA. + if (rp.first.qual.empty() or rp.second.qual.empty()) { + formatter.disable_qualities(); + salmonOpts.jointLog->warn("The flag --writeQualities was provided,\n" + "but read records (e.g. 
{}/{}) appear not to have quality strings!\n" + "The input is being interpreted as a FASTA file, and the writing\n" + "of quality scores is being disabled.\n", rp.first.name, rp.second.name); + } + // we won't bother to perform this check more than once. + check_qualities = false; + } + auto localProtocol = alevinOpts.protocol; for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch auto& rp = rg[i]; @@ -2457,8 +2489,8 @@ bool do_sc_align(ReadExperimentT& experiment, rad_file.close(); - // we want to check if the RAD file stream was written to properly - // while we likely would have caught this earlier, it is possible the + // We want to check if the RAD file stream was written to properly. + // While we likely would have caught this earlier, it is possible the // badbit may not be set until the stream actually flushes (perhaps even // at close), so we check here one final time that the status of the // stream is as expected. diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index 88f7a1227..59e82620d 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -853,9 +853,19 @@ void processReads( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + + // Says if we should check that quality values exist + // in the case the user requested to `--writeQualities`, + // because they may have accidentially passed in a FASTA + // file. + bool check_qualities = true; if (salmonOpts.writeQualities) { formatter.enable_qualities(); } else { + // we don't have to worry about + // checking qualities because + // we aren't writing them. + check_qualities = false; formatter.disable_qualities(); } pufferfish::util::QueryCache qc; @@ -876,6 +886,9 @@ void processReads( fmt::MemoryWriter sstream; auto* qmLog = salmonOpts.qmLog.get(); bool writeQuasimappings = (qmLog != nullptr); + // if we aren't writing output at all, don't bother + // checking for quality scores either. 
+ if (!writeQuasimappings) { check_qualities = false; } /* auto ap{selective_alignment::utils::AlignmentPolicy::DEFAULT}; @@ -913,6 +926,25 @@ void processReads( LibraryFormat expectedLibraryFormat = rl.format(); + // if we need to disable writing quality values + // because the user passed in a FASTA file, do that + // check here. + if (check_qualities and (rangeSize > 0)) { + auto& rp = rg[0]; + // a valid FASTQ record can't have an + // empty quality string, so then we will + // treat this as a FASTA. + if (rp.first.qual.empty() or rp.second.qual.empty()) { + formatter.disable_qualities(); + salmonOpts.jointLog->warn("The flag --writeQualities was provided,\n" + "but read records (e.g. {}/{}) appear not to have quality strings!\n" + "The input is being interpreted as a FASTA file, and the writing\n" + "of quality scores is being disabled.\n", rp.first.name, rp.second.name); + } + // we won't bother to perform this check more than once. + check_qualities = false; + } + bool tryAlign{salmonOpts.validateMappings}; for (size_t i = 0; i < rangeSize; ++i) { // For all the reads in this batch auto& rp = rg[i]; @@ -1649,9 +1681,19 @@ void processReads( std::vector recoveredHits; std::vector jointHits; PairedAlignmentFormatter formatter(qidx); + + // Says if we should check that quality values exist + // in the case the user requested to `--writeQualities`, + // because they may have accidentially passed in a FASTA + // file. + bool check_qualities = true; if (salmonOpts.writeQualities) { formatter.enable_qualities(); } else { + // we don't have to worry about + // checking qualities because + // we aren't writing them. + check_qualities = false; formatter.disable_qualities(); } pufferfish::util::QueryCache qc; @@ -1671,6 +1713,9 @@ void processReads( fmt::MemoryWriter sstream; auto* qmLog = salmonOpts.qmLog.get(); bool writeQuasimappings = (qmLog != nullptr); + // if we aren't writing output at all, don't bother + // checking for quality scores either. 
+ if (!writeQuasimappings) { check_qualities = false; } std::string rc1; rc1.reserve(300); @@ -1704,6 +1749,25 @@ void processReads( LibraryFormat expectedLibraryFormat = rl.format(); + // if we need to disable writing quality values + // because the user passed in a FASTA file, do that + // check here. + if (check_qualities and (rangeSize > 0)) { + auto& rp = rg[0]; + // a valid FASTQ record can't have an + // empty quality string, so then we will + // treat this as a FASTA. + if (rp.qual.empty()) { + formatter.disable_qualities(); + salmonOpts.jointLog->warn("The flag --writeQualities was provided,\n" + "but read records (e.g. {}) appear not to have quality strings!\n" + "The input is being interpreted as a FASTA file, and the writing\n" + "of quality scores is being disabled.\n", rp.name); + } + // we won't bother to perform this check more than once. + check_qualities = false; + } + bool tryAlign{salmonOpts.validateMappings}; for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch auto& rp = rg[i]; From 595ecb79e356343b89f61a415a752a895685e277 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Wed, 22 Jun 2022 11:58:19 -0400 Subject: [PATCH 17/17] [SKIP CI]: bump version --- current_version.txt | 2 +- doc/source/conf.py | 4 ++-- docker/Dockerfile | 2 +- docker/build_test.sh | 2 +- include/SalmonConfig.hpp | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/current_version.txt b/current_version.txt index fb9ede991..40f96c219 100644 --- a/current_version.txt +++ b/current_version.txt @@ -1,3 +1,3 @@ VERSION_MAJOR 1 -VERSION_MINOR 8 +VERSION_MINOR 9 VERSION_PATCH 0 diff --git a/doc/source/conf.py b/doc/source/conf.py index 580f48e4d..ebc201b0f 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '1.8' +version = '1.9' # The full version, including alpha/beta/rc tags. 
-release = '1.8.0' +release = '1.9.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docker/Dockerfile b/docker/Dockerfile index 61f3ffcf2..70009a298 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ MAINTAINER salmon.maintainer@gmail.com ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \ ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget -ENV SALMON_VERSION 1.8.0 +ENV SALMON_VERSION 1.9.0 # salmon binary will be installed in /home/salmon/bin/salmon diff --git a/docker/build_test.sh b/docker/build_test.sh index 45976506b..02ce42f6e 100644 --- a/docker/build_test.sh +++ b/docker/build_test.sh @@ -1,3 +1,3 @@ #! /bin/bash -SALMON_VERSION=1.8.0 +SALMON_VERSION=1.9.0 docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest . diff --git a/include/SalmonConfig.hpp b/include/SalmonConfig.hpp index 277485959..8d126a197 100644 --- a/include/SalmonConfig.hpp +++ b/include/SalmonConfig.hpp @@ -26,9 +26,9 @@ namespace salmon { constexpr char majorVersion[] = "1"; -constexpr char minorVersion[] = "8"; +constexpr char minorVersion[] = "9"; constexpr char patchVersion[] = "0"; -constexpr char version[] = "1.8.0"; +constexpr char version[] = "1.9.0"; constexpr uint32_t indexVersion = 5; constexpr char requiredQuasiIndexVersion[] = "p7"; } // namespace salmon