Develop (#827)

* Merged into the wrong branch without noticing :( (#814) * use better conda link (#799) * Estimated filtering fix (#813) * oops * fix testing and set a max number of filtered reads * apparently a bunch of things were getting skipped * fix wrappers * update computeMatrix wrapper * Decrease memory needs (#817) * Use an iterator to not blow memory up * Update a bit more * The GC bias stuff is all deprecated, I'm not fixing that old code * Cache resulting counts rather than just decreasing the bin size (#818) * Cache resulting counts rather than just decreasing the bin size * sanity check * Implement #815 * [skip ci] update change log * Implement #816 (#825) * Implement #816 * expose option * Add a test using pseudocounts and skipZeroOverZero * syntax * Fix tests * Make --skipZeroOverZero a galaxy macro and add to bigwigCompare * [ci skip] a bit of formatting * Fix #822 (#826)
deeptools · Apr 1, 2019 · 6f68c67 · 6f68c67
1 parent 3cefc7d
commit 6f68c67
Show file tree

Hide file tree

Showing 15 changed files with 97 additions and 56 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,6 +1,9 @@
 3.2.1
 
  * Changed a bug in `estimateReadFiltering` where the estimated number of filtered reads was typically too low.
+ * Made an internal change that should drastically reduce the memory requirements of many tools. This slightly increases run time, but as the resulting resource usage is much more attractive this is judged worthwhile.
+ * An informative error message is now produced with `bamCoverage` if RPGC normalization is requested but no effective genome size is provided (issue #815).
+ * Fixes some issues with y-axis scaling (issue #822)
 
 3.2.0
 

diff --git a/deeptools/_version.py b/deeptools/_version.py
@@ -2,4 +2,4 @@
 # This file is originally generated from Git information by running 'setup.py
 # version'. Distribution tarballs contain a pre-generated copy of this file.
 
-__version__ = '3.2.0'
+__version__ = '3.2.1'
diff --git a/deeptools/bamCoverage.py b/deeptools/bamCoverage.py
@@ -152,6 +152,8 @@ def main(args=None):
 
     if args.normalizeUsing == 'None':
         args.normalizeUsing = None  # For the sake of sanity
+    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
+        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")
 
     if args.normalizeUsing:
         # if a normalization is required then compute the scale factors

diff --git a/deeptools/bigwigCompare.py b/deeptools/bigwigCompare.py
@@ -61,6 +61,12 @@ def parse_arguments(args=None):
                         type=float,
                         required=False)
 
+    parser.add_argument('--skipZeroOverZero',
+                        help='Skip bins where BOTH BAM files lack coverage. '
+                        'This is determined BEFORE any applicable pseudocount '
+                        'is added.',
+                        action='store_true')
+
     parser.add_argument('--operation',
                         help='The default is to output the log2ratio of the '
                         'two samples. The reciprocal ratio returns the '
@@ -159,6 +165,7 @@ def main(args=None):
         blackListFileName=args.blackListFileName,
         verbose=args.verbose,
         numberOfProcessors=args.numberOfProcessors,
+        skipZeroOverZero=args.skipZeroOverZero,
         format=args.outFileFormat,
         smoothLength=False,
         missingDataAsZero=not args.skipNonCoveredRegions,

diff --git a/deeptools/correctGCBias.py b/deeptools/correctGCBias.py
@@ -195,8 +195,11 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
              if r.flag & 4 == 0]
 
     bam.close()
+
     r_index = -1
     for read in reads:
+        if read.is_unmapped:
+            continue
         r_index += 1
         try:
             # calculate GC content of read fragment
@@ -340,6 +343,7 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
     matePairs = {}
     read_repetitions = 0
     removed_duplicated_reads = 0
+
     # cache data
     # r.flag & 4 == 0 is to filter unmapped reads that
     # have a genomic position
@@ -348,6 +352,8 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
 
     r_index = -1
     for read in reads:
+        if read.pos <= start or read.is_unmapped:
+            continue
         r_index += 1
         copies = None
         gc = None

diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py
@@ -603,16 +603,15 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
             start_time = time.time()
             # caching seems faster. TODO: profile the function
             c = 0
-            if chrom in bamHandle.references:
-                reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
-                         if r.flag & 4 == 0]
-            else:
+            if chrom not in bamHandle.references:
                 raise NameError("chromosome {} not found in bam file".format(chrom))
 
             prev_pos = set()
             lpos = None
             # of previous processed read pair
-            for read in reads:
+            for read in bamHandle.fetch(chrom, regStart, regEnd):
+                if read.is_unmapped:
+                    continue
                 if self.minMappingQuality and read.mapq < self.minMappingQuality:
                     continue
 

diff --git a/deeptools/getScaleFactor.py b/deeptools/getScaleFactor.py
@@ -14,15 +14,20 @@ def getFractionKept_wrapper(args):
     return getFractionKept_worker(*args)
 
 
-def getFractionKept_worker(chrom, start, end, bamFile, args):
+def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
     """
     Queries the BAM file and counts the number of alignments kept/found in the
     first 50000 bases.
     """
     bam = bamHandler.openBam(bamFile)
+    start += offset * 50000
     end = min(end, start + 50000)
     tot = 0
     filtered = 0
+
+    if end <= start:
+        return (filtered, tot)
+
     prev_pos = set()
     lpos = None
     if chrom in bam.references:
@@ -151,13 +156,10 @@ def fraction_kept(args, stats):
     else:
         chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))
 
-    while total < num_needed_to_sample and distanceBetweenBins > 50000:
-        # If we've iterated, then halve distanceBetweenBins
-        distanceBetweenBins /= 2
-        if distanceBetweenBins < 50000:
-            distanceBetweenBins = 50000
-
-        res = mapReduce.mapReduce((bam_handle.filename, args),
+    offset = 0
+    # Iterate over bins at various non-overlapping offsets until we have enough data
+    while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
+        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                   getFractionKept_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=distanceBetweenBins,
@@ -166,7 +168,10 @@ def fraction_kept(args, stats):
                                   verbose=args.verbose)
 
         if len(res):
-            filtered, total = np.sum(res, axis=0)
+            foo, bar = np.sum(res, axis=0)
+            filtered += foo
+            total += bar
+        offset += 1
 
     if total == 0:
         # This should never happen

diff --git a/deeptools/plotProfile.py b/deeptools/plotProfile.py
@@ -690,6 +690,8 @@ def plot_profile(self):
 
         first = True
         ax_list = []
+        globalYmin = np.inf
+        globalYmax = -np.inf
         for plot in range(self.numplots):
             localYMin = None
             localYMax = None
@@ -698,7 +700,7 @@ def plot_profile(self):
             if (row == 0 and col == 0) or len(self.y_min) > 1 or len(self.y_max) > 1:
                 ax = self.fig.add_subplot(self.grids[row, col])
             else:
-                ax = self.fig.add_subplot(self.grids[row, col], sharey=ax_list[0])
+                ax = self.fig.add_subplot(self.grids[row, col])
 
             if self.per_group:
                 title = self.hm.matrix.group_labels[plot]
@@ -717,10 +719,10 @@ def plot_profile(self):
                     _row, _col = plot, data_idx
                 else:
                     _row, _col = data_idx, plot
-                if localYMin is None or self.y_min[_col % len(self.y_min)] < localYMin:
-                    localYMin = self.y_min[_col % len(self.y_min)]
-                if localYMax is None or self.y_max[_col % len(self.y_max)] > localYMax:
-                    localYMax = self.y_max[_col % len(self.y_max)]
+                if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin:
+                    localYMin = self.y_min[col % len(self.y_min)]
+                if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax:
+                    localYMax = self.y_max[col % len(self.y_max)]
 
                 sub_matrix = self.hm.matrix.get_matrix(_row, _col)
 
@@ -738,6 +740,8 @@ def plot_profile(self):
                             self.color_list[coloridx],
                             label,
                             plot_type=self.plot_type)
+            globalYmin = min(np.float64(globalYmin), ax.get_ylim()[0])
+            globalYmax = max(globalYmax, ax.get_ylim()[1])
 
             # remove the numbers of the y axis for all plots
             plt.setp(ax.get_yticklabels(), visible=False)
@@ -747,12 +751,6 @@ def plot_profile(self):
                 # on each row and make the numbers and ticks visible
                 plt.setp(ax.get_yticklabels(), visible=True)
                 ax.axes.set_ylabel(self.y_axis_label)
-                """
-                # reduce the number of yticks by half
-                num_ticks = len(ax.get_yticks())
-                yticks = [ax.get_yticks()[i] for i in range(1, num_ticks, 2)]
-                ax.set_yticks(yticks)
-                """
 
             totalWidth = sub_matrix['matrix'].shape[1]
             xticks, xtickslabel = self.getTicks(tickIdx)
@@ -776,23 +774,23 @@ def plot_profile(self):
                           frameon=False, markerscale=0.5)
                 if len(self.y_min) == 1 and len(self.y_max) == 1:
                     first = False
+            ax_list.append(ax)
 
-            """
-            ax.legend(bbox_to_anchor=(-0.05, -1.13, 1., 1),
-                      loc='upper center',
-                      ncol=1, mode="expand", prop=font_p,
-                      frameon=False, markerscale=0.5)
-            """
-            lims = ax.get_ylim()
+        # It turns out that set_ylim only takes np.float64s
+        for sample_id, subplot in enumerate(ax_list):
+            localYMin = self.y_min[sample_id % len(self.y_min)]
+            localYMax = self.y_max[sample_id % len(self.y_max)]
+            lims = [globalYmin, globalYmax]
             if localYMin is not None:
-                lims = (localYMin, lims[1])
-            if localYMax is not None:
-                lims = (lims[0], localYMax)
+                if localYMax is not None:
+                    lims = (np.float64(localYMin), np.float64(localYMax))
+                else:
+                    lims = (np.float64(localYMin), lims[1])
+            elif localYMax is not None:
+                lims = (lims[0], np.float64(localYMax))
             if lims[0] >= lims[1]:
                 lims = (lims[0], lims[0] + 1)
-            ax.set_ylim(lims)
-
-            ax_list.append(ax)
+            ax_list[sample_id].set_ylim(lims)
 
         plt.subplots_adjust(wspace=0.05, hspace=0.3)
         plt.tight_layout()

diff --git a/deeptools/sumCoveragePerBin.py b/deeptools/sumCoveragePerBin.py
@@ -85,10 +85,7 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
             c = 0
             try:
                 # BAM input
-                if chrom in bamHandle.references:
-                    reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
-                             if r.flag & 4 == 0]
-                else:
+                if chrom not in bamHandle.references:
                     raise NameError("chromosome {} not found in bam file".format(chrom))
             except:
                 # bigWig input, as used by plotFingerprint
@@ -104,7 +101,9 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
             prev_pos = set()
             lpos = None
             # of previous processed read pair
-            for read in reads:
+            for read in bamHandle.fetch(chrom, regStart, regEnd):
+                if read.is_unmapped:
+                    continue
                 if self.minMappingQuality and read.mapq < self.minMappingQuality:
                     continue
 

diff --git a/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py b/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py
@@ -62,6 +62,18 @@ def test_bigwigCompare_skipnas():
     unlink(outfile)
 
 
+def test_bigwigCompare_skipZeroOverZero():
+    outfile = '/tmp/result.bg"'
+    args = "-b1 {} -b2 {} -o {} --skipZeroOverZero --pseudocount 1 3 --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_A, outfile).split()
+    bwComp.main(args)
+    _foo = open(outfile, 'r')
+    resp = _foo.readlines()
+    _foo.close()
+    expected = ['3R\t100\t200\t-1\n']
+    assert resp == expected, "{} != {}".format(resp, expected)
+    unlink(outfile)
+
+
 def test_multiBigwigSummary():
     outfile = '/tmp/result.bg'
     args = "bins -b {} {} --binSize 50 -o {}".format(BIGWIG_A, BIGWIG_B, outfile).split()

diff --git a/deeptools/writeBedGraph_bam_and_bw.py b/deeptools/writeBedGraph_bam_and_bw.py
@@ -45,7 +45,7 @@ def writeBedGraph_wrapper(args):
 def writeBedGraph_worker(
         chrom, start, end, tileSize, defaultFragmentLength,
         bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
-        missingDataAsZero=False, fixed_step=False):
+        skipZeroOverZero=False, missingDataAsZero=False, fixed_step=False):
     r"""
     Writes a bedgraph having as base a number of bam files.
 
@@ -102,6 +102,10 @@ def writeBedGraph_worker(
                              "files. Remove this chromosome from the bigwig file "
                              "to continue".format(chrom))
 
+        if skipZeroOverZero and np.sum(tileCoverage) == 0:
+            previousValue = None
+            continue
+
         value = func(tileCoverage, funcArgs)
 
         if fixed_step:
@@ -147,7 +151,7 @@ def writeBedGraph(
         bamOrBwFileList, outputFileName, fragmentLength,
         func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1,
         format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
-        smoothLength=0, fixed_step=False, verbose=False):
+        skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False):
     r"""
     Given a list of bamfiles, a function and a function arguments,
     this method writes a bedgraph file (or bigwig) file
@@ -206,7 +210,7 @@ def writeBedGraph(
 
     res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                                func, funcArgs, extendPairedEnds, smoothLength,
-                               missingDataAsZero, fixed_step),
+                               skipZeroOverZero, missingDataAsZero, fixed_step),
                               writeBedGraph_wrapper,
                               chromNamesAndSize,
                               genomeChunkLength=genomeChunkLength,

diff --git a/galaxy/wrapper/bamCompare.xml b/galaxy/wrapper/bamCompare.xml
@@ -164,11 +164,7 @@
                 <expand macro="read_processing_options" />
 
                 <expand macro="skipNAs" />
-                <param argument="--skipZeroOverZero" type="select" label="Skip bins of no coverage"
-                    help="Skip bins where BOTH files lack coverage.">
-                    <option value="" selected="true">No</option>
-                    <option value="--skipZeroOverZero">Yes, skip them.</option>
-                </param>
+                <expand macro="skipZeroOverZero" />
 
                 <param argument="--ignoreForNormalization" type="text" value="" size="50"
                     label="regions that should be excluded for calculating the scaling factor"

diff --git a/galaxy/wrapper/bigwigCompare.xml b/galaxy/wrapper/bigwigCompare.xml
@@ -28,6 +28,7 @@
             #if $advancedOpt.showAdvancedOpt == "yes":
 
               $advancedOpt.skipNAs
+              $advancedOpt.skipZeroOverZero
               --scaleFactors '$advancedOpt.scaleFactor1:$advancedOpt.scaleFactor2'
               --binSize $advancedOpt.binSize
 
@@ -95,6 +96,7 @@
                     help="Size of the bins in bases for the output of the bigwig/bedgraph file."/>
 
                 <expand macro="skipNAs" />
+                <expand macro="skipZeroOverZero" />
                 <expand macro="scaleFactors" />
                 <expand macro="plotTitle" />
                 <expand macro="blacklist" />

diff --git a/galaxy/wrapper/deepTools_macros.xml b/galaxy/wrapper/deepTools_macros.xml
@@ -1,10 +1,10 @@
 <macros>
 
     <token name="@THREADS@">--numberOfProcessors "\${GALAXY_SLOTS:-4}"</token>
-    <token name="@WRAPPER_VERSION@">3.2.0.0</token>
+    <token name="@WRAPPER_VERSION@">3.2.1.0</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="3.2.0">deeptools</requirement>
+            <requirement type="package" version="3.2.1">deeptools</requirement>
             <requirement type="package" version="1.9">samtools</requirement>
         </requirements>
         <expand macro="stdio" />
@@ -650,6 +650,14 @@ is vital to you, select Yes below.">
                  (default: False)" />
     </xml>
 
+    <xml name="skipZeroOverZero">
+        <param argument="--skipZeroOverZero" type="select" label="Skip bins of no coverage"
+            help="Skip bins where BOTH files lack coverage.">
+            <option value="" selected="true">No</option>
+            <option value="--skipZeroOverZero">Yes, skip them.</option>
+        </param>
+    </xml>
+
     <xml name="exactScaling">
         <param argument="--exactScaling" type="boolean" truevalue="--exactScaling" falsevalue="" checked="False"
             label="Compute an exact scaling factor"

diff --git a/setup.py b/setup.py
@@ -86,8 +86,8 @@ def openREADME():
         "scipy >= 0.17.0",
         "matplotlib >= 2.1.2",
         "pysam >= 0.14.0",
-        "numpydoc >=0.5",
-        "pyBigWig >=0.2.1",
+        "numpydoc >= 0.5",
+        "pyBigWig >= 0.2.1",
         "py2bit >= 0.2.0",
         "plotly >= 2.0.0",
         "deeptoolsintervals >= 0.1.7"