Skip to content

Commit

Permalink
Develop (#827)
Browse files Browse the repository at this point in the history
* Merged into the wrong branch without noticing :( (#814)

* use better conda link (#799)

* Estimated filtering fix (#813)

* oops

* fix testing and set a max number of filtered reads

* apparently a bunch of things were getting skipped

* fix wrappers

* update computeMatrix wrapper

* Decrease memory needs (#817)

* Use an iterator to not blow memory up

* Update a bit more

* The GC bias stuff is all deprecated, I'm not fixing that old code

* Cache resulting counts rather than just decreasing the bin size (#818)

* Cache resulting counts rather than just decreasing the bin size

* sanity check

* Implement #815

* [skip ci] update change log

* Implement #816 (#825)

* Implement #816

* expose option

* Add a test using pseudocounts and skipZeroOverZero

* syntax

* Fix tests

* Make --skipZeroOverZero a galaxy macro and add to bigwigCompare

* [ci skip] a bit of formatting

* Fix #822 (#826)
  • Loading branch information
dpryan79 authored Apr 1, 2019
1 parent 3cefc7d commit 6f68c67
Show file tree
Hide file tree
Showing 15 changed files with 97 additions and 56 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
3.2.1

* Changed a bug in `estimateReadFiltering` where the estimated number of filtered reads was typically too low.
* Made an internal change that should drastically reduce the memory requirements of many tools. This slightly increases run time, but as the resulting resource usage is much more attractive this is judged worthwhile.
* An informative error message is now produced with `bamCoverage` if RPGC normalization is requested but no effective genome size is provided (issue #815).
* Fixes some issues with y-axis scaling (issue #822)

3.2.0

Expand Down
2 changes: 1 addition & 1 deletion deeptools/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# This file is originally generated from Git information by running 'setup.py
# version'. Distribution tarballs contain a pre-generated copy of this file.

__version__ = '3.2.0'
__version__ = '3.2.1'
2 changes: 2 additions & 0 deletions deeptools/bamCoverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def main(args=None):

if args.normalizeUsing == 'None':
args.normalizeUsing = None # For the sake of sanity
elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

if args.normalizeUsing:
# if a normalization is required then compute the scale factors
Expand Down
7 changes: 7 additions & 0 deletions deeptools/bigwigCompare.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ def parse_arguments(args=None):
type=float,
required=False)

parser.add_argument('--skipZeroOverZero',
help='Skip bins where BOTH BAM files lack coverage. '
'This is determined BEFORE any applicable pseudocount '
'is added.',
action='store_true')

parser.add_argument('--operation',
help='The default is to output the log2ratio of the '
'two samples. The reciprocal ratio returns the '
Expand Down Expand Up @@ -159,6 +165,7 @@ def main(args=None):
blackListFileName=args.blackListFileName,
verbose=args.verbose,
numberOfProcessors=args.numberOfProcessors,
skipZeroOverZero=args.skipZeroOverZero,
format=args.outFileFormat,
smoothLength=False,
missingDataAsZero=not args.skipNonCoveredRegions,
Expand Down
6 changes: 6 additions & 0 deletions deeptools/correctGCBias.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,11 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
if r.flag & 4 == 0]

bam.close()

r_index = -1
for read in reads:
if read.is_unmapped:
continue
r_index += 1
try:
# calculate GC content of read fragment
Expand Down Expand Up @@ -340,6 +343,7 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
matePairs = {}
read_repetitions = 0
removed_duplicated_reads = 0

# cache data
# r.flag & 4 == 0 is to filter unmapped reads that
# have a genomic position
Expand All @@ -348,6 +352,8 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,

r_index = -1
for read in reads:
if read.pos <= start or read.is_unmapped:
continue
r_index += 1
copies = None
gc = None
Expand Down
9 changes: 4 additions & 5 deletions deeptools/countReadsPerBin.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,16 +603,15 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
start_time = time.time()
# caching seems faster. TODO: profile the function
c = 0
if chrom in bamHandle.references:
reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
if r.flag & 4 == 0]
else:
if chrom not in bamHandle.references:
raise NameError("chromosome {} not found in bam file".format(chrom))

prev_pos = set()
lpos = None
# of previous processed read pair
for read in reads:
for read in bamHandle.fetch(chrom, regStart, regEnd):
if read.is_unmapped:
continue
if self.minMappingQuality and read.mapq < self.minMappingQuality:
continue

Expand Down
23 changes: 14 additions & 9 deletions deeptools/getScaleFactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,20 @@ def getFractionKept_wrapper(args):
return getFractionKept_worker(*args)


def getFractionKept_worker(chrom, start, end, bamFile, args):
def getFractionKept_worker(chrom, start, end, bamFile, args, offset):
"""
Queries the BAM file and counts the number of alignments kept/found in the
first 50000 bases.
"""
bam = bamHandler.openBam(bamFile)
start += offset * 50000
end = min(end, start + 50000)
tot = 0
filtered = 0

if end <= start:
return (filtered, tot)

prev_pos = set()
lpos = None
if chrom in bam.references:
Expand Down Expand Up @@ -151,13 +156,10 @@ def fraction_kept(args, stats):
else:
chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

while total < num_needed_to_sample and distanceBetweenBins > 50000:
# If we've iterated, then halve distanceBetweenBins
distanceBetweenBins /= 2
if distanceBetweenBins < 50000:
distanceBetweenBins = 50000

res = mapReduce.mapReduce((bam_handle.filename, args),
offset = 0
# Iterate over bins at various non-overlapping offsets until we have enough data
while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
res = mapReduce.mapReduce((bam_handle.filename, args, offset),
getFractionKept_wrapper,
chrom_sizes,
genomeChunkLength=distanceBetweenBins,
Expand All @@ -166,7 +168,10 @@ def fraction_kept(args, stats):
verbose=args.verbose)

if len(res):
filtered, total = np.sum(res, axis=0)
foo, bar = np.sum(res, axis=0)
filtered += foo
total += bar
offset += 1

if total == 0:
# This should never happen
Expand Down
46 changes: 22 additions & 24 deletions deeptools/plotProfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,8 @@ def plot_profile(self):

first = True
ax_list = []
globalYmin = np.inf
globalYmax = -np.inf
for plot in range(self.numplots):
localYMin = None
localYMax = None
Expand All @@ -698,7 +700,7 @@ def plot_profile(self):
if (row == 0 and col == 0) or len(self.y_min) > 1 or len(self.y_max) > 1:
ax = self.fig.add_subplot(self.grids[row, col])
else:
ax = self.fig.add_subplot(self.grids[row, col], sharey=ax_list[0])
ax = self.fig.add_subplot(self.grids[row, col])

if self.per_group:
title = self.hm.matrix.group_labels[plot]
Expand All @@ -717,10 +719,10 @@ def plot_profile(self):
_row, _col = plot, data_idx
else:
_row, _col = data_idx, plot
if localYMin is None or self.y_min[_col % len(self.y_min)] < localYMin:
localYMin = self.y_min[_col % len(self.y_min)]
if localYMax is None or self.y_max[_col % len(self.y_max)] > localYMax:
localYMax = self.y_max[_col % len(self.y_max)]
if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin:
localYMin = self.y_min[col % len(self.y_min)]
if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax:
localYMax = self.y_max[col % len(self.y_max)]

sub_matrix = self.hm.matrix.get_matrix(_row, _col)

Expand All @@ -738,6 +740,8 @@ def plot_profile(self):
self.color_list[coloridx],
label,
plot_type=self.plot_type)
globalYmin = min(np.float64(globalYmin), ax.get_ylim()[0])
globalYmax = max(globalYmax, ax.get_ylim()[1])

# remove the numbers of the y axis for all plots
plt.setp(ax.get_yticklabels(), visible=False)
Expand All @@ -747,12 +751,6 @@ def plot_profile(self):
# on each row and make the numbers and ticks visible
plt.setp(ax.get_yticklabels(), visible=True)
ax.axes.set_ylabel(self.y_axis_label)
"""
# reduce the number of yticks by half
num_ticks = len(ax.get_yticks())
yticks = [ax.get_yticks()[i] for i in range(1, num_ticks, 2)]
ax.set_yticks(yticks)
"""

totalWidth = sub_matrix['matrix'].shape[1]
xticks, xtickslabel = self.getTicks(tickIdx)
Expand All @@ -776,23 +774,23 @@ def plot_profile(self):
frameon=False, markerscale=0.5)
if len(self.y_min) == 1 and len(self.y_max) == 1:
first = False
ax_list.append(ax)

"""
ax.legend(bbox_to_anchor=(-0.05, -1.13, 1., 1),
loc='upper center',
ncol=1, mode="expand", prop=font_p,
frameon=False, markerscale=0.5)
"""
lims = ax.get_ylim()
# It turns out that set_ylim only takes np.float64s
for sample_id, subplot in enumerate(ax_list):
localYMin = self.y_min[sample_id % len(self.y_min)]
localYMax = self.y_max[sample_id % len(self.y_max)]
lims = [globalYmin, globalYmax]
if localYMin is not None:
lims = (localYMin, lims[1])
if localYMax is not None:
lims = (lims[0], localYMax)
if localYMax is not None:
lims = (np.float64(localYMin), np.float64(localYMax))
else:
lims = (np.float64(localYMin), lims[1])
elif localYMax is not None:
lims = (lims[0], np.float64(localYMax))
if lims[0] >= lims[1]:
lims = (lims[0], lims[0] + 1)
ax.set_ylim(lims)

ax_list.append(ax)
ax_list[sample_id].set_ylim(lims)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.tight_layout()
Expand Down
9 changes: 4 additions & 5 deletions deeptools/sumCoveragePerBin.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,7 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
c = 0
try:
# BAM input
if chrom in bamHandle.references:
reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
if r.flag & 4 == 0]
else:
if chrom not in bamHandle.references:
raise NameError("chromosome {} not found in bam file".format(chrom))
except:
# bigWig input, as used by plotFingerprint
Expand All @@ -104,7 +101,9 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
prev_pos = set()
lpos = None
# of previous processed read pair
for read in reads:
for read in bamHandle.fetch(chrom, regStart, regEnd):
if read.is_unmapped:
continue
if self.minMappingQuality and read.mapq < self.minMappingQuality:
continue

Expand Down
12 changes: 12 additions & 0 deletions deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ def test_bigwigCompare_skipnas():
unlink(outfile)


def test_bigwigCompare_skipZeroOverZero():
outfile = '/tmp/result.bg"'
args = "-b1 {} -b2 {} -o {} --skipZeroOverZero --pseudocount 1 3 --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_A, outfile).split()
bwComp.main(args)
_foo = open(outfile, 'r')
resp = _foo.readlines()
_foo.close()
expected = ['3R\t100\t200\t-1\n']
assert resp == expected, "{} != {}".format(resp, expected)
unlink(outfile)


def test_multiBigwigSummary():
outfile = '/tmp/result.bg'
args = "bins -b {} {} --binSize 50 -o {}".format(BIGWIG_A, BIGWIG_B, outfile).split()
Expand Down
10 changes: 7 additions & 3 deletions deeptools/writeBedGraph_bam_and_bw.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def writeBedGraph_wrapper(args):
def writeBedGraph_worker(
chrom, start, end, tileSize, defaultFragmentLength,
bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0,
missingDataAsZero=False, fixed_step=False):
skipZeroOverZero=False, missingDataAsZero=False, fixed_step=False):
r"""
Writes a bedgraph having as base a number of bam files.
Expand Down Expand Up @@ -102,6 +102,10 @@ def writeBedGraph_worker(
"files. Remove this chromosome from the bigwig file "
"to continue".format(chrom))

if skipZeroOverZero and np.sum(tileCoverage) == 0:
previousValue = None
continue

value = func(tileCoverage, funcArgs)

if fixed_step:
Expand Down Expand Up @@ -147,7 +151,7 @@ def writeBedGraph(
bamOrBwFileList, outputFileName, fragmentLength,
func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1,
format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
smoothLength=0, fixed_step=False, verbose=False):
skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False):
r"""
Given a list of bamfiles, a function and a function arguments,
this method writes a bedgraph file (or bigwig) file
Expand Down Expand Up @@ -206,7 +210,7 @@ def writeBedGraph(

res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
func, funcArgs, extendPairedEnds, smoothLength,
missingDataAsZero, fixed_step),
skipZeroOverZero, missingDataAsZero, fixed_step),
writeBedGraph_wrapper,
chromNamesAndSize,
genomeChunkLength=genomeChunkLength,
Expand Down
6 changes: 1 addition & 5 deletions galaxy/wrapper/bamCompare.xml
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,7 @@
<expand macro="read_processing_options" />

<expand macro="skipNAs" />
<param argument="--skipZeroOverZero" type="select" label="Skip bins of no coverage"
help="Skip bins where BOTH files lack coverage.">
<option value="" selected="true">No</option>
<option value="--skipZeroOverZero">Yes, skip them.</option>
</param>
<expand macro="skipZeroOverZero" />

<param argument="--ignoreForNormalization" type="text" value="" size="50"
label="regions that should be excluded for calculating the scaling factor"
Expand Down
2 changes: 2 additions & 0 deletions galaxy/wrapper/bigwigCompare.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#if $advancedOpt.showAdvancedOpt == "yes":
$advancedOpt.skipNAs
$advancedOpt.skipZeroOverZero
--scaleFactors '$advancedOpt.scaleFactor1:$advancedOpt.scaleFactor2'
--binSize $advancedOpt.binSize
Expand Down Expand Up @@ -95,6 +96,7 @@
help="Size of the bins in bases for the output of the bigwig/bedgraph file."/>

<expand macro="skipNAs" />
<expand macro="skipZeroOverZero" />
<expand macro="scaleFactors" />
<expand macro="plotTitle" />
<expand macro="blacklist" />
Expand Down
12 changes: 10 additions & 2 deletions galaxy/wrapper/deepTools_macros.xml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
<macros>

<token name="@THREADS@">--numberOfProcessors "\${GALAXY_SLOTS:-4}"</token>
<token name="@WRAPPER_VERSION@">3.2.0.0</token>
<token name="@WRAPPER_VERSION@">3.2.1.0</token>
<xml name="requirements">
<requirements>
<requirement type="package" version="3.2.0">deeptools</requirement>
<requirement type="package" version="3.2.1">deeptools</requirement>
<requirement type="package" version="1.9">samtools</requirement>
</requirements>
<expand macro="stdio" />
Expand Down Expand Up @@ -650,6 +650,14 @@ is vital to you, select Yes below.">
(default: False)" />
</xml>

<xml name="skipZeroOverZero">
<param argument="--skipZeroOverZero" type="select" label="Skip bins of no coverage"
help="Skip bins where BOTH files lack coverage.">
<option value="" selected="true">No</option>
<option value="--skipZeroOverZero">Yes, skip them.</option>
</param>
</xml>

<xml name="exactScaling">
<param argument="--exactScaling" type="boolean" truevalue="--exactScaling" falsevalue="" checked="False"
label="Compute an exact scaling factor"
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ def openREADME():
"scipy >= 0.17.0",
"matplotlib >= 2.1.2",
"pysam >= 0.14.0",
"numpydoc >=0.5",
"pyBigWig >=0.2.1",
"numpydoc >= 0.5",
"pyBigWig >= 0.2.1",
"py2bit >= 0.2.0",
"plotly >= 2.0.0",
"deeptoolsintervals >= 0.1.7"
Expand Down

0 comments on commit 6f68c67

Please sign in to comment.