Skip to content

Commit b22e7d1

Browse files
committed
Merge branch 'feature/implement619_CRAM' into develop
2 parents 3c55ce0 + 33a6ae1 commit b22e7d1

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

69 files changed

+84281
-520
lines changed

.planemo.sh

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,7 @@
11
#!/bin/bash
2-
planemo=./foo/bin/planemo
3-
owd=`pwd`
4-
temp_dir=`mktemp -d`
5-
cd $temp_dir
6-
conda config --add channels conda-forge
7-
conda config --add channels bioconda
8-
conda create -y --name gxtest numpy pysam
9-
source activate gxtest
10-
git clone --depth 1 https://github.com/galaxyproject/galaxy.git
11-
cd galaxy
12-
make client
13-
./scripts/common_startup.sh --skip-venv --dev-wheels
14-
cd ..
15-
# reset what's available in conda
16-
cd $owd
17-
conda install --yes -c conda-forge numpy scipy matplotlib==2.1.0 plotly==2.0.12
18-
conda install --yes -c bioconda -c conda-forge pysam pyBigWig py2bit
19-
python setup.py install
2+
planemo=$HOME/miniconda/envs/planemo/bin/planemo
203

21-
#galaxy/wrapper/correctGCBias.xml \
22-
$planemo test --postgres --no_dependency_resolution --galaxy_root $temp_dir/galaxy \
4+
$planemo test --no_dependency_resolution --install_galaxy \
235
galaxy/wrapper/alignmentSieve.xml \
246
galaxy/wrapper/bamCompare.xml \
257
galaxy/wrapper/bamCoverage.xml \
@@ -28,6 +10,7 @@ galaxy/wrapper/bigwigCompare.xml \
2810
galaxy/wrapper/computeGCBias.xml \
2911
galaxy/wrapper/computeMatrix.xml \
3012
galaxy/wrapper/computeMatrixOperations.xml \
13+
galaxy/wrapper/correctGCBias.xml \
3114
galaxy/wrapper/estimateReadFiltering.xml \
3215
galaxy/wrapper/multiBamSummary.xml \
3316
galaxy/wrapper/multiBigwigSummary.xml \

.travis.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ jobs:
1414

1515
# Setup anaconda
1616
before_install:
17-
- if [[ "$TRAVIS_OS_NAME" == "linux" && ${TESTGALAXY:-"0"} == "1" ]] ; then pip install virtualenv --user ; fi
18-
- if [[ "$TRAVIS_OS_NAME" == "linux" && ${TESTGALAXY:-"0"} == "1" ]] ; then virtualenv foo; source foo/bin/activate; pip install planemo ; deactivate ; fi
1917
- if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]] ; then curl https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -o miniconda.sh ; fi
2018
- if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "3.6" ]] ; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi
2119
- if [[ "$TRAVIS_OS_NAME" == "osx" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]] ; then curl https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi
@@ -25,13 +23,14 @@ before_install:
2523
- hash -r
2624
- conda config --set always_yes yes --set changeps1 no
2725
- conda update -q conda
26+
- if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "3.6" && ${TESTGALAXY:-"0"} == "1" ]] ; then conda create -c conda-forge -c bioconda -n planemo planemo; fi
2827

2928
# Useful for debugging any issues with conda
3029
- conda info -a
3130

3231
# Install packages
3332
install:
34-
- conda install --yes -c conda-forge -c bioconda python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib==2.1.0 nose flake8 plotly==2.0.12 pysam pyBigWig py2bit
33+
- conda install --yes -c conda-forge -c bioconda python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib==2.1.0 nose flake8 plotly==2.0.12 pysam==0.14.0 pyBigWig py2bit cython
3534
- python setup.py install
3635

3736
# command to run tests
@@ -41,6 +40,6 @@ script:
4140
- if [[ ${TESTGALAXY:-"2"} == "2" ]] ; then cd ~/ && nosetests --with-doctest -sv deeptools ; fi
4241
- if [[ ${TESTGALAXY:-"2"} == "2" ]] ; then nosetests --with-doctest -sv deeptoolsintervals ; fi
4342
- if [[ ${TESTGALAXY:-"2"} == "2" ]] ; then cd ${owd} ; fi
44-
- if [[ "$TRAVIS_OS_NAME" == "linux" && ${TESTGALAXY:-"0"} == "1" ]] ; then /home/travis/build/deeptools/deepTools/foo/bin/planemo lint galaxy/wrapper/ ; fi
43+
- if [[ "$TRAVIS_OS_NAME" == "linux" && ${TESTGALAXY:-"0"} == "1" ]] ; then /home/travis/miniconda/envs/planemo/bin/planemo lint galaxy/wrapper/ ; fi
4544
- if [[ "$TRAVIS_OS_NAME" == "linux" && ${TESTGALAXY:-"0"} == "1" ]] ; then ./.planemo.sh ; fi
4645
sudo: false

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
* The `-o` option can now be universally used to indicate the file to save a tool's primary output. Previously, some tools used `-o`, some used `-out`, and still others used things like `-hist` or `-freq`. This caused annoyance due to having to always remember the appropriate switch. Hopefully standardizing to `-o` will alleviate this. (issue #640)
3333
* Using a --blackListFileName with overlapping regions will typically now cause the various deepTools programs to stop. This is to ensure that resulting scale factors are correct (issue #649)
3434
* `bamCoverage` is a bit more efficient with small BAM files now due to underlying algorithmic changes. Relatedly, bamCoverage will skip some unnecessary estimation steps if you are not filtering reads, further speeding up processing a bit. (issue #662)
35+
* Added support for CRAM files. This requires pysam > 0.13.0 (issue #619).
3536

3637
2.5.7
3738

deeptools/SES_scaleFactor.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
1616
normalizationLength,
1717
avg_method='median', blackListFileName=None, numberOfProcessors=1,
18-
verbose=False, chrsToSkip=[]):
18+
verbose=False, chrsToSkip=[], mappingStatsList=[]):
1919
r"""
2020
Subdivides the genome into chunks to be analyzed in parallel
2121
using several processors. The code handles the creation of
@@ -44,6 +44,8 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
4444
scale estimation. Usually the chrX is included.
4545
blackListFileName : str
4646
BED file containing blacklisted regions
47+
mappingStatsList : list
48+
List of the number of mapped reads per file
4749
4850
Returns
4951
-------
@@ -76,8 +78,12 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
7678

7779
assert len(bamFilesList) == 2, "SES scale factors are only defined for 2 files"
7880

79-
bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
80-
mappedReads = [x.mapped for x in bamFilesHandlers]
81+
if len(mappingStatsList) == len(bamFilesList):
82+
mappedReads = mappingStatsList
83+
else:
84+
mappedReads = []
85+
for fname in bamFilesList:
86+
mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])
8187

8288
sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
8389

deeptools/alignmentSieve.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -205,17 +205,11 @@ def filterWorker(arglist):
205205
chrom, start, end, args, chromDict = arglist
206206
fh = openBam(args.bam)
207207

208-
if fh.is_cram:
209-
mode = 'wc'
210-
oname = getTempFileName(suffix='.cram')
211-
if args.filteredOutReads:
212-
onameFiltered = getTempFileName(suffix='.cram')
208+
mode = 'wbu'
209+
oname = getTempFileName(suffix='.bam')
210+
if args.filteredOutReads:
211+
onameFiltered = getTempFileName(suffix='.bam')
213212
else:
214-
mode = 'wbu'
215-
oname = getTempFileName(suffix='.bam')
216-
if args.filteredOutReads:
217-
onameFiltered = getTempFileName(suffix='.bam')
218-
if not args.filteredOutReads:
219213
onameFiltered = None
220214
ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)
221215
if onameFiltered:
@@ -382,8 +376,8 @@ def main(args=None):
382376
elif args.ATACshift:
383377
args.shift = [4, -5, 5, -4]
384378

385-
bam = openBam(args.bam)
386-
total = bam.mapped + bam.unmapped
379+
bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
380+
total = mapped + unmapped
387381
chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
388382
chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}
389383

deeptools/bamCompare.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def process_args(args=None):
158158
# while get_scale_factor is used for depth normalization
159159

160160

161-
def get_scale_factors(args):
161+
def get_scale_factors(args, statsList, mappedList):
162162

163163
if args.scaleFactors:
164164
scale_factors = list(map(float, args.scaleFactors.split(":")))
@@ -167,6 +167,7 @@ def get_scale_factors(args):
167167
[args.bamfile1, args.bamfile2],
168168
args.sampleLength, args.numberOfSamples,
169169
1,
170+
mappingStatsList=mappedList,
170171
blackListFileName=args.blackListFileName,
171172
numberOfProcessors=args.numberOfProcessors,
172173
verbose=args.verbose,
@@ -175,9 +176,6 @@ def get_scale_factors(args):
175176
scale_factors = scalefactors_dict['size_factors']
176177

177178
if args.verbose:
178-
bam1 = bamHandler.openBam(args.bamfile1)
179-
bam2 = bamHandler.openBam(args.bamfile2)
180-
181179
print("Size factors using SES: {}".format(scale_factors))
182180
print("%s regions of size %s where used " %
183181
(scalefactors_dict['sites_sampled'],
@@ -186,19 +184,17 @@ def get_scale_factors(args):
186184
print("ignoring filtering/blacklists, size factors if the number of mapped "
187185
"reads would have been used:")
188186
print(tuple(
189-
float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
190-
bam1.close()
191-
bam2.close()
187+
float(min(mappedList)) / np.array(mappedList)))
192188

193189
elif args.scaleFactorsMethod == 'readCount':
194190
# change the scaleFactor to 1.0
195191
args.scaleFactor = 1.0
196192
# get num of kept reads for bam file 1
197193
args.bam = args.bamfile1
198-
bam1_mapped, _ = get_num_kept_reads(args)
194+
bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
199195
# get num of kept reads for bam file 2
200196
args.bam = args.bamfile2
201-
bam2_mapped, _ = get_num_kept_reads(args)
197+
bam2_mapped, _ = get_num_kept_reads(args, statsList[1])
202198

203199
mapped_reads = [bam1_mapped, bam2_mapped]
204200

@@ -237,16 +233,22 @@ def main(args=None):
237233
print("Warning! RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare. Ignored..")
238234
args.effectiveGenomeSize = None
239235

240-
scale_factors = get_scale_factors(args)
236+
# Get mapping statistics
237+
bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
238+
bam1.close()
239+
bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
240+
bam2.close()
241+
242+
scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
241243
if scale_factors is None:
242244
# check whether one of the depth norm methods are selected
243245
if args.normalizeUsing is not None:
244246
args.scaleFactor = 1.0
245247
# if a normalization is required then compute the scale factors
246248
args.bam = args.bamfile1
247-
scale_factor_bam1 = get_scale_factor(args)
249+
scale_factor_bam1 = get_scale_factor(args, stats1)
248250
args.bam = args.bamfile2
249-
scale_factor_bam2 = get_scale_factor(args)
251+
scale_factor_bam2 = get_scale_factor(args, stats2)
250252
scale_factors = [scale_factor_bam1, scale_factor_bam2]
251253
else:
252254
scale_factors = [1, 1]

deeptools/bamCoverage.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from deeptools import writeBedGraph # This should be made directly into a bigWig
99
from deeptools import parserCommon
1010
from deeptools.getScaleFactor import get_scale_factor
11+
from deeptools.bamHandler import openBam
1112

1213
debug = 0
1314

@@ -148,7 +149,9 @@ def main(args=None):
148149

149150
if args.normalizeUsing:
150151
# if a normalization is required then compute the scale factors
151-
scale_factor = get_scale_factor(args)
152+
bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
153+
bam.close()
154+
scale_factor = get_scale_factor(args, stats)
152155
else:
153156
scale_factor = args.scaleFactor
154157

deeptools/bamHandler.py

Lines changed: 89 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,103 @@
11
import sys
22
import pysam
3+
from deeptools.mapReduce import mapReduce
34

45

5-
def openBam(bamFile):
6+
def countReadsInInterval(args):
7+
chrom, start, end, fname, toEOF = args
8+
9+
bam = openBam(fname)
10+
mapped = 0
11+
unmapped = 0
12+
for b in bam.fetch(chrom, start, end):
13+
if chrom == "*":
14+
unmapped += 1
15+
continue
16+
if b.pos < start:
17+
continue
18+
if not b.is_unmapped:
19+
mapped += 1
20+
else:
21+
unmapped += 1
22+
return mapped, unmapped, chrom
23+
24+
25+
def getMappingStats(bam, nThreads):
26+
"""
27+
This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless
28+
29+
This requires pysam > 0.13.0
30+
"""
31+
header = [(x, y) for x, y in zip(bam.references, bam.lengths)]
32+
res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads)
33+
34+
mapped = sum([x[0] for x in res])
35+
unmapped = sum([x[1] for x in res])
36+
stats = {x[0]: [0, 0] for x in header}
37+
for r in res:
38+
stats[r[2]][0] += r[0]
39+
stats[r[2]][1] += r[1]
40+
41+
# We need to count the number of unmapped reads as well
42+
unmapped += bam.count("*")
43+
44+
return mapped, unmapped, stats
45+
46+
47+
def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
48+
"""
49+
A wrapper for opening BAM/CRAM files.
50+
51+
bamFile: str
52+
A BAM/CRAM file name
53+
54+
returnStats: bool
55+
Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
56+
These additional values are needed by some downstream functions, since one
57+
can't use file_handle.mapped on CRAM files (or idxstats())
58+
59+
nThreads: int
60+
If returnStats is True, number of threads to use for computing statistics
61+
62+
minimalDecoding: Bool
63+
For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).
64+
65+
Returns either the file handle or a tuple as described in returnStats
66+
"""
67+
format_options = ["required_fields=0x1FF"]
68+
if sys.version_info.major >= 3:
69+
format_options = [b"required_fields=0x1FF"]
70+
if not minimalDecoding:
71+
format_options = None
672
try:
7-
bam = pysam.Samfile(bamFile, 'rb')
73+
bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
874
except IOError:
975
sys.exit("The file '{}' does not exist".format(bamFile))
1076
except:
11-
sys.exit("The file '{}' does not have BAM format ".format(bamFile))
77+
sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))
1278

1379
try:
14-
if 'check_index' in dir(bam):
15-
assert(bam.check_index() is not False)
16-
else:
17-
# The proper check_index() function wasn't implemented until pysam 0.8.4!
18-
assert(bam._hasIndex() is not False)
80+
assert(bam.check_index() is not False)
1981
except:
2082
sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))
2183

22-
if bam.mapped == 0:
23-
sys.exit("'{}' does not have any mapped reads. Please "
24-
"check that the file is properly indexed and "
25-
"the it containes mapped reads.".format(bamFile))
84+
if bam.is_cram and returnStats:
85+
mapped, unmapped, stats = getMappingStats(bam, nThreads)
86+
elif bam.is_bam:
87+
mapped = bam.mapped
88+
unmapped = bam.unmapped
89+
90+
# Make the dictionary to hold the stats
91+
if returnStats:
92+
stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}
93+
94+
if bam.is_bam or (bam.is_cram and returnStats):
95+
if mapped == 0:
96+
sys.exit("'{}' does not have any mapped reads. Please "
97+
"check that the file is properly indexed and "
98+
"that it contains mapped reads.".format(bamFile))
2699

27-
return bam
100+
if returnStats:
101+
return bam, mapped, unmapped, stats
102+
else:
103+
return bam

deeptools/computeGCBias.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ def main(args=None):
655655
global_vars['extra_sampling_file'] = extra_sampling_file
656656

657657
tbit = py2bit.open(global_vars['2bit'])
658-
bam = bamHandler.openBam(global_vars['bam'])
658+
bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)
659659

660660
if args.fragmentLength:
661661
fragment_len_dict = \
@@ -676,7 +676,7 @@ def main(args=None):
676676
chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
677677

678678
global_vars['genome_size'] = sum(tbit.chroms().values())
679-
global_vars['total_reads'] = bam.mapped
679+
global_vars['total_reads'] = mapped
680680
global_vars['reads_per_bp'] = \
681681
float(global_vars['total_reads']) / args.effectiveGenomeSize
682682

@@ -740,7 +740,7 @@ def __init__(self):
740740
self.mappability = self.root + "mappability.bw"
741741
self.chrNameBam = '2L'
742742
self.chrNameBit = 'chr2L'
743-
bam = bamHandler.openBam(self.bamFile)
743+
bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True)
744744
tbit = py2bit.open(self.tbitFile)
745745
global debug
746746
debug = 0
@@ -754,7 +754,7 @@ def __init__(self):
754754
'min_reads': 0,
755755
'min_reads': 0,
756756
'reads_per_bp': 0.3,
757-
'total_reads': bam.mapped,
757+
'total_reads': mapped,
758758
'genome_size': sum(tbit.chroms().values())
759759
}
760760

0 commit comments

Comments
 (0)