|
1 | 1 | import sys |
2 | 2 | import pysam |
| 3 | +from deeptools.mapReduce import mapReduce |
3 | 4 |
|
4 | 5 |
|
5 | | -def openBam(bamFile): |
| 6 | +def countReadsInInterval(args): |
| 7 | + chrom, start, end, fname, toEOF = args |
| 8 | + |
| 9 | + bam = openBam(fname) |
| 10 | + mapped = 0 |
| 11 | + unmapped = 0 |
| 12 | + for b in bam.fetch(chrom, start, end): |
| 13 | + if chrom == "*": |
| 14 | + unmapped += 1 |
| 15 | + continue |
| 16 | + if b.pos < start: |
| 17 | + continue |
| 18 | + if not b.is_unmapped: |
| 19 | + mapped += 1 |
| 20 | + else: |
| 21 | + unmapped += 1 |
| 22 | + return mapped, unmapped, chrom |
| 23 | + |
| 24 | + |
| 25 | +def getMappingStats(bam, nThreads): |
| 26 | + """ |
| 27 | + This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless |
| 28 | +
|
| 29 | + This requires pysam > 0.13.0 |
| 30 | + """ |
| 31 | + header = [(x, y) for x, y in zip(bam.references, bam.lengths)] |
| 32 | + res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads) |
| 33 | + |
| 34 | + mapped = sum([x[0] for x in res]) |
| 35 | + unmapped = sum([x[1] for x in res]) |
| 36 | + stats = {x[0]: [0, 0] for x in header} |
| 37 | + for r in res: |
| 38 | + stats[r[2]][0] += r[0] |
| 39 | + stats[r[2]][1] += r[1] |
| 40 | + |
| 41 | + # We need to count the number of unmapped reads as well |
| 42 | + unmapped += bam.count("*") |
| 43 | + |
| 44 | + return mapped, unmapped, stats |
| 45 | + |
| 46 | + |
| 47 | +def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True): |
| 48 | + """ |
| 49 | + A wrapper for opening BAM/CRAM files. |
| 50 | +
|
| 51 | + bamFile: str |
| 52 | + A BAM/CRAM file name |
| 53 | +
|
| 54 | + returnStats: bool |
| 55 | + Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict). |
| 56 | + These additional values are needed by some downstream functions, since one |
| 57 | + can't use file_handle.mapped on CRAM files (or idxstats()) |
| 58 | +
|
| 59 | + nThreads: int |
| 60 | + If returnStats is True, number of threads to use for computing statistics |
| 61 | +
|
| 62 | + minimalDecoding: Bool |
| 63 | + For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions). |
| 64 | +
|
| 65 | + Returns either the file handle or a tuple as described in returnStats |
| 66 | + """ |
| 67 | + format_options = ["required_fields=0x1FF"] |
| 68 | + if sys.version_info.major >= 3: |
| 69 | + format_options = [b"required_fields=0x1FF"] |
| 70 | + if not minimalDecoding: |
| 71 | + format_options = None |
6 | 72 | try: |
7 | | - bam = pysam.Samfile(bamFile, 'rb') |
| 73 | + bam = pysam.Samfile(bamFile, 'rb', format_options=format_options) |
8 | 74 | except IOError: |
9 | 75 | sys.exit("The file '{}' does not exist".format(bamFile)) |
10 | 76 | except: |
11 | | - sys.exit("The file '{}' does not have BAM format ".format(bamFile)) |
| 77 | + sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile)) |
12 | 78 |
|
13 | 79 | try: |
14 | | - if 'check_index' in dir(bam): |
15 | | - assert(bam.check_index() is not False) |
16 | | - else: |
17 | | - # The proper check_index() function wasn't implemented until pysam 0.8.4! |
18 | | - assert(bam._hasIndex() is not False) |
| 80 | + assert(bam.check_index() is not False) |
19 | 81 | except: |
20 | 82 | sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile)) |
21 | 83 |
|
22 | | - if bam.mapped == 0: |
23 | | - sys.exit("'{}' does not have any mapped reads. Please " |
24 | | - "check that the file is properly indexed and " |
25 | | - "the it containes mapped reads.".format(bamFile)) |
| 84 | + if bam.is_cram and returnStats: |
| 85 | + mapped, unmapped, stats = getMappingStats(bam, nThreads) |
| 86 | + elif bam.is_bam: |
| 87 | + mapped = bam.mapped |
| 88 | + unmapped = bam.unmapped |
| 89 | + |
| 90 | + # Make the dictionary to hold the stats |
| 91 | + if returnStats: |
| 92 | + stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()} |
| 93 | + |
| 94 | + if bam.is_bam or (bam.is_cram and returnStats): |
| 95 | + if mapped == 0: |
| 96 | + sys.exit("'{}' does not have any mapped reads. Please " |
| 97 | + "check that the file is properly indexed and " |
| 98 | + "that it contains mapped reads.".format(bamFile)) |
26 | 99 |
|
27 | | - return bam |
| 100 | + if returnStats: |
| 101 | + return bam, mapped, unmapped, stats |
| 102 | + else: |
| 103 | + return bam |
0 commit comments