Skip to content

Commit

Permalink
Merge pull request #338 from CGATOxford/AC_strand_script
Browse files Browse the repository at this point in the history
Ac strand script
  • Loading branch information
sebastian-luna-valero committed Jun 23, 2017
2 parents a0f3ac4 + e17d212 commit 9ede7d9
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 0 deletions.
151 changes: 151 additions & 0 deletions CGAT/scripts/bam2libtype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""bam2libtype.py - determine the library type of a bam file
============================================================
Author: Adam Cribbs
Purpose
-------
This tool determines the library type of a BAM file. The naming
convention used is from the salmon documentation:
http://salmon.readthedocs.io/en/latest/library_type.html.
BAM files need to have a corresponding index file i.e. example.bam
and example.bam.bai
Usage
-----
cat example.bam | cgat bam2libtype > out.tsv
options
-------
There are no options for this script, just pass the script a bam file
as the stdin and an outfile as the stdout.
Type::
python bam2bed.py --help
for command line help.
Command line options
--------------------
"""

import sys
import pysam
import CGAT.Experiment as E


def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""

if not argv:
argv = sys.argv

# setup command line parser
parser = E.OptionParser(
version="%prog version: $Id$", usage=globals()["__doc__"])

(options, args) = E.Start(parser, argv=argv)

samfile = pysam.AlignmentFile(options.stdin, "rb")
outfile = options.stdout

# initialise counts for each library type
MSR = 0
MSF = 0
ISF = 0
ISR = 0
OSF = 0
OSR = 0
SR = 0
SF = 0

for read in samfile:

# to handle paired end reads:
if read.is_paired and read.is_proper_pair:

# get attributes of read
read_start = read.reference_start
read_end = read.reference_end
read_neg = read.is_reverse

# specify which read is R1 and which is R2:
# specify which read is R1 and which is R2:
if read.is_read1 is True:
R1_is_reverse = read.is_reverse
R1_reference_start = read.reference_start

R2_is_reverse = read.mate_is_reverse
R2_reference_start = read.next_reference_start
else:
R1_is_reverse = read.mate_is_reverse
R1_reference_start = read.next_reference_start

R2_is_reverse = read.is_reverse
R2_reference_start = read.reference_start

# Decision tree to specify strandness:
# potential to convert this to a machine learning
# decision tree algorithm in the future:
if R1_is_reverse is True:

if R2_is_reverse is True:

MSF += 1
else:
if R2_reference_start - R1_reference_start >= 0:
OSR += 1
else:
ISR += 1

else:

if R2_is_reverse is True:

if R1_reference_start - R2_reference_start >= 0:

OSF += 1
else:
ISF += 1
else:
MSR += 1
else:
if read.is_reverse:
SR += 1
else:
SF += 1

total = MSR + ISR + OSR + ISF + MSF + OSF + SF + SR

def total_percent(strand, total):
return float(strand)/float(total)*100

MSR_total = total_percent(MSR, total)
ISR_total = total_percent(ISR, total)
OSR_total = total_percent(OSR, total)
ISF_total = total_percent(ISF, total)
MSF_total = total_percent(MSF, total)
OSF_total = total_percent(OSF, total)
SF_total = total_percent(SF, total)
SR_total = total_percent(SR, total)

outfile.write("MSR\tISR\tOSR\tISF\tMSF\tOSF\tSF\tSR\n")
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
(int(MSR_total), int(ISR_total), int(OSR_total),
int(ISF_total), int(MSF_total),
int(OSF_total), int(SF_total), int(SR_total)))

E.Stop()

if __name__ == "__main__":
sys.exit(main(sys.argv))
1 change: 1 addition & 0 deletions tests/bam2libtype.py/paired.bam
16 changes: 16 additions & 0 deletions tests/bam2libtype.py/paired_IU.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# output generated by bam2libtype
# job started at Thu Jun 22 19:03:14 2017 on cgat057.anat.ox.ac.uk -- 5bab66f9-4752-4eae-b579-033086913e56
# pid: 4943, system: Linux 2.6.32-696.1.1.el6.x86_64 #1 SMP Tue Apr 11 08:56:50 CDT 2017 x86_64
# loglevel : 1
# random_seed : None
# short_help : None
# stderr : <open file '<stderr>', mode 'w' at 0x2b75db3511e0>
# stdin : <open file '<stdin>', mode 'r' at 0x2b75db3510c0>
# stdlog : <open file '<stdout>', mode 'w' at 0x2b75db351150>
# stdout : <open file '<stdout>', mode 'w' at 0x2b75db351150>
# timeit_file : None
# timeit_header : None
# timeit_name : all
MSR ISR OSR ISF MSF OSF SF SR
0 49 0 47 0 0 1 1
# job finished in 0 seconds at Thu Jun 22 19:03:14 2017 -- 0.43 0.60 0.00 0.00 -- 5bab66f9-4752-4eae-b579-033086913e56
Binary file added tests/bam2libtype.py/single.bam
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/bam2libtype.py/single_U.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# output generated by bam2libtype
# job started at Thu Jun 22 19:03:53 2017 on cgat057.anat.ox.ac.uk -- f3083ee5-cbde-4e32-a43e-eafdb5ba42ec
# pid: 4965, system: Linux 2.6.32-696.1.1.el6.x86_64 #1 SMP Tue Apr 11 08:56:50 CDT 2017 x86_64
# loglevel : 1
# random_seed : None
# short_help : None
# stderr : <open file '<stderr>', mode 'w' at 0x2b86133961e0>
# stdin : <open file '<stdin>', mode 'r' at 0x2b86133960c0>
# stdlog : <open file '<stdout>', mode 'w' at 0x2b8613396150>
# stdout : <open file '<stdout>', mode 'w' at 0x2b8613396150>
# timeit_file : None
# timeit_header : None
# timeit_name : all
MSR ISR OSR ISF MSF OSF SF SR
0 0 0 0 0 0 51 48
# job finished in 0 seconds at Thu Jun 22 19:03:53 2017 -- 1.27 0.71 0.00 0.01 -- f3083ee5-cbde-4e32-a43e-eafdb5ba42ec
19 changes: 19 additions & 0 deletions tests/bam2libtype.py/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

version:
stdin: null
outputs: [stdout]
references: []
options: --version

paired_lib:
stdin: paired.bam
outputs: [stdout]
references: [paired_IU.txt]
options:

single_lib:
stdin: single.bam
outputs: [stdout]
references: [single_U.txt]
options:

0 comments on commit 9ede7d9

Please sign in to comment.