-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMapReads.py
175 lines (133 loc) · 5.11 KB
/
MapReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
MapReads.py: Runs tophat to map RNA-seq reads, and assigns them to the
appropriate species using a sub-program.
Does its best to automatically calculate things like filenames, etc, based only
on the given indices.
"""
import sys
from glob import glob
from os.path import join, abspath
import os
from time import time
from datetime import timedelta
from subprocess import Popen, PIPE, call
from argparse import Namespace
from pysam import view as samview
import pandas as pd
ARGS = Namespace()
ARGS.analysis_dir = abspath('analysis-multi')
ARGS.base_species = 'Dmel'
ARGS.seq_dir = abspath('sequence')
ARGS.config_file = abspath('Parameters/RunConfig.cfg')
ARGS.ref_base = abspath('Reference/AAA/')
ARGS.star_params = abspath('Parameters/STAR_params.in')
########################################################################
BASE = Namespace()
BASE.map_base = ('STAR --parametersFiles {params} '
'--genomeDir {genome} '
'--readFilesIn {rf1} {rf2}')
BASE.assign_base = 'nice python AssignReads2.py {fname}'
########################################################################
def process_config_file(cfg_fname):
""" Get data we want out of the configuration file"""
cfg_file = pd.read_table(cfg_fname, index_col='Label')
return cfg_file
def get_readfiles(args, mbepc, index):
" Find the names of the read files, based on configuration"
# Directory structure:
# Project_Eisen_M
# Sample_MBE_PC_64A_index1
# MBE_PC_64A_index1_ATCACG_L001_R1_001.fastq.gz
# MBE_PC_64A_index1_ATCACG_L001_R1_002.fastq.gz
# MBE_PC_64A_index1_ATCACG_L001_R1_003.fastq.gz
# ...
# MBE_PC_64A_index1_ATCACG_L001_R2_001.fastq.gz
# MBE_PC_64A_index1_ATCACG_L001_R2_002.fastq.gz
# MBE_PC_64A_index1_ATCACG_L001_R2_003.fastq.gz
# ...
# Sample_MBE_PC_64B_index2
# ...
print "Finding reads for", mbepc, 'index', index
seq_dir = join(args.seq_dir,
'Sample_MBEPC{}*_index{}'.format(mbepc, index))
read_1s = glob(join(seq_dir, "*_R1_*.fastq*"))
print read_1s
read_2s = glob(join(seq_dir, "*_R2_*.fastq*"))
print read_2s
return [[abspath(r) for r in sorted(read_1s)],
[abspath(r) for r in sorted(read_2s)]]
def count_reads(read_files):
''' Count the reads in the files'''
# Let's cheat and only look in the last file: the others should have 4M
# per...
wc_proc = Popen(['wc', '-l', read_files[-1]], stdout=PIPE)
wcout, _ = wc_proc.communicate()
first_reads = int(4e6) * (len(read_files) - 1)
result = str(wcout).splitlines()[-1].strip()
lines, _ = result.split()
last_reads = int(lines) / 4
return first_reads + last_reads
DATA = Namespace()
DATA.config_data = process_config_file(ARGS.config_file)
DATA.readnames = {}
DATA.samples = DATA.config_data.index
TIMES = Namespace()
TIMES.start = time()
TEMP = Namespace()
TEMP.assign_procs = []
TEMP.rezip_procs = []
#for libname, (rf1, rf2) in DATA.readnames.items():
for sample in DATA.config_data.index:
TEMP.sample_reads = get_readfiles(ARGS,
DATA.config_data['MBEPC'][sample],
DATA.config_data['Index'][sample])
DATA.readnames[sample] = TEMP.sample_reads
rf1, rf2 = TEMP.sample_reads
# Print the name of the files we're going through, as a progress bar
print '-'*72, '\n', sample, '\n', '-'*72
sys.stdout.flush()
# Skip dealing with the gzipped files... this has been in Tophat
# since version 1.3.0
TEMP.od = join(ARGS.analysis_dir, sample)
try:
os.makedirs(TEMP.od)
except OSError:
print ("Directory '%s' already exists... shouldn't be a problem" %
TEMP.od)
# Do mapping
print 'Tophatting...', '\n', '='*30
TEMP.old_dir = os.getcwd()
os.chdir(TEMP.od)
print DATA.config_data['CarrierSpecies'][sample]
TEMP.genome = join(ARGS.ref_base,
ARGS.base_species +
DATA.config_data['CarrierSpecies'][sample])
print TEMP.genome
TEMP.commandstr = BASE.map_base.format(
rf1 = ','.join(rf1),
rf2 = ','.join(rf2),
genome = TEMP.genome,
params = ARGS.star_params)
print TEMP.commandstr
sys.stdout.flush()
call(str(TEMP.commandstr).split())
print "Converting to bam"
print abspath('../../ToBamAssign.sh')
sys.stdout.flush()
TEMP.assign_procs.append(Popen(['sh' , '../../ToBamAssign.sh']))
os.chdir(TEMP.old_dir)
# Stop the timing
TIMES.end = time()
print "Mapping elapsed time", timedelta(seconds = TIMES.end - TIMES.start)
for proc in TEMP.assign_procs:
proc.wait()
print "Assignment extra time", timedelta(seconds = time() - TIMES.end)
print "Final time", timedelta(seconds=time() - TIMES.start)
import cPickle as pickle
call('STAR --parametersFiles {params} '
'--genomeDir {genome} '
'--genomeLoad Remove'.format(params=ARGS.star_params,
genome=join(ARGS.ref_base)
).split())
pickle.dump(dict(data=DATA, args=ARGS),
open('mapreads_dump.pkl', 'w'))