forked from deweylab/RSEM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rsem-run-prsem-testing-procedure
executable file
·324 lines (224 loc) · 13.1 KB
/
rsem-run-prsem-testing-procedure
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env perl
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use FindBin;
use lib $FindBin::RealBin;
use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
use Env qw(@PATH);
@PATH = ($FindBin::RealBin, "$FindBin::RealBin/sam", @PATH);
use strict;
use warnings;
#const
my $status = 0;
my $bowtie_path = "";
my $nThreads = 1;
my $quiet = 0;
my $help = 0;
my $keep_intermediate_files = 0;
my $version = 0;
my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ('') x 7;
my $chipseq_target_read_files = '';
my $chipseq_control_read_files = '';
my $chipseq_peak_file = '';
my $partition_model = 'pk';
my $chipseq_read_files_multi_targets = ''; ## read files for multiple targets
## delimited by comma
my $chipseq_bed_files_multi_targets = ''; ## BED files for multiple targets
## delimited by comma
GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
"temporary-folder=s" => \$temp_dir,
"bowtie-path=s" => \$bowtie_path,
"p|num-threads=i" => \$nThreads,
'chipseq-target-read-files=s' => \$chipseq_target_read_files,
## delimited by comma if more than one
'chipseq-control-read-files=s' => \$chipseq_control_read_files,
## delimited by comma if more than one
'chipseq-read-files-multi-targets=s' => \$chipseq_read_files_multi_targets,
## delimited by comma
'chipseq-bed-files-multi-targets=s' => \$chipseq_bed_files_multi_targets,
## delimited by comma
'chipseq-peak-file=s' => \$chipseq_peak_file,
'partition-model=s' => \$partition_model,
"version" => \$version,
"q|quiet" => \$quiet,
"h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
pod2usage(-verbose => 2) if ($help == 1);
&showVersionInfo($FindBin::RealBin) if ($version == 1);
#check parameters and options
{
my $msg = '';
if ( ( $chipseq_peak_file eq '' ) &&
( ( $chipseq_target_read_files eq '' ) ||
( $chipseq_control_read_files eq '' ) ||
( $bowtie_path eq '' ) ) &&
( ( $chipseq_read_files_multi_targets eq '' ) ||
( $bowtie_path eq '' ) ) &&
( $chipseq_bed_files_multi_targets eq '' )
) {
$msg = "please define one set of the following options to run pRSEM's testing procedure:\n" .
"1. --chipseq-peak-file <file>\n" .
"2. --chipseq-target-read-files <file> and\n" .
" --chipseq-control-read-files <file> and\n" .
" --bowtie-path <path>\n" .
"3. --chipseq-read-files-multi-targets <files> and\n" .
" --bowtie-path <path>\n" .
"4. --chipseq-bed-files-multi-targets <files>\n";
}
my @prsem_partition_models = ( 'pk', 'cmb_lgt' );
my %prtmdl2one = ();
foreach my $prtmdl (@prsem_partition_models) {
$prtmdl2one{$prtmdl} = 1;
}
if ( exists $prtmdl2one{$partition_model} ) {
if ( ( $partition_model eq 'cmb_lgt' ) &&
( ( $chipseq_read_files_multi_targets eq '' ) &&
( $chipseq_bed_files_multi_targets eq '' ) ) ){
$msg = 'either --chipseq-read-files-multi-targets <files> or ' .
'--chipseq-bed-files-multi-targets <files> needs to be ' .
"defined for pRSEM's partition model: '$partition_model'";
} elsif ( ( $partition_model ne 'pk' ) &&
( $partition_model ne 'cmb_lgt' ) &&
( ( $chipseq_target_read_files eq '' ) ||
( $chipseq_control_read_files eq '' ) ||
( $bowtie_path eq '' ) ) ){
$msg = '--chipseq-target-read-files <file> and ' .
'--chipseq-control-read-files <file> and ' .
'--bowtie-path <path> need to be defined for ' .
"pRSEM's partition model: '$partition_model'";
}
} else {
my $s_prt_mdls = join(', ', @prsem_partition_models);
$msg = "\n--partition-model <string> must be one of [$s_prt_mdls]\n" .
"pRSEM's testing procedure only supports the above partition models";
}
if ( $msg ne '' ) {
pod2usage(-msg => "$msg\n", -exitval => 2, -verbose => 2);
}
if ( ( $partition_model ne 'cmb_lgt' ) &&
( ( $chipseq_read_files_multi_targets ne '' ) ||
( $chipseq_bed_files_multi_targets ne '' ) ) ) {
print "\nCombining signals from multiple sources, partition model is set to 'cmb_lgt'\n\n";
$partition_model = 'cmb_lgt';
}
}
$refName = $ARGV[0];
$sampleName = $ARGV[1];
my $pos = rindex($sampleName, '/');
if ($pos < 0) { $sampleToken = $sampleName; }
else { $sampleToken = substr($sampleName, $pos + 1); }
if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
$stat_dir = "$sampleName.stat";
if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
$imdName = "$temp_dir/$sampleToken";
$statName = "$stat_dir/$sampleToken";
if ($bowtie_path ne "") { $bowtie_path .= "/"; }
my $command = "";
{
$command = "$FindBin::RealBin/pRSEM/prsem-testing-procedure " .
" --num-threads $nThreads " .
" --partition-model $partition_model ";
## ChIP-seq peak file from single source
if ( $chipseq_peak_file ne '') { ## only for partition model pk
## need to add sanity check!!
$command .= " --chipseq-peak-file $chipseq_peak_file";
} elsif ( $partition_model eq 'cmb_lgt' ) { ## multi-sources
if ( $chipseq_bed_files_multi_targets ne '' ) { ## use bed over read
$command .= ' --chipseq-bed-files-multi-targets ' .
$chipseq_bed_files_multi_targets;
} elsif ( $chipseq_read_files_multi_targets ne '' ) {
$command .= ' --chipseq-read-files-multi-targets ' .
$chipseq_read_files_multi_targets .
" --bowtie-path $bowtie_path" ;
}
} else { ## ChIP-seq reads files from single source
$command .= " --chipseq-target-read-files $chipseq_target_read_files " .
" --bowtie-path $bowtie_path" ;
if ( $chipseq_control_read_files ne '' ) {
$command .= " --chipseq-control-read-files $chipseq_control_read_files";
}
}
if ( $quiet ) {
$command .= ' --quiet ';
}
$command .= " $refName $sampleName $statName $imdName";
&runCommand($command);
}
if (!$keep_intermediate_files) {
&runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!");
}
__END__
=head1 NAME
rsem-run-prsem-testing-procedure
=head1 SYNOPSIS
rsem-run-prsem-testing-procedure [options] reference_name sample_name
=head1 ARGUMENGS
=over
=item B<reference_name>
The name of the reference used. Users must run 'rsem-prepare-reference' with this reference_name and with '--prep-pRSEM' specified before running this program.
=item B<sample_name>
The name of the sample analyzed. Users must run 'rsem-calculate-expression' with this sample_name and with RSEM's default Gibbs sampling performed before running this program.
=back
=head1 BASIC OPTIONS
=over
=item B<--bowtie-path> <string>
The path to the Bowtie executables. (Default: the path to the Bowtie executables is assumed to be in the user's PATH environment variable)
=item B<--chipseq-target-read-files> <string>
Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option provides information to calculate ChIP-seq peaks and signals. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-control-read-files <string>' must be defined when this option is specified. (Default: "")
=item B<--chipseq-control-read-files> <string>
Comma-separated full path of FASTQ read file(s) for ChIP-seq conrol. This option provides information to call ChIP-seq peaks. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-target-read-files <string>' must be defined when this option is specified. (Default: "")
=item B<--chipseq-read-files-multi-targets> <string>
Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--bowtie-path <path>' must be defined and the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
=item B<--chipseq-bed-files-multi-targets> <string>
Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
=item B<--chipseq-peak-file> <string>
Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4 format. This file is used in prior-enhanced RSEM's default two-partition model. It partitions isoforms by whether they have ChIP-seq overlapping with their transcription start site region or not. Each partition will have its own prior parameter learned from a training set. This file can be either gzipped or ungzipped. (Default: "")
=item B<--partition-model> <string>
A keyword to specify the partition model. It must be either 'pk' or 'cmb_lgt'. For details, please see the help document of 'rsem-calculate-expression'.
=item B<-p|--num-threads> <int>
Number of threads to use. (Default: 1)
=item B<--version>
Show version information.
=item B<-q|--quiet>
Suppress the output of logging information. (Default: off)
=item B<-h|--help>
Show help information.
=back
=head1 ADVANCED OPTIONS
=over
=item B<--keep-intermediate-files>
Keep temporary files generated by RSEM and this testing procedure. RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. By default, after this test is finished, the temporary directory is deleted. Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
=item B<--temporary-folder> <string>
Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
=back
=head1 DESCRIPTION
This program provides users a p-value and a log-likelihood to determine whether external data set(s) is informative and how informative it is for RNA-seq quantification. It is used in conjunction with prior-enhanced RSEM to let user select the most effective external data set(s).
Users can run this program repetitively with different external data. All p-values and log-likelihoods will be saved in an output file 'sample_name.pval_LL'.
=head1 NOTES
Users must run 'rsem-prepare-reference' with the appropriate referece and with the option '--prep-pRSEM' before using this program
Users must run 'rsem-calculate-expression' with the option '--calc-pme' before using this program
The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
=head1 OUTPUT
=over
=item B<sample_name.pval_LL>
This file contains partition model's name, basename(s) of external data set file(s), p-value, and log-likelihood delimited by tab. When this program is ran repetiively, output will be concatenated to the end of this file without removing previous results.
=back
The following output files are the same as the ones generated by 'rsem-calculate-expression' with prior-enhanced RSEM. Please refer to the help document of 'rsem-calculate-expression' for details
=over 2
=item B<sample_name.stat/sample_name.all_tr_features>
=item B<sample_name.stat/sample_name.all_tr_prior>
=item B<sample_name.stat/sample_name.lgt_mdl.RData>
=item B<sample_name.stat/sample_name.pval_LL>
=item B<sample_name.stat/sample_name_uniform_prior_1.isoforms.results>
=item B<sample_name.stat/sample_name_uniform_prior_1.genes.results>
=back
=head1 EXAMPLE
Assuming RSEM reference files are under '/ref' with name 'mouse_125' and expression files are under '/expr' with name 'mouse_125'. Suppose we want to derive prior from four histone modification ChIP-seq read data sets: '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming Bowtie's executables are under '/sw/bowtie/' and we want to use 16 cores:
rsem-run-prsem-testing-procedure --partition-model cmb_lgt \
--chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
--bowtie-path /sw/bowtie \
--num-threads 16 \
/ref/mouse_125 \
/expr/mouse_125
=cut