-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathocp_cnv_report.pl
executable file
·420 lines (366 loc) · 13 KB
/
ocp_cnv_report.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
#!/usr/bin/env perl
# Read in VCF file from IR and output called CNVs
# 9/20/2014 - D Sims
#################################################################################################
use warnings;
use strict;
use autodie;
use Getopt::Long qw( :config bundling auto_abbrev no_ignore_case );
use File::Basename;
use Sort::Versions;
use JSON -support_by_pp;
use Parallel::ForkManager;
use Data::Dump;
use Term::ANSIColor;
use constant DEBUG => 0;
my $scriptname = basename($0);
my $version = "v3.4.053119";
# Remove when in prod.
#print "\n";
#print colored("*" x 75, 'bold yellow on_black'), "\n";
#print colored(" DEVELOPMENT VERSION OF $scriptname\n", 'bold yellow on_black');
#print colored("*" x 75, 'bold yellow on_black');
#print "\n\n";
my $description = <<"EOT";
Input one more more VCF files from IR output and generate a report of called
CNVs. Can print anything called a CNV, or filter based on gene name, copy
number, number of tiles, or hotspot calls. Also can output a raw formatted table
that is easily imported into Excel and R for downstream analysis.
EOT
my $usage = <<"EOT";
USAGE: $scriptname [options] <VCF_file(s)>
Filter Options
--cn INT Only report amplifications above this threshold (DEFAULT:
CN >= 0)
--cu INT Upper bound for amplifications based on 5% CI (DEFAULT:
5% CI >= 4)
--cl INT Lower bound for deletions based on 95% CI (DEFAULT:
95% CI <= 1)
-n, --novel Print non-HS CNVs (Default = OFF)
-g, --gene Print out results for this gene only. Can also input a list
of comma separated gene names to search
-t, --tiles Only print out results for CNVs with at least this many
tiles.
-a, --annot Only print CNVs with Oncomine Annotations.
-p, --procs Number of processor cores to use to process files in
parallel. (Default: 48).
-N, --NOCALL Do not output NOCALL results (Default: OFF)
Output Options
-o, --output Send output to custom file. Default is STDOUT.
-f, --format Format to use for output. Can choose 'csv', 'tsv', or
pretty print (as 'pp'). DEFAULT: pp.
This format still retains the header and other data.
-r, --raw Output as a raw CSV format that can be input into Excel.
The header output is not in columns as
a part of the whole.
-v, --version Version information
-h, --help Print this help information
EOT
my $help;
my $ver_info;
my $outfile;
my $novel;
my $copy_number = 0;
my $cu;
my $cl;
my $geneid;
my $tiles;
my $annot;
my $nocall;
my $format = 'pp';
my $raw_output;
my $num_procs = 48;
GetOptions( "novel|n" => \$novel,
"copy-number|cn=f" => \$copy_number,
"upper-copies|cu=f" => \$cu,
"lower-copies|cl=f" => \$cl,
"gene|g=s" => \$geneid,
"tiles|t=i" => \$tiles,
"annot|a" => \$annot,
"output|o=s" => \$outfile,
"format|f=s" => \$format,
"raw|r" => \$raw_output,
"NOCALL|N" => \$nocall,
"procs|p=i" => \$num_procs,
"version|v" => \$ver_info,
"help|h" => \$help )
or die $usage;
sub help {
printf "%s - %s\n\n%s\n\n%s\n", $scriptname, $version, $description, $usage;
exit;
}
sub version {
printf "%s - %s\n", $scriptname, $version;
exit;
}
help if $help;
version if $ver_info;
# We don't need both cn and cu or cl
if ($cl or $cu) {
undef $copy_number;
}
my @genelist = split(/,/, $geneid) if $geneid;
my %filters = (
'cn' => $copy_number,
'cu' => $cu,
'cl' => $cl,
'gene' => [@genelist],
'tiles' => $tiles,
'annot' => $annot,
'novel' => $novel,
);
if (DEBUG) {
print '='x35, ' DEBUG ', '='x35, "\n";
print "Filters being employed\n";
while (my ($keys, $values) = each %filters) {
$values //= 'undef';
if ($keys eq 'gene') {
printf "\t%-7s => %s\n",$keys,join(',',@$values);
} else {
printf "\t%-7s => %s\n",$keys,$values;
}
}
print '='x79, "\n";
}
my %formats = (
'csv' => ',',
'tsv' => "\t",
'pp' => '',
);
# Set the output format delimiter
my $delimiter;
unless (defined $formats{$format}) {
die "ERROR: '$format' is not a valid option as a delimiter!\n";
}
($format) ? ($delimiter = $formats{$format}) : ($delimiter = $formats{pp});
# Make sure enough args passed to script
if ( scalar( @ARGV ) < 1 ) {
print "ERROR: No VCF files passed to script!\n\n";
print "$usage\n";
exit 1;
}
# Write output to either indicated file or STDOUT
my $out_fh;
if ( $outfile ) {
open( $out_fh, ">", $outfile );
} else {
$out_fh = \*STDOUT;
}
#########--------------------- END ARG Parsing -----------------------#########
my %cnv_data;
my @vcfs = @ARGV;
my $pm = Parallel::ForkManager->new($num_procs);
$pm->run_on_finish(
sub {
my ($pid, $exit_code, $ident, $exit_signal, $core_dump,
$data_structure_reference) = @_;
my $vcf = $data_structure_reference->{input};
my $name = $data_structure_reference->{id};
$name //= basename($vcf);
$cnv_data{$$name} = $data_structure_reference->{result};
}
);
for my $input_file (@vcfs) {
$pm->start and next;
my ($return_data, $sample_id) = proc_vcf(\$input_file);
$pm->finish(0,
{
result => $return_data,
input => $input_file,
id => $sample_id,
}
);
}
$pm->wait_all_children;
#dd \%cnv_data;
# exit;
my %results;
for my $sample ( keys %cnv_data ) {
$results{$sample} = [];
my @outfields = qw( END LEN NUMTILES RAW_CN REF_CN CN HS FUNC );
for my $cnv ( sort { versioncmp ( $a, $b ) } keys %{$cnv_data{$sample}} ) {
my %mapped_cnv_data;
last if $cnv eq 'NONE';
# Seems to be a bug in the same the CI are reported for deletions.
# Solution correctly reports the value in the VCF, but it's not so
# informative. This will give a better set of data.
my ($ci_5, $ci_95) = $cnv_data{$sample}->{$cnv}->{'CI'} =~ /0\.05:(.*?),0\.95:(.*)$/;
my ($chr, $start, $gene, undef) = split( /:/, $cnv );
%mapped_cnv_data = map{ $_ => $cnv_data{$sample}->{$cnv}->{$_} } @outfields;
@mapped_cnv_data{qw(ci_05 ci_95)} = (sprintf("%.2f",$ci_5),
sprintf("%.2f",$ci_95));
@mapped_cnv_data{qw(chr start gene undef)} = split(/:/, $cnv);
$mapped_cnv_data{HS} //= 'No';
$mapped_cnv_data{ci_05} //= 0;
$mapped_cnv_data{ci_95} //= 0;
# Get OVAT Annot Data
my ($gene_class, $variant_class);
my $func = $mapped_cnv_data{FUNC};
if ( $func && $func =~ /oncomine/ ) {
my $json_annot = JSON->new->allow_singlequote->decode($func);
my $parsed_annot = $$json_annot[0];
$gene_class = $$parsed_annot{'oncomineGeneClass'};
$variant_class = $$parsed_annot{'oncomineVariantClass'};
} else {
$gene_class = $variant_class = '---';
}
$mapped_cnv_data{GC} = $gene_class;
$mapped_cnv_data{VC} = $variant_class;
my @filtered_data = filter_results(\%mapped_cnv_data, \%filters);
push(@{$results{$sample}}, \@filtered_data) if @filtered_data;
}
}
print_results(\%results, $delimiter);
sub print_results {
my ($data, $delimiter) = @_;
my @header = qw( Chr Gene Start End Length Tiles Raw_CN Ref_CN CI_05 CI_95
CN Annot );
my $pp_format = "%-8s %-8s %-11s %-11s %-11s %-8s %-8s %-8s %-8s %-8s " .
"%-8s %-18s\n";
select $out_fh;
# Print out comma separated dataset for easy import into Excel and whatnot.
raw_output($data) if $raw_output;
for my $sample (keys %$data) {
my ($id, $gender, $mapd, $cellularity) = split( /:/, $sample );
print "::: CNV Data For $id (Gender: $gender, Cellularity: ",
"$cellularity, MAPD: $mapd) :::\n";
($delimiter)
? print join($delimiter, @header),"\n"
: printf $pp_format, @header;
if ( ! @{$$data{$sample}} ) {
print ">>>> No Reportable CNVs Found in Sample <<<<\n";
} else {
for my $cnv (@{$$data{$sample}}) {
($delimiter)
? print join($delimiter, @$cnv), "\n"
: printf $pp_format, @$cnv;
}
}
print "\n";
}
return;
}
sub raw_output {
my $data = shift;
my @header = qw( Sample Gender Cellularity MAPD Chr Gene Start End Length
Tiles Raw_CN Ref_CN CI_05 CI_95 CN Annot );
select $out_fh;
print join(',', @header), "\n";
for my $sample (keys %$data) {
my @elems = split(/:/, $sample);
for my $cnv (@{$$data{$sample}}) {
print join(',', @elems, @$cnv), "\n";
}
}
exit;
}
sub filter_results {
# Filter out CNV data prior to printing it all out.
my ($data, $filters) = @_;
my @cn_thresholds = @$filters{qw(cn cu cl)};
# Gene level filter
return if (@{$filters{gene}}) and ! grep {$$data{gene} eq $_} @{$filters{gene}};
# Filter non-hotspots and novel if we don't want them.
return if ! $$filters{novel} and ($$data{HS} eq 'No' || $$data{gene} eq '.');
# Number of tiles filter
return if ($$filters{tiles} and $$data{NUMTILES} < $$filters{tiles});
# OVAT Filter
return if ($$filters{annot} and $$data{GC} eq '---');
# We made it the whole way through; check for copy number thresholds
(copy_number_filter($data, \@cn_thresholds))
? return return_data($data)
: return;
}
sub return_data {
my $data = shift;
my @fields = qw(chr gene start END LEN NUMTILES RAW_CN REF_CN ci_05 ci_95
CN GC);
return @$data{@fields};
}
sub copy_number_filter {
# if there is a 5% / 95% CI filter, use that, otherwise if there's a cn
# filter use that, and if there's nothing, just return it all.
my ($data, $threshold) = @_;
my ($cn, $cu, $cl) = @$threshold;
if ($cn) {
return 1 if ($$data{CN} >= $cn);
}
elsif ($cu) {
return 1 if $cl and ($$data{ci_05} >= $cu || $$data{ci_95} <= $cl);
return 1 if ($$data{ci_05} >= $cu);
}
elsif ($cl) {
return 1 if $cu and ($$data{ci_05} >= $cu || $$data{ci_95} <= $cl);
return 1 if ($$data{ci_95} <= $cl);
} else {
# Return everything if there are no filters.
return 1;
}
return 0;
}
sub proc_vcf {
my $vcf = shift;
my ($sample_id, $gender, $mapd, $cellularity, $sample_name);
my %results;
open( my $vcf_fh, "<", $$vcf);
while (<$vcf_fh>) {
if ( /^##/ ) {
if ( $_ =~ /sampleGender=(\w+)/ ) {
$gender = $1 and next;
}
# Need to add to accomodate the new CNV plugin; may not have the
# same field as the normal IR data.
if ($_ =~ /AssumedGender=([mf])/) {
($1 eq 'm') ? ($gender='Male') : ($gender='Female');
next;
}
elsif ( $_ =~ /mapd=(\d\.\d+)/ ) {
$mapd = $1 and next;
}
elsif ( $_ =~ /CellularityAsAFractionBetween0-1=(.*)$/ ) {
$cellularity = $1 and next;
}
}
my @data = split;
if ( $data[0] =~ /^#/ ) {
$sample_name = $data[-1] and next;
}
$sample_id = join( ':', $sample_name, $gender, $mapd, $cellularity );
next unless $data[4] eq '<CNV>';
# Let's handle NOCALLs for MATCHBox compatibility (prefer to filter on
# my own though).
if ($nocall && $data[6] eq 'NOCALL') {
${$cnv_data{$sample_id}->{'NONE'}} = '';
next;
}
my $varid = join( ':', @data[0..3] );
# Will either get a HS entry in the line, or nothing at all. Kludgy, but
# need to deal with hotspots (HS) field; not like others (i.e. not a
# key=val struct)!
$data[7] =~ s/HS/HS=Yes/;
# New v5.2 standard deviation value; sometimes data in this field and
# sometimes not.
$data[7] =~ s/SD;/SD=NA;/;
my @format = split( /;/, $data[7] );
my ($cn) = $data[9] =~ /:([^:]+)$/;
push( @format, "CN=$cn" );
%{$results{$varid}} = map { split /=/ } @format;
}
if (DEBUG) {
print "="x35, " DEBUG ", "="x35, "\n";
print "\tSample Name: $sample_name\n";
print "\tCellularity: $cellularity\n";
print "\tGender: $gender\n";
print "\tMAPD: $mapd\n";
print "="x79, "\n";
}
return \%results, \$sample_id;
}
sub __exit__ {
my ($line, $msg) = @_;
print "\n\n";
print colored("Got exit message at line: $line with message:\n$msg",
'bold white on_green');
print "\n";
exit;
}