-
Notifications
You must be signed in to change notification settings - Fork 1
/
evaluate.pl
259 lines (236 loc) · 14.2 KB
/
evaluate.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/perl -w
########################################################################################################################################################
# Evaluate.pl is perl script that concatenates split read across all samples and evaluate the read coverage and distribution that cross the breakpoint
# Author: Sen ZHAO, [email protected]
# evaluate.pl uses the outputs of Select_read.pl as inputs. Before you start evaluate.pl, please run Select_read.pl first
########################################################################################################################################################
use strict;
use warnings;
use Getopt::Std;
use Getopt::Long;
use File::Basename;
use File::Spec;
# setup option parameters in the script command line
my @usage;
push @usage, "Usage: ".basename($0)." [options]\n";
push @usage, "Evaluate split reads that support a given fusion breakpoint.\n";
push @usage, " --help Displays this information\n";
push @usage, " --input Set input path: it is the output path of select_read.pl (it contains all the samples that you are going to evaluate)\n";
push @usage, " --output Set output path\n";
my $help;
my $input;
my $output;
GetOptions
(
'help' => \$help,
'input=s' => \$input,
'output=s' => \$output
);
not defined $help or die @usage;
defined $input or die @usage;
defined $output or die @usage;
# setup output folder status
if ( -e $output ) {
`rm -r $output/`; print "\nwarning: $output exists, and the previous files will be over-written\n";
`mkdir $output`;
} else {
`mkdir $output`;
}
my %break_num; # data structure: $break_num{$break}{$seq_length} = [$break_start, $read_length];
# $break = unique breakpoint name (e.g. alt_1)
# $seq_length = the length of scaffold sequence
# [$break_start: the start position of breakpoint in the scaffold sequence; $read_length: the length of pairend reads; $scaffold_name: the scaffold name of one given sample]
my %summary; # summarize spanning, discordant_split, singlton_split reads across all samples
if ( -e "$input" ) {
open (INPUT, "ls $input |") || die "main $input path is wrong\n";
while ( <INPUT> ) {
chomp $_; my $name = $_; # sample name
if ( -d "$input/$name" ) { # judge whether sample is a folder
open (BK, "ls $input/$name |") || die "main $input/$name path is wrong\n";
while ( <BK> ) {
chomp $_; my $break = $_; print "### $name && $break ###\n"; # name of breakpoint
# if breakpoint folder not present in output path, please create it.
if (! -e "$output/$break") { `mkdir $output/$break`; }
if ( -d "$input/$name/$break" ) { # judge whether breakpoint is folder for input path
if ( -e "$input/$name/$break/final_split_1.txt" ) { # judge whether final_split_1/_2 file present
if (! -z "$input/$name/$break/final_split_1.txt" ) { # judge whether final_split_1/_2 is not zero
`cat $input/$name/$break/final_split_1.txt >> $output/$break/All_sample_split_1.txt`;
`cat $input/$name/$break/final_split_2.txt >> $output/$break/All_sample_split_2.txt`;
}
# manage the breakpoint sequence: extract three important paramters (breakpoint_start, scaffold_length, read_length)
my $tmp = `ls $input/$name/$break/scaffold_*_seq.fa`; chomp $tmp;;
my $file_name = (split /\//, $tmp)[-1]; # the last element
my @file_name_parameter = (split /_/, $file_name);
if ( scalar(@file_name_parameter) == 6 ) {
my ($break_start, $read_length) = ($file_name_parameter[2], $file_name_parameter[4]);
if ( defined($break_start) && defined($read_length) ) {
my $sequence = `grep -A1 'scaffold' $tmp`; chomp $sequence; $sequence = (split /\n/, $sequence)[1];
my $seq_length = length($sequence);
$break_num{$break}{$seq_length} = [$break_start, $read_length, $tmp, $sequence];
#print "Breakpoint_sequence: $break\t$seq_length\t$break_start, $read_length, $tmp, $sequence\n";
} else {
print "scaffold name has been changed, please implement original one: scaffold_TRANSCRIPTION1*_XX_TRANSCRIPTION2*_YY_seq.fa\n"; exit;
}
} else {
print "scaffold name is not correct, it contain unexpected symbol '_' in user-defined transcription name\n"; exit;
}
}
if ( -e "$input/$name/$break/final_read_mapped_info" ) { # judge whether final_read_mapped_info present
my $num_span = 0; # number of spanning reads (one mapped to geneA, the other mapped to geneB)
my $num_split_dis = 0; # number of discordant_split read (one mapped to scaffold, the other mapped to geneA / geneB)
my $num_split_sig = 0; # number of singlton_split read ( one mapped to scaffold, the other shows no mapping)
my $num_split_dis_sig = 0; # number of singlton_split => discordant_split reads during genome mapping process
if (! -z "$input/$name/$break/final_read_mapped_info" ) { # judge whether final_read_mapped_info is not zero
$num_span = `grep -c -P '^spanning' $input/$name/$break/final_read_mapped_info`; chomp $num_span;
$num_split_dis = `grep -c -P '^discordant_split' $input/$name/$break/final_read_mapped_info`; chomp $num_split_dis;
$num_split_dis_sig = `grep -c -P '^singlton_to_discordant_split' $input/$name/$break/final_read_mapped_info`; chomp $num_split_dis_sig;
$num_split_sig = `grep -c -P '^singlton_split' $input/$name/$break/final_read_mapped_info`; chomp $num_split_sig;
}
$summary{$name}{$break} = [$num_split_dis, $num_split_dis_sig, $num_split_sig, $num_span];
#print "Sample_Break: $name\t$break\t$num_split_dis, $num_split_dis_sig, $num_split_sig, $num_span\n";
}
}
}
close BK;
}
}
close INPUT;
}
# run statistical test for balance mapping of breakpoint scaffold
my %summary_pvalue; # collect [num_read_upstream, num_read_downstream, pvalue] per one breakpoint scaffold
foreach my $break ( keys %break_num ) { # $break represent name of unique breakpoint
my @array = sort {$a <=> $b} keys %{$break_num{$break}}; # Get all the key %{$break_num{$break}} (scaffold sequence length);
my $upstream = 0; my $downstream = 0; my $middlestream = 0;
my $fisher_pvalue = "NULL"; my $chisq_pvalue = "NULL";
if ( scalar(@array) == 1 ) { # only one unique scaffold sequence, indicating samples with the same read length
# cp the scaffold sequence to the output folder
`cp $break_num{$break}{$array[0]}[2] $output/$break`;
if (! -e "$output/$break/All_sample_split_1.txt" ) { `cat <> $output/$break/All_sample_split_1.txt`; `cat <> $output/$break/All_sample_split_1.txt`; } # NULL
if (! -z "$output/$break/All_sample_split_1.txt" ) {
my $build_flag = `hisat2-build -p 1 -f $output/$break/scaffold_*_seq.fa $output/$break/index`; # build index file
my $final_flag = system("hisat2 -p 1 --no-unal --no-spliced-alignment --no-softclip -x $output/$break/index -q -1 $output/$break/All_sample_split_1.txt -2 $output/$break/All_sample_split_2.txt -S $output/$break/All_sample_noclip.sam");
$final_flag == 0 or die "Hisat2 map to GeneA-GeneB-scaffold for all concatenated samples fails\n";
# transform sam format to sam format using samtools
`samtools view $output/$break/All_sample_noclip.sam -bS > $output/$break/All_sample_noclip.bam`;
`samtools sort $output/$break/All_sample_noclip.bam -o $output/$break/All_sample_noclip.sorted.bam`;
`samtools index $output/$break/All_sample_noclip.sorted.bam`;
`rm $output/$break/All_sample_noclip.bam`;
`rm $output/$break/index*`;
# set the three parameters
# transfer the three parameters: scaffold_length, the start position of breakpoint site, read_length, output_path
($upstream, $middlestream, $downstream, $fisher_pvalue, $chisq_pvalue) = &access_distribution($array[0], $break_num{$break}{$array[0]}[0], $break_num{$break}{$array[0]}[1], "$output/$break");
}
} else { # there are more than one scaffold sequence, indicating samples with various read length
`cp $break_num{$break}{$array[-1]}[2] $output/$break`; # put the longest scaffold as reference candidate
if (! -e "$output/$break/All_sample_split_1.txt" ) { `cat <> $output/$break/All_sample_split_1.txt`; `cat <> $output/$break/All_sample_split_1.txt`; } # NULL
if (! -z "$output/$break/All_sample_split_1.txt" ) {
my $build_flag = `hisat2-build -p 1 -f $output/$break/scaffold_*_seq.fa $output/$break/index`; # build index file
my $final_flag = system("hisat2 -p 1 --no-unal --no-spliced-alignment --no-softclip -x $output/$break/index -q -1 $output/$break/All_sample_split_1.txt -2 $output/$break/All_sample_split_2.txt -S $output/$break/All_sample_noclip.sam");
$final_flag == 0 or die "Hisat2 map to GeneA-GeneB-scaffold for all concatenated samples fails\n";
# transform sam format to sam format using samtools
`samtools view $output/$break/All_sample_noclip.sam -bS > $output/$break/All_sample_noclip.bam`;
`samtools sort $output/$break/All_sample_noclip.bam -o $output/$break/All_sample_noclip.sorted.bam`;
`samtools index $output/$break/All_sample_noclip.sorted.bam`;
`rm $output/$break/All_sample_noclip.bam`;
`rm $output/$break/index*`;
# set the three parameters
# transfer the three parameters: scaffold_length, the start position of breakpoint site, read_length, output_path
($upstream, $middlestream, $downstream, $fisher_pvalue, $chisq_pvalue) = &access_distribution($array[-1], $break_num{$break}{$array[-1]}[0], $break_num{$break}{$array[0]}[1], "$output/$break");
}
}
open (OUT, ">$output/$break/p.value") || die "cannot output pvalue:$!\n";
print OUT "Num_mapped_upstream\tNum_mapped_downstream\tp.value(Fisher_test)\tp.value(Chisq_test)\n";
print OUT "$upstream\t$downstream\t$fisher_pvalue\t$chisq_pvalue\n";
close OUT;
$summary_pvalue{$break} = [$upstream, $downstream, $fisher_pvalue, $chisq_pvalue, $middlestream];
}
# final print the output #
open (OUT, ">$output/summary.txt") || die "cannot open output file:$!\n";
print OUT "Sample";
foreach my $break ( sort {$a cmp $b} keys %summary_pvalue ) { print OUT "\t$break"; }
print OUT "\n";
foreach my $name ( sort {$a cmp $b} keys %summary ) {
print OUT "$name";
foreach my $break ( sort {$a cmp $b} keys %{$summary{$name}} ) {
print OUT "\t$summary{$name}{$break}[0]", "+", "$summary{$name}{$break}[1],", "$summary{$name}{$break}[2];", "$summary{$name}{$break}[3]";
}
print OUT "\n";
}
print OUT "Statistics_summary(split_reads)";
foreach my $break ( sort {$a cmp $b} keys %summary_pvalue ) {
print OUT "\t$summary_pvalue{$break}[0]|$summary_pvalue{$break}[1]: $summary_pvalue{$break}[2](fisher exact test), $summary_pvalue{$break}[3](chisq test)";
}
print OUT "\n";
################
# sub function #
################
sub access_distribution {
my ($scaffold_length, $break_start, $read_length, $path) = @_;
my %collect; # collect the read names
# Load sam file
open (IN, "awk -F '\t' -v OFS='\t' '(\$8>0)' $path/All_sample_noclip.sam |") || die "cannot run awk scaffold recruit script:$!\n";
while ( <IN> ) {
my ($name, $one_match, $one_pos, $other_match, $other_pos, $seq) = (split /\t/, $_)[0, 2, 3, 6, 7, 9];
$name =~s/\/[\w\:\-]+$//g; $name =~s/\s[\w\:\-]+$//g;
if ( $one_match eq "scaffold" ) {
push @{$collect{$name}}, [$one_pos, $seq];
}
}
close IN;
my %ref; # statistics on read mapping bias in scaffold
foreach my $name ( keys %collect ) {
if ( scalar(@{$collect{$name}}) == 1 ) {
my $middle = $collect{$name}[0][0] + length($collect{$name}[0][1])/2; # the middle position of reads
if ( $middle > $break_start ) {
push @{$ref{"downstream"}}, $name;
} elsif ( $middle <= $break_start ) {
push @{$ref{"upstream"}}, $name;
}
} elsif ( scalar(@{$collect{$name}}) == 2 ) {
if ( $collect{$name}[0][0] < $collect{$name}[1][0] ) { # start point: first element < second element
my $middle = ($collect{$name}[0][0] + $collect{$name}[1][0] + length($collect{$name}[1][1]))/2;
if ( $middle > $break_start ) {
push @{$ref{"downstream"}}, $name;
} elsif ( $middle <= $break_start ) {
push @{$ref{"upstream"}}, $name;
}
} else {
my $middle = ($collect{$name}[1][0] + $collect{$name}[0][0] + length($collect{$name}[0][1]))/2;
if ( $middle > $break_start ) {
push @{$ref{"downstream"}}, $name;
} elsif ( $middle <= $break_start ) {
push @{$ref{"upstream"}}, $name;
}
}
}
}
#run statistical anaylses
my $up = 0; my $down = 0; my $middle = 0; my $p;
if ( exists($ref{"upstream"}) ) {
if ( exists($ref{"downstream"}) ) {
$up = scalar(@{$ref{"upstream"}});
$down = scalar(@{$ref{"downstream"}});
} else {
$up = scalar(@{$ref{"upstream"}});
}
} else {
if ( exists($ref{"downstream"}) ) {
$down = scalar(@{$ref{"downstream"}});
}
}
if ( exists($ref{"middle"}) ) { $middle = scalar(@{$ref{"middle"}}); }
if ( $up == 0 && $down == 0 ) { #if no reads mapped to scaffold, please cancel the statistical test
return($up, $middle, $down, "NA", "NA"); # please reture NA for pvalue
} else {
`echo "a=$up; b=$down; a1=round(($up+$down)*(($break_start-1)/$scaffold_length), digits=0); b1=round(($up+$down)*(1-($break_start-1)/$scaffold_length), digits=0); ab=matrix(c(a,b,a1,b1),nrow=2); c=fisher.test(ab); write.table(c\$ p.value, file='$path/fisher_p.value', row.names=F, col.names=F); sessionInfo()" | R --vanilla --slave`;
my $fisher_p = `head -n 1 $path/fisher_p.value`; chomp $fisher_p;
if ( ($up + $down) > 9 ) {
`echo "a=$up; b=$down; a1=round(($up+$down)*(($break_start-1)/$scaffold_length), digits=1); b1=round(($up+$down)*(1-($break_start-1)/$scaffold_length), digits=1); c=chisq.test(c(a,b), p=c(a1,b1), rescale.p=T); write.table(c\$ p.value, file='$path/chisq_p.value', row.names=F, col.names=F); sessionInfo()" | R --vanilla --slave`;
} else {
`echo "a=$up; b=$down; a1=round(($up+$down)*(($break_start-1)/$scaffold_length), digits=1); b1=round(($up+$down)*(1-($break_start-1)/$scaffold_length), digits=1); c=chisq.test(c(a,b), p=c(a1,b1), rescale.p=T, simulate.p.value=T, B=100000); write.table(c\$ p.value, file='$path/chisq_p.value', row.names=F, col.names=F); sessionInfo()" | R --vanilla --slave`;
}
my $chisq_p = `head -n 1 $path/chisq_p.value`; chomp $chisq_p;
return($up, $middle, $down, $fisher_p, $chisq_p);
}
}
#----- End ------#