forked from 4ureliek/TEanalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTE-analysis_Shuffle_bed.pl
executable file
·745 lines (683 loc) · 38.3 KB
/
TE-analysis_Shuffle_bed.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
#!/usr/bin/perl
#######################################################
# Author : Aurelie Kapusta (https://github.com/4ureliek), with the help of Edward Chuong
# email : [email protected]
# Purpose : Writen to test enrichment of TEs in a set of simple features (ChIP-seq for example)
#######################################################
use strict;
use warnings;
use Carp;
use Getopt::Long;
use Bio::SeqIO;
use Data::Dumper;
use vars qw($BIN);
use Cwd 'abs_path';
BEGIN {
$BIN = abs_path($0);
$BIN =~ s/(.*)\/.*$/$1/;
unshift(@INC, "$BIN/Lib");
}
use TEshuffle;
use Statistics::R; #required to get the Binomial Test p-values
#-------------------------------------------------------------------------------
#------------------------------- DESCRIPTION -----------------------------------
#-------------------------------------------------------------------------------
#flush buffer
$| = 1;
my $VERSION = "4.4.2";
my $SCRIPTNAME = "TE-analysis_Shuffle_bed.pl";
my $CHANGELOG;
set_chlog();
sub set_chlog {
$CHANGELOG = "
# - v1.0 = Mar 2016
# based on TE-analysis_Shuffle_v3+.pl, v3.3,
# but adapted to more general input files = bed file corresponding to any features to test.
# - v2.0 = Mar 2016
# attempt of making this faster by removing any length info and allowing overlaps
# of shuffled features (since they are independent tests, it's OK)
# Also added the possibility of several files to -e and -i
# - v2.1 = Oct 2016
# remove empty column of length info from output
# get enrichment by age categories if age file provided
# bug fix for total counts of hit features when upper levels (by class or family, by Rname was probably OK)
# Changes in stats, bug fix; use R for the binomial test
# - v3.0 = Oct 25 2016
# TEshuffle.pm for subroutines shared with the shuffle_tr script
# - v4.0 = Nov 28 2016
# different choices to shuffle the TEs:
# shufflebed = completely random positions, but same chromosome
# shuffle inside the current TE positions, same chromosome
# shuffle each TE, keeping its distance to a TSS, same chromosome - thanks to: Cedric Feschotte, Ed Chuong
# make subfolders for each input file (for the shuffled outputs)
# - v4.1 = Nov 30 2016
# Minor bug fix: when random was 0, values were not reported
# (still interesting to see them in the obs, even if no stats possible)
# - v4.2 = Dec 01 2017
# Bug fix for when long TEs were shuffled to the position of small TEs that are
# too close to the start of the genomic sequence (led to negative starts).
# This is now checked for -s rm and -s tss:
# for -s rm, the TE is shifted of as many bp as needed
# for -s tss the start is simply changed to 1 (to avoid having a TE placed closer to a tss)
# Also added the option to use -r file in -s rm as well (to check ends) and chift the TE if needed.
# - v4.3 = Jan 23 2018
# Avoid passing to subs variables that don't need to + use upercase syntax
# Cleaned up a few things
# Bug fix for cases when the 4th col of the bed file is not a unique ID => identifier is now a cat of cols 1 to 4
# - v4.4 = Mar 08 2018
# Minor changes in logs and check options
# Was assuming a 5 columns bed file => fix
# Change for -s tss: if the TE ends up out of the scaffold/chr, put on the other side and if still out, shift it to be inside
# Also, skip if not in the annotation file
# Make -r mandatory for all
# - v4.4.1 = Oct 02 2018
# Change how the id is set in sub check_for_overlap, to avoid uninitialized errors if the input is a 3 columns bed file
# Citation edit in the usage
# Clarify option -d for the -s bed
# - v4.4.2 = Aug 05 2019
# Small edits in TEshuffle::RMtobed
# Load Statistics::R after the Lib path is set
# Fixed some uninitialized values for the total count of observed when overlap is only seen in the expected
#TO DO: add the -keep option.
\n";
return 1;
}
my $USAGE;
set_usage();
sub set_usage {
$USAGE = "
Synopsis (v$VERSION):
perl $SCRIPTNAME -f features.bed [-o <nt>] -q features_to_shuffle [-n <nb>] -s shuffling_type -r <genome.sizes> [-b]
[-a <annotations>]
[-e <genome.gaps>] [-d] [-i <include.range>] [-x]
[-w <bedtools_path>] [-l <if_nonTE>] [-t <filterTE>] [-c] [-g <TE.age.tab>] [-v] [-h]
/!\\ REQUIRES: Bedtools, at least v18 (but I advise updating up to the last version; v26 has a better shuffling)
/!\\ Previous outputs, if any, will be moved as *.previous [which means previous results are only saved once]
Typically, for the 3 types, the mandatory arguments are:
perl $SCRIPTNAME -f features.bed -q rm.out -r genome.sizes -s rm
perl $SCRIPTNAME -f features.bed -q rm.out -r genome.sizes -s tss -a annotations.gtf
perl $SCRIPTNAME -f features.bed -q rm.out -r genome.sizes -s bed -e genome.gaps
CITATION:
- Include the version of the script + link to the GitHub page (https://github.com/4ureliek/TEanalysis)
- Cite Kapusta et al. (2013) PLoS Genetics (DOI: 10.1371/journal.pgen.1003470)
(http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003470)
If relevant or space allows, please cite Trizzani, Kapusta and Brown (2018) BMC Genomics
(https://doi.org/10.1186/s12864-018-4850-3), for which the script was updated
- For BEDtools, please cite the version + Quinlan AR and Hall IM (2010) Bioinformatics
(https://doi.org/10.1093/bioinformatics/btq033)
DESCRIPTION:
Features provided in -s will be overlapped with -f file (which must be simple intervals in bed format),
without (no_boot) or with (boot) shuffling (on same chromosome)
One feature may overlap with several repeats and all are considered.
However, if there are several fragments of the same repeat in a feature, it will be counted
only one time. Could be an issue for larger features, but otherwise can't normalize by total
number of features.
There are 3 options for shuffling, but all will shuffle on the same chromosome:
- with bedtool shuffle (=> random position)
[will require files for -r and -e that can be generated by this script, see below]
- shuffle among the current TE positions (still random, but less)
- shuffle the TEs keeping their distance to the closest TSS of an annotation file provided
[will require ensembl gtf, or gencode gtf or gff3 annotations]
Note that because TEs are often fragmented + there are inversions, the counts of TEs are likely inflated;
this also means that when TEs are shuffled, there are more fragments than TEs. Some should be moved non independently,
or the input file should be corrected when possible to limit that issue
[not implemented in this script for now, but you may edit the RM file to merge TE fragments]
Two-tailed permutation test and a binomial test are done on the counts of overlaps.
The results are in a .stats.txt file. Note that high bootstraps takes a lot of time.
Note that for low counts, expected and/or observed, stats likely don't mean much.
MANDATORY ARGUMENTS:
-f,--feat => (STRING) ChIPseq peaks, chromatin marks, etc, in bed format
/!\\ Script assumes no overlap between features
-q,--query => (STRING) Features to shuffle = TE file
For now, can only be the repeat masker .out or the .bed file generated by the TE-analysis_pipeline script
See -l and -t for filters, and -g for age data
-s,--shuffle => (STRING) Shuffling type. Should be one of the following:
-s bed => use bedtools shuffle, random position on the same chromosome
-s rm => shuffle inside the current TE positions; still random, but less; also, with this
the stats won't be affected with overall depletion or enrichment of TEs in the -f file
-s tss => shuffle the TEs on the same chromosome keeping their distance
to the closest TSS of an annotation file provided
(TSS are shuffled and then assigned to a TE;
same TSS will be assigned multiple TEs if fewer TSS than TEs).
Thanks to: Edward Chuong & Cedric Feschotte for the suggestion
-r,--range => (STRING) To know the maximum value in a given chromosome/scaffold.
File should be: sequence_name \\t length
Can be files from UCSC, files *.chrom.sizes
If you don't have such file, use -b (--build) and provide the genome fasta file for -r
MANDATORY ARGUMENTS IF USING -s tss:
-a,--annot => (STRING) gtf or gff; annotations to load all unique TSS: will be used to set the distance
between each TE and the closest TSS that will be kept while randomization
Note that it requires transcript lines
MANDATORY ARGUMENTS IF USING -s bed:
-e,--excl => (STRING) This will be used as -excl for bedtools shuffle: \"coordinates in which features from -i should not be placed.\"
More than one file may be provided (comma separated), they will be concatenated
(in a file = first-file-name.cat.bed).
By default, at least one file is required = assembly gaps, and it needs to be the first file
if not in bed format. Indeed, you may provide the UCSC gap file, with columns as:
bin, chrom, chromStart, chromEnd, ix, n, size, type, bridge
it will be converted to a bed file.
If you do nothave this file, you may provide the genome file in fasta format
and add the option -d (--dogaps), to generate a bed file corresponding to assembly gaps.
If you need to generate the <genome.gaps> file but you would also like to add more files to the -e option,
just do a first run with no bootstraps (in this example the genome.range is also being generated):
perl ~/bin/$SCRIPTNAME -f input.bed -q genome.out -s rm -r genome.fa -b -e genome.fa -d -n 0
Other files may correspond to regions of low mappability, for example for hg19:
http://www.broadinstitute.org/~anshul/projects/encode/rawdata/blacklists/hg19-blacklist-README.pdf
Notes: -> when the bed file is generated by this script, any N stretch > 50nt will be considered as a gap
(this can be changed in the load_gap subroutine)
-> 3% of the shuffled feature may overlap with these regions
(this can be changed in the shuffle subroutine).
OPTIONAL ARGUMENTS IF USING -s bed:
-d,--dogaps => (BOOL) If no available gap file, give the genome fasta file for -e, and
set this option. The script will create the gaps and use it; but
if several files in -e, then the genome needs to be the first one.
This step is not optimized, it will take a while.
-i,--incl => (STRING) To use as -incl for bedtools shuffle: \"coordinates in which features from -i should be placed.\"
Bed of gff format. Could be intervals close to TSS for example.
More than one file (same format) may be provided (comma separated),
they will be concatenated (in a file = first-file-name.cat.bed)
-x,--x => (BOOL) to add the -noOverlapping option to the bedtools shuffle command line,
and therefore NOT allow overlaps between the shuffled features.
This may create issues mostly if -i is used (space to shuffle may be too small to shuffle features)
OTHER OPTIONAL ARGUMENTS (for all three -s):
-b,--build => (BOOL) See above; use this and provide the genome fasta file if no range/lengths file (-r)
This step may take a while but will create the required file
-o,--overlap => (INT) Minimal length (in nt) of intersection in order to consider the TE included in the feature.
Default = 10 (to match the TEanalysis-pipeline.pl)
-n,--nboot => (STRING) number of bootsraps with shuffled -s file
Default = 10 for faster runs; use higher -n for good pvalues
(-n 10000 is best for permutation test but this will take a while)
If set to 0, no bootstrap will be done (this would be to get the observed values / test run)
-w,--where => (STRING) if BEDtools are not in your path, provide path to BEDtools bin directory
OPTIONAL ARGUMENTS FOR TE FILTERING (for all -s):
-l,--low => (STRING) To set the behavior regarding non TE sequences: all, no_low, no_nonTE, none
-l all = keep all non TE sequences (no filtering)
-l no_low [default] = keep all besides low_complexity and simple_repeat
-l no_nonTE = keep all except when class = nonTE
-l none = everything is filtered out (nonTE, low_complexity, simple_repeat, snRNA, srpRNA, rRNA, tRNA/tRNA, satellite)
-t,--te => (STRING) <type,name>
run the script on only a subset of repeats. Not case sensitive.
The type can be: name, class or family and it will be EXACT MATCH unless -c is chosen as well
ex: -t name,nhAT1_ML => only fragments corresponding to the repeat named exactly nhAT1_ML will be looked at
-t class,DNA => all repeats with class named exactly DNA (as in ...#DNA/hAT or ...#DNA/Tc1)
-t family,hAT => all repeats with family named exactly hAT (so NOT ...#DNA/hAT-Charlie for example)
-c,--contain => (BOOL) to check if the \"name\" determined with -filter is included in the value in Repeat Masker output, instead of exact match
ex: -t name,HERVK -c => all fragments containing HERVK in their name
-t family,hAT -c => all repeats with family containing hAT (...#DNA/hAT, ...#DNA/hAT-Charlie, etc)
-g,--group => (STRING) provide a file with TE age:
Rname Rclass Rfam Rclass/Rfam %div(avg) lineage age_category
At least Rname and lineage are required (other columns can be \"na\"),
and age_category can be empty. But if age_category has values, it will
be used as well. Typically:
TE1 LTR ERVL-MaLR LTR/ERVL-MaLR 24.6 Eutheria Ancient
TE2 LTR ERVL-MaLR LTR/ERVL-MaLR 9.9 Primates LineageSpe
OPTIONAL ARGUMENTS (GENERAL):
-v,--version => (BOOL) print the version
-h,--help => (BOOL) print this usage
\n";
return 1;
}
#-------------------------------------------------------------------------------
#------------------------------ LOAD AND CHECK ---------------------------------
#-------------------------------------------------------------------------------
my ($IN,$SHUFFLE,$STYPE,$TSSFILE,$EXCLUDE,$DOGAPS,$BUILD,$DOBUILD,$F_REGEX,$ALLOW,$NOOVERLAPS,$V,$HELP);
my $INTERS = 10;
my $NBOOT = 10;
my $INCL = "na";
my $NONTE = "no_low";
my $FILTER = "na";
my $TEAGE = "na";
my $BEDTOOLS = "";
my $KEEP = "none";
GetOptions(
'feat=s' => \$IN,
'query=s' => \$SHUFFLE,
'shuffle=s' => \$STYPE,
'overlap=s' => \$INTERS,
'nboot=s' => \$NBOOT,
'annot=s' => \$TSSFILE,
'range=s' => \$BUILD,
'build' => \$DOBUILD,
'excl=s' => \$EXCLUDE,
'dogaps' => \$DOGAPS,
'incl=s' => \$INCL,
'x' => \$NOOVERLAPS,
'low=s' => \$NONTE,
'te=s' => \$FILTER,
'contain' => \$F_REGEX,
'group=s' => \$TEAGE,
'where=s' => \$BEDTOOLS,
'version' => \$V,
'help' => \$HELP,);
#Check options, if files exist, etc
check_options();
sub check_options {
die "\n --- $SCRIPTNAME version $VERSION\n\n" if ($V);
die $USAGE if ($HELP);
die "\n SOME MANDATORY ARGUMENTS MISSING, CHECK USAGE:\n$USAGE" if (! $IN || ! $SHUFFLE || ! $STYPE || ! $BUILD);
die "\n -f $IN is not a bed file?\n\n" unless ($IN =~ /\.bed$/);
die "\n -f $IN does not exist?\n\n" if (! -e $IN);
die "\n -q $SHUFFLE is not in a proper format? (not .out, .bed, .gff or .gff3)\n\n" unless ($SHUFFLE =~ /\.out$/ || $SHUFFLE =~ /\.bed$/ || $SHUFFLE =~ /\.gff$/ || $SHUFFLE =~ /\.gff3$/);
die "\n -q $SHUFFLE does not exist?\n\n" if (! -e $SHUFFLE);
die "\n -s $STYPE should be one of the following: bed, rm or tss\n\n" if ($STYPE ne "bed" && $STYPE ne "rm" && $STYPE ne "tss");
die "\n -r $BUILD does not exist?\n\n" if (! -e $BUILD);
#deal with conditional mandatory stuff
if ($STYPE eq "tss") {
die "\n -s tss was set, but -a is missing?\n\n" if (! $TSSFILE);
die "\n -a $TSSFILE does not exist?\n\n" if ($TSSFILE && ! -e $TSSFILE);
} elsif ($STYPE eq "bed") {
die "\n -s bed was set, but -e is missing?\n\n" if (! $EXCLUDE);
die "\n -e $EXCLUDE does not exist?\n\n" if ($EXCLUDE !~ /,/ && ! -e $EXCLUDE); #if several files, can't check existence here
die "\n -i $INCL does not exist?\n\n" if ($INCL ne "na" && $INCL !~ /,/ && ! -e $INCL); #if several files, can't check existence here
}
#Now the rest
die "\n -n $NBOOT but should be an integer\n\n" if ($NBOOT !~ /\d+/);
die "\n -i $INTERS but should be an integer\n\n" if ($INTERS !~ /\d+/);
die "\n -w $BEDTOOLS does not exist?\n\n" if ($BEDTOOLS ne "" && ! -e $BEDTOOLS);
die "\n -t requires 2 values separated by a coma (-t <name,filter>; use -h to see the usage)\n\n" if ($FILTER ne "na" && $FILTER !~ /,/);
die "\n -g $TEAGE does not exist?\n\n" if ($TEAGE ne "na" && ! -e $TEAGE);
($DOGAPS)?($DOGAPS = "y"):($DOGAPS = "n");
($DOBUILD)?($DOBUILD = "y"):($DOBUILD = "n");
($F_REGEX)?($F_REGEX = "y"):($F_REGEX="n");
$BEDTOOLS = $BEDTOOLS."/" if ($BEDTOOLS ne "" && substr($BEDTOOLS,-1,1) ne "/"); #put the / at the end of path if not there
($NOOVERLAPS)?($NOOVERLAPS = "-noOverlapping"):($NOOVERLAPS = "");
}
#-------------------------------------------------------------------------------
#----------------------------------- MAIN --------------------------------------
#-------------------------------------------------------------------------------
#Quick log on files used
print STDERR "\n --- $SCRIPTNAME v$VERSION started, with:\n";
print STDERR " input file = $IN\n";
print STDERR " features to shuffle = $SHUFFLE\n";
print STDERR " shuffling type = $STYPE\n";
print STDERR " genome chromosome sizes = $BUILD\n";
my $BEDV = $BEDTOOLS."bedtools --version";
my $BEDVER = `$BEDV`;
chomp $BEDVER;
print STDERR " bedtools version = $BEDVER\n";
#Prep outputs
print STDERR " --- prepping output directories and files\n";
my $DIR = $IN.".shuffle-".$STYPE.".".$NBOOT;
print STDERR " output directory = $DIR\n";
my ($STATS,$DISTRIB,$OUT,$OUTB,$TEMP) = TEshuffle::prep_out("bed",$DIR,$NBOOT,$FILTER,$IN,$STYPE,$NONTE,$KEEP);
#Chosomosome sizes / Genome range
my ($OKSEQ,$BUILD_FILE);
print STDERR " --- loading build (genome range)\n";
($OKSEQ,$BUILD_FILE) = TEshuffle::load_build($BUILD,$DOBUILD);
#prep steps if shuffling type is bed
my $EXCL;
if ($STYPE eq "bed") {
#Files to exclude for shuffling
print STDERR " --- getting ranges to exclude in the shuffling of features from $EXCLUDE\n";
my @exclude = ();
if ($EXCLUDE =~ /,/) {
($DOGAPS eq "y")?(print STDERR " several files provided, -d chosen, genome file (fasta) should be the first one\n"):
(print STDERR " several files provided, assembly gaps should be the first one\n");
@exclude = split(",",$EXCLUDE) if ($EXCLUDE =~ /,/);
} else {
$exclude[0] = $EXCLUDE;
}
$exclude[0] = TEshuffle::load_gap($exclude[0],$DOGAPS);
print STDERR " concatenating files for -e\n" if ($EXCLUDE =~ /,/);
if ($EXCLUDE =~ /,/) {
$EXCL = TEshuffle::concat_beds(\@exclude);
} else {
$EXCL = $exclude[0];
}
#If relevant, files to include for shuffling
if ($INCL ne "na" && $INCL =~ /,/) {
print STDERR " --- concatenating $INCL files to one file\n";
my @include = split(",",$INCL);
$INCL = TEshuffle::concat_beds(\@include);
}
}
#Load TEage if any
my $AGE = ();
if ($TEAGE ne "na") {
print STDERR " --- Loading TE ages from $TEAGE\n";
$AGE = TEshuffle::load_TEage($TEAGE,$V);
}
#Now features to shuffle (need to be after in case there was $OKSEQ loaded)
print STDERR " --- checking file in -s, print in .bed if not a .bed or gff file\n";
print STDERR " filtering TEs based on filter ($FILTER) and non TE behavior ($NONTE)\n" unless ($FILTER eq "na");
print STDERR " + getting genomic counts for each repeat\n";
print STDERR " + load all TE positions in a hash (since -s is set to rm)\n" if ($STYPE eq "rm") ;
my ($TOSHUFF_F,$PARSEDRM,$RM,$RM_C) = TEshuffle::RMtobed($SHUFFLE,$OKSEQ,$FILTER,$F_REGEX,$NONTE,$AGE,"y",$STYPE); #Note: $RM and $RM_C are empty unless $STYPE eq rm
#prep steps if shuffling type is tss
my ($TSSBED,$CLOSEST,$ALLTSS);
if ($STYPE eq "tss") {
#sort TEs
my $bedsort = $BEDTOOLS."bedtools sort";
my $sorted = $TOSHUFF_F;
$sorted =~ s/\.bed$/\.sorted\.bed/;
if (! -e $sorted) {
print STDERR " --- sorting features of $TOSHUFF_F\n";
print STDERR " $bedsort -i $TOSHUFF_F > $sorted\n";
`$bedsort -i $TOSHUFF_F > $sorted` ;
}
$TOSHUFF_F = $sorted;
print STDERR " --- loading the tss from $TSSFILE\n";
#print the tss in a bed file => use bedtools closest
($TSSBED,$ALLTSS) = TEshuffle::load_and_print_tss($TSSFILE);
print STDERR " --- sorting features in the tss file\n" unless (-e "$TSSBED.bed");
print STDERR " $bedsort -i $TSSBED > $TSSBED.bed\n" unless (-e "$TSSBED.bed");
`$bedsort -i $TSSBED > $TSSBED.bed` unless (-e "$TSSBED.bed");
$TSSBED = $TSSBED.".bed";
#get the closest tss if relevant
my $tssclosest = $TOSHUFF_F.".closest.tss";
my $closestbed = $BEDTOOLS."closestBed";
print STDERR " --- getting closest tss for each feature in $TOSHUFF_F, with the command line below\n";
print STDERR " $closestbed -a $TOSHUFF_F -b $TSSBED -D b -t first > $tssclosest\n"; #I want only one entry per TE, therefore -t first
`$closestbed -a $TOSHUFF_F -b $TSSBED -D b -t first > $tssclosest`;
print STDERR " --- loading distance to TSS\n";
$CLOSEST = TEshuffle::load_closest_tss($tssclosest);
}
#Get total number of features in input file (= counting number of lines with stuff in it)
print STDERR " --- getting number and length of input\n";
my %IN_FEAT = ();
get_features_info($IN);
print STDERR " number of features = $IN_FEAT{'nb'}\n";
#Join -i file with -s
my $INTERSectBed = $BEDTOOLS."intersectBed";
print STDERR " --- intersecting with command lines:\n";
print STDERR " $INTERSectBed -a $TOSHUFF_F -b $IN -wo > $DIR/no_boot.joined\n";
`$INTERSectBed -a $TOSHUFF_F -b $IN -wo > $DIR/no_boot.joined`;
#Process the joined files
print STDERR " --- checking intersections of $IN with features in $TOSHUFF_F (observed)\n";
my $OBS = ();
$OBS = check_for_overlap("$DIR/no_boot.joined","no_boot",$OUT,$OBS);
#Now bootstrap runs
print STDERR " --- running $NBOOT bootstraps now (to get significance of the overlaps)\n";
if ($STYPE eq "rm") {
print STDERR " with intersect command line similar to the one above,\n";
print STDERR " and TEs shuffled among ALL TE positions, keeping length info\n";
} elsif ($STYPE eq "tss") {
print STDERR " with intersect command line similar to the one above,\n";
print STDERR " and TEs shuffled keeping same distance to TSS (using $TSSBED)\n";
} elsif ($STYPE eq "bed") {
print STDERR " with intersect command line similar to the one above, and shuffle command line:\n";
if ($INCL eq "na") {
print STDERR " ".$BEDTOOLS."shuffleBed -i $TOSHUFF_F -excl $EXCL -f 2 $NOOVERLAPS -g $BUILD -chrom -maxTries 10000\n";
} else {
print STDERR " ".$BEDTOOLS."shuffleBed -incl $INCL -i $TOSHUFF_F -excl $EXCL -f 2 $NOOVERLAPS -g $BUILD -chrom -maxTries 10000\n";
}
}
my $BOOTS = ();
if ($NBOOT > 0) {
foreach (my $i = 1; $i <= $NBOOT; $i++) {
print STDERR " ..$i bootstraps done\n" if (($i == 10) || ($i == 100) || ($i == 1000) || (($i > 1000) && (substr($i/1000,-1,1) == 0)));
my $shuffled;
$shuffled = TEshuffle::shuffle_tss($TOSHUFF_F,$TEMP,$i,$ALLTSS,$CLOSEST,$OKSEQ) if ($STYPE eq "tss");
$shuffled = TEshuffle::shuffle_rm($TOSHUFF_F,$TEMP,$i,$RM,$RM_C,$OKSEQ) if ($STYPE eq "rm");
$shuffled = TEshuffle::shuffle_bed($TOSHUFF_F,$TEMP,$i,$EXCL,$INCL,$BUILD_FILE,$BEDTOOLS,$NOOVERLAPS) if ($STYPE eq "bed");
`$INTERSectBed -a $shuffled -b $IN -wo > $TEMP/boot.$i.joined`;
$BOOTS = check_for_overlap("$TEMP/boot.$i.joined","boot.".$i,$OUTB,$BOOTS);
`cat $OUTB >> $OUTB.CAT.boot.txt` if (-e $OUTB);
`rm -Rf $TEMP/boot.$i.joined $shuffled`; #these files are now not needed anymore, all is stored
}
}
#Stats now
if ($NBOOT > 0) {
print STDERR " --- getting and printing counts stats\n";
print_stats();
print STDERR " Stats printed in: $STATS.txt\n";
#save disk space
print STDERR " --- saving disk space by deleting shuffled and joined files\n";
`rm -Rf $TEMP`;
}
#end
print STDERR " --- $SCRIPTNAME done\n";
print STDERR "\n";
exit;
#-------------------------------------------------------------------------------
#-------------------------------- SUBROUTINES ----------------------------------
#-------------------------------------------------------------------------------
sub get_features_info {
my $nb = `grep -c -E "\\w" $IN`;
# my $len = `cat $file | awk '{SUM += (\$3-\$2)} END {print SUM}'`; #this assumes no overlaps, trusting user for now
chomp($nb);
# chomp($len);
$IN_FEAT{'nb'} = $nb;
# $IN_FEAT{'len'} = $len;
return 1;
}
#-------------------------------------------------------------------------------
sub check_for_overlap {
#NB called in 2 situations, OBS and the bootstraps
# check_for_overlap("$TEMP/no_boot.joined","no_boot",$OUT,$INTERS,$AGE);
# $BOOTS = check_for_overlap("$TEMP/boot.$i.joined","boot.".$i,$OUTB,$INTERS,$AGE,$BOOTS);
my ($file,$fileid,$out,$counts) = @_;
my %check = ();
open(my $fh, "<$file") or confess "\n ERROR (sub check_for_overlap): could not open to read $file!\n";
LINE: while(<$fh>){
chomp(my $l = $_);
#FYI:
# chr1 4522383 4522590 1111;18.9;4.6;1.0;chr1;4522383;4522590;(190949381);-;B3;SINE/B2;(0);216;1;1923 . - chr1 4496315 4529218 [ID] [score] [strand] [overlap_len]
my @l = split(/\s+/,$l);
next LINE unless ($l[-1] >= $INTERS);
my @rm = split(";",$l[3]);
my $Rnam = $rm[9];
$Rnam =~ s/\//_/; #making sure no / in repeat name
my ($Rcla,$Rfam) = TEshuffle::get_Rclass_Rfam($Rnam,$rm[10]);
my $id = $l[6]."#".$l[7]."#".$l[8];
$id = $l[9]."#".$id if ($l[9]);
$id = $id."#".$l[11] if ($l[11]);
#Increment in the data structure, but only if feature not already counted for this category
unless ($check{$id}{'tot'}) {
$counts->{$fileid}{'tot'}{'tot'}{'tot'}{'tot'}++;
}
unless ($check{$id}{$Rcla}) {
$counts->{$fileid}{$Rcla}{'tot'}{'tot'}{'tot'}++;
}
unless ($check{$id}{$Rcla.$Rfam}) {
$counts->{$fileid}{$Rcla}{$Rfam}{'tot'}{'tot'}++;
}
unless ($check{$id}{$Rcla.$Rfam.$Rnam}) {
$counts->{$fileid}{$Rcla}{$Rfam}{$Rnam}{'tot'}++;
}
#Need to check if a feature is counted several times in the upper classes
$check{$id}{'tot'}=1;
$check{$id}{$Rcla}=1;
$check{$id}{$Rcla.$Rfam}=1;
$check{$id}{$Rcla.$Rfam.$Rnam}=1;
#Now, age categories if any
if ($TEAGE ne "na") {
if ($AGE->{$Rnam}) {
my $cat1 = $AGE->{$Rnam}[4];
my $cat2 = $AGE->{$Rnam}[5];
unless ($check{$id}{'age'}) { #easier to load tot hit with these keys for the print_out sub
$counts->{$fileid}{'age'}{'cat.1'}{'tot'}{'tot'}++;
$counts->{$fileid}{'age'}{'cat.2'}{'tot'}{'tot'}++;
}
unless ($check{$id}{$cat1}) {
$counts->{$fileid}{'age'}{'cat.1'}{$cat1}{'tot'}++;
}
if ($cat2 && ! $check{$id}{$cat2}) {
$counts->{$fileid}{'age'}{'cat.2'}{$cat2}{'tot'}++;
}
$check{$id}{'age'}=1;
$check{$id}{$cat1}=1;
$check{$id}{$cat2}=1;
} else {
print STDERR " WARN: $Rnam not in $TEAGE?\n" unless ($check{$Rnam});
$check{$Rnam} = 1;
}
}
}
close ($fh);
#Print only if observed; the hash for bootstraps is incrementing
if ($fileid eq "no_boot") {
print STDERR " printing details in $out\n";
print_out($counts,$fileid,$out);
}
return $counts;
}
#-------------------------------------------------------------------------------
sub print_out {
my ($counts,$fileid,$out) = @_;
foreach my $Rclass (keys %{$counts->{$fileid}}) {
print_out_sub($fileid,$Rclass,"tot","tot",$counts,$out.".Rclass") if ($Rclass ne "age");
foreach my $Rfam (keys %{$counts->{$fileid}{$Rclass}}) {
print_out_sub($fileid,$Rclass,$Rfam,"tot",$counts,$out.".Rfam") if ($Rclass ne "age");
foreach my $Rname (keys %{$counts->{$fileid}{$Rclass}{$Rfam}}) {
print_out_sub($fileid,$Rclass,$Rfam,$Rname,$counts,$out.".Rname") if ($Rclass ne "age");
print_out_sub($fileid,$Rclass,$Rfam,$Rname,$counts,$out.".age1") if (($Rclass eq "age") && ($Rfam eq "cat.1"));
print_out_sub($fileid,$Rclass,$Rfam,$Rname,$counts,$out.".age2") if (($Rclass eq "age") && ($Rfam eq "cat.2"));
}
}
}
return 1;
}
#-------------------------------------------------------------------------------
sub print_out_sub {
my ($fileid,$key1,$key2,$key3,$counts,$out) = @_;
my $tothit = $counts->{$fileid}{'tot'}{'tot'}{'tot'}{'tot'};
my $hit = $counts->{$fileid}{$key1}{$key2}{$key3}{'tot'};
my $unhit = $IN_FEAT{'nb'} - $hit;
# my $len = $counts->{$fileid}{$key1}{$key2}{$key3}{'len'}{'tot'};
open (my $fh, ">>", $out) or confess "ERROR (sub print_out_sub): can't open to write $out $!\n";
#fileid, class, fam, name, hits, total features loaded, unhit feat, total feat hit all categories
print $fh "$fileid\t$key1\t$key2\t$key3\t$hit\t$IN_FEAT{'nb'}\t$unhit\t$tothit\n";
close $fh;
return 1;
}
#-------------------------------------------------------------------------------
sub print_stats {
#get the bootstrapped avg values, sd, agregate all values
my $exp = get_stats_data();
$exp = TEshuffle::binomial_test_R($exp,"bed");
#now print; permutation test + binomial test with avg lengths
my $midval = $NBOOT/2;
open (my $fh, ">", $STATS.".txt") or confess "ERROR (sub print_stats): can't open to write $STATS.txt $!\n";
print $fh "#Script $SCRIPTNAME, v$VERSION\n";
print $fh "#Aggregated results + stats\n";
print $fh "#Features in input file (counts):\n\t$IN_FEAT{'nb'}\n";
print $fh "#With $NBOOT bootstraps for exp (expected); sd = standard deviation; nb = number; len = length; avg = average\n";
print $fh "#Two tests are made (permutation and binomial) to assess how significant the difference between observed and random, so two pvalues are given\n";
print $fh "#For the two tailed permutation test:\n";
print $fh "#if rank is < $midval and pvalue is not \"ns\", there are significantly fewer observed values than expected \n";
print $fh "#if rank is > $midval and pvalue is not \"ns\", there are significantly higher observed values than expected \n";
print $fh "#The binomial test is done with binom.test from R, two sided\n";
print $fh "\n#Level_(tot_means_all)\t#\t#\t#COUNTS\t#\t#\t#\t#\t#\t#\t#\t#\t#\t#\t#\t#\t#\n";
print $fh "#Rclass\tRfam\tRname\tobs_hits\t%_obs_(%of_features)\tobs_tot_hits\tnb_of_trials(nb_of_TE_in_genome)\texp_avg_hits\texp_sd\t%_exp_(%of_features)\texp_tot_hits(avg)\tobs_rank_in_exp\t2-tailed_permutation-test_pvalue(obs.vs.exp)\tsignificance\tbinomal_test_proba\tbinomial_test_95%_confidence_interval\tbinomial_test_pval\tsignificance\n\n";
foreach my $Rclass (keys %{$exp}) { #loop on all the repeat classes; if not in the obs then it will be 0 for obs values
foreach my $Rfam (keys %{$exp->{$Rclass}}) {
foreach my $Rname (keys %{$exp->{$Rclass}{$Rfam}}) {
#observed
my ($obsnb,$obsper,$obstot) = (0,0,0);
$obsnb = $OBS->{'no_boot'}{$Rclass}{$Rfam}{$Rname}{'tot'} if ($OBS->{'no_boot'}{$Rclass}{$Rfam}{$Rname}{'tot'});
$obsper = $obsnb/$IN_FEAT{'nb'}*100 unless ($obsnb == 0);
$obstot = $OBS->{'no_boot'}{'tot'}{'tot'}{'tot'}{'tot'} if ($OBS->{'no_boot'}{'tot'}{'tot'}{'tot'}{'tot'});
#expected
my $expper = 0;
my $expavg = $exp->{$Rclass}{$Rfam}{$Rname}{'avg'};
$expper = $expavg/$IN_FEAT{'nb'}*100 unless ($expavg == 0);
#stats
my $pval_nb = $exp->{$Rclass}{$Rfam}{$Rname}{'pval'};
$pval_nb = "na" if ($expavg == 0 && $obsnb == 0);
#Now print stuff
print $fh "$Rclass\t$Rfam\t$Rname\t";
print $fh "$obsnb\t$obsper\t$obstot\t";
print $fh "$PARSEDRM->{$Rclass}{$Rfam}{$Rname}\t";
print $fh "$expavg\t$exp->{$Rclass}{$Rfam}{$Rname}{'sd'}\t$expper\t$exp->{'tot'}{'tot'}{'tot'}{'avg'}\t";
my $sign = TEshuffle::get_sign($pval_nb);
print $fh "$exp->{$Rclass}{$Rfam}{$Rname}{'rank'}\t$pval_nb\t$sign\t";
#Binomial
my $b_pval = $exp->{$Rclass}{$Rfam}{$Rname}{'binom_pval'};
$b_pval = "na" unless ($b_pval =~ /\d/);
$sign = TEshuffle::get_sign($b_pval);
print $fh "$exp->{$Rclass}{$Rfam}{$Rname}{'binom_prob'}\t$exp->{$Rclass}{$Rfam}{$Rname}{'binom_conf'}\t$exp->{$Rclass}{$Rfam}{$Rname}{'binom_pval'}\t$sign\n";
}
}
}
close $fh;
return 1;
}
#-------------------------------------------------------------------------------
sub get_stats_data {
my $exp = initialize_exp(); #0 values for all the ones seen in obs => so that even if not seen in exp, will be there
#agregate data
my ($nb_c,$nb_f,$nb_r,$nb_a1,$nb_a2) = ();
foreach my $round (keys %{$BOOTS}) {
foreach my $Rclass (keys %{$BOOTS->{$round}}) {
push(@{$nb_c->{$Rclass}{'tot'}{'tot'}},$BOOTS->{$round}{$Rclass}{'tot'}{'tot'}{'tot'}) if ($Rclass ne "age");
foreach my $Rfam (keys %{$BOOTS->{$round}{$Rclass}}) {
push(@{$nb_f->{$Rclass}{$Rfam}{'tot'}},$BOOTS->{$round}{$Rclass}{$Rfam}{'tot'}{'tot'}) if ($Rclass ne "age");
foreach my $Rname (keys %{$BOOTS->{$round}{$Rclass}{$Rfam}}) {
push(@{$nb_r->{$Rclass}{$Rfam}{$Rname}},$BOOTS->{$round}{$Rclass}{$Rfam}{$Rname}{'tot'}) if ($Rclass ne "age");
push(@{$nb_a1->{$Rclass}{$Rfam}{$Rname}},$BOOTS->{$round}{$Rclass}{$Rfam}{$Rname}{'tot'}) if (($Rclass eq "age") && ($Rfam eq "cat.1"));
push(@{$nb_a2->{$Rclass}{$Rfam}{$Rname}},$BOOTS->{$round}{$Rclass}{$Rfam}{$Rname}{'tot'}) if (($Rclass eq "age") && ($Rfam eq "cat.2"));
}
}
}
}
#get avg, sd and p values now => load in new hash, that does not have the fileID
foreach my $round (keys %{$BOOTS}) {
foreach my $Rclass (keys %{$BOOTS->{$round}}) {
$exp = get_stats_data_details($Rclass,"tot","tot",$nb_c->{$Rclass}{'tot'}{'tot'},$exp) if ($Rclass ne "age");
foreach my $Rfam (keys %{$BOOTS->{$round}{$Rclass}}) {
$exp = get_stats_data_details($Rclass,$Rfam,"tot",$nb_f->{$Rclass}{$Rfam}{'tot'},$exp) if ($Rclass ne "age");
foreach my $Rname (keys %{$BOOTS->{$round}{$Rclass}{$Rfam}}) {
$exp = get_stats_data_details($Rclass,$Rfam,$Rname,$nb_r->{$Rclass}{$Rfam}{$Rname},$exp) if ($Rclass ne "age");
$exp = get_stats_data_details($Rclass,$Rfam,$Rname,$nb_a1->{$Rclass}{$Rfam}{$Rname},$exp) if (($Rclass eq "age") && ($Rfam eq "cat.1"));
$exp = get_stats_data_details($Rclass,$Rfam,$Rname,$nb_a2->{$Rclass}{$Rfam}{$Rname},$exp) if (($Rclass eq "age") && ($Rfam eq "cat.2"));
}
}
}
}
return($exp);
}
#-------------------------------------------------------------------------------
sub initialize_exp {
my $exp = ();
#obs:
#$counts->{$fileid}{'age'}{'cat.1'}{'tot'}{'tot'}
foreach my $f (keys %{$OBS}) {
foreach my $k1 (keys %{$OBS->{$f}}) {
foreach my $k2 (keys %{$OBS->{$f}{$k1}}) {
foreach my $k3 (keys %{$OBS->{$f}{$k1}{$k2}}) {
$exp->{$k1}{$k2}{$k3}{'avg'}=0;
$exp->{$k1}{$k2}{$k3}{'sd'}=0;
$exp->{$k1}{$k2}{$k3}{'p'}=0;
$exp->{$k1}{$k2}{$k3}{'rank'}="na";
$exp->{$k1}{$k2}{$k3}{'pval'}="na";
}
}
}
}
return $exp;
}
#-------------------------------------------------------------------------------
sub get_stats_data_details {
my ($key1,$key2,$key3,$agg_data,$exp) = @_;
#get average and sd of the expected
($exp->{$key1}{$key2}{$key3}{'avg'},$exp->{$key1}{$key2}{$key3}{'sd'}) = TEshuffle::get_avg_and_sd($agg_data);
my $obs = $OBS->{'no_boot'}{$key1}{$key2}{$key3}{'tot'};
# print STDERR "FYI: no observed value for {$key1}{$key2}{$key3}{'tot'}\n" unless ($obs);
$obs = 0 unless ($obs);
#Get the rank of the observed value in the list of expected + pvalue for the permutation test
my $rank = 1; #pvalue can't be 0, so I have to start there - that does mean there will be a rank nboot+1
my @data = sort {$a <=> $b} @{$agg_data};
EXP: foreach my $exp (@data) {
last EXP if ($exp > $obs);
$rank++;
}
$exp->{$key1}{$key2}{$key3}{'rank'}=$rank;
if ($rank <= $NBOOT/2) {
$exp->{$key1}{$key2}{$key3}{'pval'}=$rank/$NBOOT*2;
} else {
$exp->{$key1}{$key2}{$key3}{'pval'}=($NBOOT+2-$rank)/$NBOOT*2; #+2 so it is symetrical (around nboot+1)
}
#Binomial test
#get all the values needed for binomial test in R => do them all at once
my $n = $PARSEDRM->{$key1}{$key2}{$key3} if ($PARSEDRM->{$key1}{$key2}{$key3});
$n = 0 unless ($n);
# print STDERR " WARN: no value for total number (from RM output), for {$key1}{$key2}{$key3}? => no binomial test\n" if ($n == 0);
my $p = 0;
# print STDERR " WARN: n must be > x but n = $n and x = $obs for {$key1}{$key2}{$key3} => no binomial test\n" if ($n < $obs);
$p=$exp->{$key1}{$key2}{$key3}{'avg'}/$n unless ($n == 0); #should not happen, but could
$exp->{$key1}{$key2}{$key3}{'p'} = $p;
$exp->{$key1}{$key2}{$key3}{'n'} = $n;
$exp->{$key1}{$key2}{$key3}{'x'} = $obs;
return($exp);
}