-
Notifications
You must be signed in to change notification settings - Fork 8
/
run_get_phylomarkers_pipeline.sh
executable file
·3198 lines (2593 loc) · 157 KB
/
run_get_phylomarkers_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
#: PROGRAM: run_get_phylomarkers_pipeline.sh
#: AUTHORS: Pablo Vinuesa, Center for Genome Sciences, CCG-UNAM, Mexico
#: https://www.ccg.unam.mx/~vinuesa/
# Bruno Contreras Moreira, EEAD-CSIC, Zaragoza, Spain
# https://www.eead.csic.es/compbio
#
#: DISCLAIMER: programs of the GET_PHYLOMARKERS package are distributed in the hope that they will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
#: LICENSE: This software is freely available under the GNU GENERAL PUBLIC LICENSE v.3.0 (GPLv3)
# see https://github.com/vinuesa/get_phylomarkers/blob/master/LICENSE.txt
#
#: AVAILABILITY: freely available from GitHub @ https://github.com/vinuesa/get_phylomarkers
# and DockerHub @ https://hub.docker.com/r/vinuesa/get_phylomarkers
#
#: PROJECT START: April 2017; This is a wrapper script to automate the whole process of marker selection
# and downstream phylogenomic analyses.
#
#: AIM: select optimal molecular markers for phylogenomics and population genomics from orthologous gene clusters computed by GET_HOMOLOGUES,
# which is freely available from GitHub @ https://github.com/eead-csic-compbio/get_homologues
#
#: OUTPUT: multiple sequence alignments (of protein and DNA sequences) of selected markers, gene trees, and species trees
# inferred from gene trees (ASTRAL/ASTER), and the concatenated supermatrix of top-ranking markers,
# along with graphics and tables summarizing the results of the pipeline obtained at the different filtering steps.
#
#: MANUAL: a detailed manual and tutorial are available here: https://vinuesa.github.io/get_phylomarkers/
#
#: CITATION / PUBLICATION: If you use this software in your own publications, please cite the following paper:
# Vinuesa P, Ochoa-Sanchez LE, Contreras-Moreira B. GET_PHYLOMARKERS, a Software Package to Select Optimal Orthologous Clusters for Phylogenomics
# and Inferring Pan-Genome Phylogenies, Used for a Critical Geno-Taxonomic Revision of the Genus Stenotrophomonas.
# Front Microbiol. 2018 May 1;9:771. doi:10.3389/fmicb.2018.00771.
# eCollection 2018. PubMed PMID: 29765358; PubMed Central PMCID: PMC5938378.
# ===============================================================================================================
# DEVFLAGS: <FIXME>; <SOLVED>; <TODO>; <DONE>
# Set Bash strict mode
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
#set -x # enables debugging output to trace the execution flow; Print commands and their arguments as they are executed. Also executed as bash -x script
set -e # NOTES: all "${distrodir}"/* calls fail unless they are wrapped in the following code
# { "${distrodir}"/run_parallel_cmmds.pl faaed 'add_nos2fasta_header.pl $file > ${file}no' "$n_cores" &> /dev/null && return 0; }
# Also added extended error and return 0 calls to the functions in get_phylomarkers_fun_lib and main script
set -u
set -o pipefail
progname=${0##*/} # run_get_phylomarkers_pipeline.sh
VERSION='2.8.4.0_2024-04-20'
# Set GLOBALS
# in Strict mode, need to explicitly set undefined variables to an empty string var=''
DEBUG=0
wkdir=$(pwd) #echo "# working in $wkdir"
PRINT_KDE_ERR_MESSAGE=0
logdir=$(pwd)
dir_suffix=''
lmsg=''
gene_tree_ext=''
no_kde_outliers=''
no_kde_ok=''
lmap_sampling=2000
declare -A filtering_results_h=()
declare -a filtering_results_kyes_a=()
declare -A output_files_h=()
declare -a output_files_a=()
declare -A figs_h=()
declare -a figs_a=()
declare -A aln_models_h=()
declare -a aln_models_a=()
declare -A aln_lmappings_h=()
declare -a aln_lmappings_a=()
declare -A alns_passing_lmap_and_suppval_h=()
declare -a alns_passing_lmap_and_suppval_a=()
declare -A alnIDs2names=()
min_bash_vers=4.4 # required for:
# 1. mapfile -t array <(cmd); see SC2207
# 2. printf '%(%F)T' '-1' in print_start_time; and
# 3. passing an array or hash by name reference to a bash function (since version 4.3+),
# by setting the -n attribute
# see https://stackoverflow.com/questions/16461656/how-to-pass-array-as-an-argument-to-a-function-in-bash
DATEFORMAT_SHORT="%d%b%y"
TIMESTAMP_SHORT=$(date +${DATEFORMAT_SHORT})
DATEFORMAT_HMS="%H:%M:%S"
#TIMESTAMP_HMS=$(date +${DATEFORMAT_HMS})
TIMESTAMP_SHORT_HMS=$(date +${DATEFORMAT_SHORT}-${DATEFORMAT_HMS})
#>>> set colors in bash
# ANSI escape codes
# Black 0;30 Dark Gray 1;30
# Red 0;31 Light Red 1;31
# Green 0;32 Light Green 1;32
# Brown/Orange 0;33 Yellow 1;33
# Blue 0;34 Light Blue 1;34
# Purple 0;35 Light Purple 1;35
# Cyan 0;36 Light Cyan 1;36
# Light Gray 0;37 White 1;37
RED='\033[0;31m'
LRED='\033[1;31m'
#GREEN='\033[0;32m'
#YELLOW='\033[1;33m'
#BLUE='\033[0;34m'
#LBLUE='\033[1;34m'
#CYAN='\033[0;36m'
NC='\033[0m' # No Color => end color
#printf "${RED}%s${NC} ${GREEN}%s${NC}\n" "I like" "GitHub"
#---------------------------------------------------------------------------------#
#>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#
#---------------------------------------------------------------------------------#
function check_bash_version()
{
bash_vers=$(bash --version | head -1 | awk '{print $4}' | sed 's/(.*//' | cut -d. -f1,2)
if [ 1 -eq "$(echo "$bash_vers < $min_bash_vers" | bc)" ];
then
msg "FATAL: you are running acient bash v${bash_vers}, and version >= $min_bash_vers, is required" ERROR RED && exit 1
else
return 0
fi
}
#-----------------------------------------------------------------------------------------
function print_start_time()
{
#echo -n "[$(date +%T)] "
printf '%(%T )T' '-1' # requires Bash >= 4.3
}
#-----------------------------------------------------------------------------------------
function set_pipeline_environment()
{
if [[ "$OSTYPE" == "linux-gnu" ]]
then
local wkd scriptdir distrodir bindir OS no_cores
wkd=$(pwd)
scriptdir=$(readlink -f "${BASH_SOURCE[0]}")
distrodir=$(dirname "$scriptdir") #echo "scriptdir: $scriptdir|basedir:$distrodir|OSTYPE:$OSTYPE"
bindir="$distrodir/bin/linux"
OS='linux'
no_cores=$(awk '/^processor/{n+=1}END{print n}' /proc/cpuinfo)
#local rlibs=`for p in $(R -q -e 'print(.libPaths())'); do if [[ "$p" =~ '/' ]]; then echo -n "$p:"; fi; done; echo -n "$wkd"/"$distrodir/lib/R"`
#export R_LIBS_SITE="$rlibs"
elif [[ "$OSTYPE" == "darwin"* ]]
then
# get abs path of script as in
# https://stackoverflow.com/questions/5756524/how-to-get-absolute-path-name-of-shell-script-on-macos
local wkd scriptdir distrodir bindir OS no_cores
wkd=$(pwd)
scriptdir=${BASH_SOURCE[0]}
distrodir=$(cd "$(dirname "$scriptdir")" || { msg "ERROR: could not cd into $scriptdir" ERROR RED && exit 1 ; }; pwd -P)
distrodir="$wkd"/"$distrodir"
bindir="$distrodir/bin/macosx-intel"
OS='darwin'
no_cores=$(sysctl -n hw.ncpu)
#local rlibs=`for p in $(R -q -e 'print(.libPaths())'); do if [[ "$p" =~ '/' ]]; then echo -n "$p:"; fi; done; echo -n "$distrodir/lib/R"`
#export R_LIBS_SITE="$rlibs"
else
echo "ERROR: untested OS $OSTYPE, exit"
exit 1
fi
echo "$distrodir $bindir $OS $no_cores"
}
#-----------------------------------------------------------------------------------------
function check_dependencies()
{
#local VERBOSITY="$1"
# check if scripts are in path; if not, set flag
(( DEBUG > 0 )) && msg " => working in ${FUNCNAME[0]} ..." DEBUG NC
local prog bin
system_binaries=(bash R perl awk bc cut grep sed sort uniq Rscript find)
for prog in "${system_binaries[@]}"
do
bin=$(type -P "$prog")
if [ -z "$bin" ]; then
echo
printf "${RED}%s${NC}\n" "# ERROR: system binary $prog is not in \$PATH!"
printf "${LRED}%s${NC}\n" " >>> you will need to install $prog for $progname to run on $HOSTNAME"
printf "${LRED}%s${NC}\n" " >>> will exit now ..."
exit 1 # Exit with non-zero status
else
(( DEBUG > 0 )) && msg " <= exiting ${FUNCNAME[0]} ..." DEBUG NC
return 0
fi
done
}
#-----------------------------------------------------------------------------------------
function check_scripts_in_path()
{
(( DEBUG > 0 )) && msg " => working in ${FUNCNAME[0]} ..." DEBUG NC
local bash_scripts perl_scripts R_scripts prog bin distrodir not_in_path user
distrodir=$1
user=$2
not_in_path=0
(( DEBUG > 0 )) && msg "check_scripts_in_path() distrodir: $distrodir" DEBUG NC
bash_scripts=(run_parallel_molecClock_test_with_paup.sh)
perl_scripts=(run_parallel_cmmds.pl add_nos2fasta_header.pl pal2nal.pl rename.pl concat_alignments.pl \
add_labels2tree.pl convert_aln_format_batch_bp.pl popGen_summStats.pl)
R_scripts=( run_kdetrees.R compute_suppValStas_and_RF-dist.R )
# check if scripts are in path; if not, set flag
for prog in "${bash_scripts[@]}" "${perl_scripts[@]}" "${R_scripts[@]}"
do
bin=$(type -P "$prog")
if [ -z "$bin" ]; then
echo
if [ "$user" == "root" ]; then
msg "# WARNING: script $prog is not in \$PATH!" WARNING LRED
msg " >>> Will generate a symlink from /usr/local/bin or add it to \$PATH" WARNING CYAN
not_in_path=1
else
msg "# WARNING: script $prog is not in \$PATH!" WARNING LRED
msg " >>> Will generate a symlink from $HOME/bin or add it to \$PATH" WARNING CYAN
not_in_path=1
fi
fi
done
# if flag $not_in_path -eq 1, then either generate symlinks into $HOME/bin (if in $PATH) or export $distrodir to PATH
if (( not_in_path == 1 ))
then
if [[ "$user" == "root" ]]
then
if [ ! -d /usr/local/bin ]
then
msg "Could not find a /usr/local/bin directory for $user ..." WARNING CYAN
msg " ... will update PATH=$distrodir:$PATH" WARNING CYAN
export PATH="${distrodir}:${PATH}" # prepend $ditrodir to $PATH
fi
# check if $HOME/bin is in $PATH
if echo "$PATH" | sed 's/:/\n/g' | grep "/usr/local/bin$" &> /dev/null
then
msg "Found dir /usr/local/bin for $user in \$PATH ..." WARNING CYAN
msg " ... will generate symlinks in /usr/local/bin to all scripts in $distrodir ..." WARNING CYAN
ln -s "$distrodir"/*.sh /usr/local/bin &> /dev/null
ln -s "$distrodir"/*.R /usr/local/bin &> /dev/null
ln -s "$distrodir"/*.pl /usr/local/bin &> /dev/null
else
msg " Found dir /usr/local/bin for $user, but it is NOT in \$PATH ..." WARNING CYAN
msg " ... updating PATH=$PATH:$distrodir" WARNING CYAN
export PATH="${distrodir}:${PATH}" # prepend $distrodir to $PATH
fi
else
if [ ! -d "$HOME"/bin ]
then
msg "Could not find a $HOME/bin directory for $user ..." WARNING CYAN
msg " ... will update PATH=$distrodir:$PATH" WARNING CYAN
export PATH="${distrodir}:${PATH}" # prepend $ditrodir to $PATH
fi
# check if $HOME/bin is in $PATH
if echo "$PATH" | sed 's/:/\n/g'| grep "$HOME/bin$" &> /dev/null
then
msg "Found dir $HOME/bin for $user in \$PATH ..." WARNING CYAN
msg " ... will generate symlinks in $HOME/bin to all scripts in $distrodir ..." WARNING CYAN
ln -s "$distrodir"/*.sh "$HOME"/bin &> /dev/null
ln -s "$distrodir"/*.R "$HOME"/bin &> /dev/null
ln -s "$distrodir"/*.pl "$HOME"/bin &> /dev/null
#ln -s $distrodir/rename.pl $HOME/bin &> /dev/null
else
msg " Found dir $HOME/bin for $user, but it is NOT in \$PATH ..." WARNING CYAN
msg " ... updating PATH=$PATH:$distrodir" WARNING CYAN
export PATH="${distrodir}:${PATH}" # prepend $distrodir to $PATH
fi
fi
fi
(( DEBUG > 0 )) && msg " <= exiting ${FUNCNAME[0]} ..." DEBUG NC
return 0
}
#-----------------------------------------------------------------------------------------
function set_bindirs()
{
(( DEBUG > 0 )) && msg " => working in ${FUNCNAME[0]} ..." DEBUG NC
# receives: $bindir
local bindir
bindir=$1
# not_in_path=1
# # prepend $bindir to $PATH in any case, to ensure that the script runs with the distributed binaries in $bindir
export PATH="${bindir}:${PATH}"
# bins=( clustalo FastTree parallel Phi paup consense )
#
# for prog in "${bins[@]}"
# do
# bin=$(type -P $prog)
# if [ -z $bin ]
# then
# echo
# printf "${LRED}# $prog not found in \$PATH ... ${NC}\n"
# not_in_path=1
# fi
# done
#
# # check if scripts are in path; if not, set flag
# if [ $not_in_path -eq 1 ]
# then
# printf "${CYAN} updating PATH=$PATH:$bindir ${NC}\n"
# #export PATH=$PATH:$bindir # append $HOME/bin to $PATH, (at the end, to not interfere with the system PATH)
# # we do not export, so that this PATH update lasts only for the run of the script,
# # avoiding a longer-lasting alteration of $ENV;
# export PATH="${bindir}:${PATH}" # prepend $bindir to $PATH to ensure that the script runs with the distributed binaries
# fi
#echo $setbindir_flag
(( DEBUG > 0 )) && msg " <= exiting ${FUNCNAME[0]}..." DEBUG NC
return 0
}
#-----------------------------------------------------------------------------------------
function print_software_versions()
{
echo
msg ">>> Software versions run by $progname version $VERSION" PROGR LBLUE
bash --version | grep bash
check_bash_version
echo '-------------------------'
perl -v | grep version
echo '-------------------------'
R --version | grep version | grep R
echo '-------------------------'
parallel --version | grep 'GNU parallel' | grep -v warranty
echo '-------------------------'
bc --version | grep bc
echo '-------------------------'
"${bindir}"/paup --version
echo '-------------------------'
"${bindir}"/FastTree &> FT.tmp && FT_vers=$(head -1 FT.tmp | sed 's/Usage for FastTree //; s/://') && rm FT.tmp
echo "FastTree v.${FT_vers}"
echo '-------------------------'
"${bindir}"/iqtree --version | head -1
echo '-------------------------'
echo "consense, pars and seqboot v.3.69"
echo '-------------------------'
clustalo_version=$("${bindir}"/clustalo --version)
echo "clustalo v.${clustalo_version}"
echo '-------------------------'
echo
exit 0
}
#-----------------------------------------------------------------------------------------
function print_codontables()
{
cat <<CODONTBL
1 Universal code (default)
2 Vertebrate mitochondrial code
3 Yeast mitochondrial code
4 Mold, Protozoan, and Coelenterate Mitochondrial code
and Mycoplasma/Spiroplasma code
5 Invertebrate mitochondrial
6 Ciliate, Dasycladacean and Hexamita nuclear code
9 Echinoderm and Flatworm mitochondrial code
10 Euplotid nuclear code
11 Bacterial, archaeal and plant plastid code
12 Alternative yeast nuclear code
13 Ascidian mitochondrial code
14 Alternative flatworm mitochondrial code
15 Blepharisma nuclear code
16 Chlorophycean mitochondrial code
21 Trematode mitochondrial code
22 Scenedesmus obliquus mitochondrial code
23 Thraustochytrium mitochondrial code
CODONTBL
exit 0
}
#-----------------------------------------------------------------------------------------
function print_usage_notes()
{
cat <<USAGE
$progname v$VERSION extensive Help and details on the search modes and models.
1. Start the run from within the directory holding core gene clusters generated by
get_homologues.pl -e -t number_of_genomes or compare_clusters.pl -t number_of_genomes
NOTE: Both .faa and .fna files are required to generate codon alignments from DNA fasta files. This
means that two runs of compare_clusters.pl (from the get_homologues package) are required, one of them
using the -n flag. See GET_HOMOLOGUES online help http://eead-csic-compbio.github.io/get_homologues/manual/
2. $progname is intended to run on a collection of single-copy sequence clusters from different species or strains.
NOTES: An absolute minimum of 4 distinct genomes are required.
However, the power of the pipeline for selecting optimal genome loci
for phylogenomics improves when a larger number of genomes are available
for analysis. Reasonable numbers lie in the range of 10 to 200 distinct genomes
from multiple species of a genus, family, order or phylum.
The pipeline may not perform satisfactorily with very distant genome sequences,
particularly when sequences with significantly distinct nucleotide or aminoacid
compositions are used. This type of sequence heterogeneity is well known to
cause systematic bias in phylogenetic inference.
3. On the locus filtering criteria. $progname uses a hierarchical filtering scheme, as follows:
i) Detection of recombinant loci. Codon or protein alignments (depending on runmode)
are first screened with phi(w) for the presence of potential recombinant sequences.
It is a well established fact that recombinant sequences negatively impact
phylogenetic inference when using algorithms that do not account for the effects
of this evolutionary force. The permutation test with 1000 permutations is used
to compute the p-values. These are considerd significant if < 0.05.
ii) Detection of trees deviating from the expectation of the (multispecies) coalescent.
The next filtering step is provided by the kdetrees test, which checks the distribution of
topologies, tree lengths and branch lenghts. kdetrees is a non-parametric method for
estimating distributions of phylogenetic trees, with the goal of identifying trees that
are significantly different from the rest of the trees in the sample. Such "outlier"
trees may arise for example from horizontal gene transfers or gene duplication
(and subsequent neofunctionalization) followed by differential loss of paralogues among
lineages. Such processes will cause the affected genes to exhibit a history distinct
from those of the majority of genes, which are expected to be generated by the
(multispecies) coalescent as species or populations diverge. Alignments producing
significantly deviating trees in the kdetrees test are discarded.
* Parameter for controlling kdetrees stingency:
-k <real> kde stringency (0.7-1.6 are reasonable values; less is more stringent)
[default: $kde_stringency]
iii) Phylogenetic signal content.
The alignments passing the two previous filters are subjected to maximum likelihood
(ML) tree searches with FastTree or IQ-TREE to infer the corresponding ML gene trees.
The phylogenetic signal of these trees is computed from the Shimodair-Hasegawa-like
likelihood ratio test (SH-alrt) of branch support values, which vary between 0-1.
The support values of each internal branch or bipartition are parsed to compute the
mean support value for each tree. Trees with a mean support value below a cutoff
threshold are discarded.
* Parameters controlling filtering based on mean support values.
-m <real> min. average support value (0.7-0.8 are reasonable values)
for trees to be selected [default: $min_supp_val]
iv) On tree searching: From version 2.0 onwards, $progname performs tree searches using
either the FastTree (FT) or IQ-TREE (IQT) fast ML tree search algorithms,
controlled with the -A <F|I> option [default: $search_algorithm]
a) FT searches:
FT meets a good compromise between speed and accuracy, runnig both
with DNA and protein sequence alignments. It computes the above-mentioned
Shimodaria-Hasegawa-like likelihood ratio test of branch support values.
A limitation though, is that it implements only tow substitution models.
However, for divergent sequences of different species within a bacterial taxonomic
genus or family, our experience has shown that almost invariably the GTR+G model
is selected by jmodeltest2, particularly when there is base frequency heterogeneity.
The GTR+G+CAT is the substitution model used by $progname calls of FastTree on codon
alignments. From version 2.0 onwards, $progname can compute gene trees wit FT using
differetn levels of tree search intensity as defined with the -T parameter, both
during the gene-tree and species-tree estimation phases.
high: -nt -gtr -bionj -slow -slownni -gamma -mlacc 3 -spr $spr -sprlength $spr_length
medium: -nt -gtr -bionj -slownni -gamma -mlacc 2 -spr $spr -sprlength $spr_length
low: -nt -gtr -bionj -gamma -spr $spr -sprlength $spr_length
lowest: -nt -gtr -gamma -mlnni 4
where -s \$spr and -l \$spr_length can be set by the user.
The lines above show their default values.
The same applies for tree searches on concatenated codon alignments.
Note that these may take a considerable time (up to several hours)
for large datasets (~ 100 taxa and > 300 concatenated genes).
For protein alignments, the search parameters are the same, only the model changes to LG
Please refer to the FastTree manual for the details.
b) IQT searches:
Our benchmark analyses have shown that IQ-TREE (v1.6.1) runs quickly enough when the '-fast'
flag is passed to make it feasible to include a modelselection step withouth incurring in
prohibitively long computation times. Combined with its superior tree-searching algorithm,
makes IQT the clear winner in our benchmarks. Therefore, from version 2.0 (22Jan18) onwards,
$progname uses IQT as its default tree searching algorithm, both for the estimation of
gene-trees and the species-tree (from the concatenated, top-scoring alignments), now using
model selection in both cases (v 1.9 only used model selection and IQT for supermatrix search).
However, the number of models evaluated by ModelFinder (integrated in IQ-TREE) differ for the
gene-tree and species-tree search phases, as detailed below:
-IQT gene-tree searches (hard-coded): -T <high|medium|low|lowest> [default: $search_thoroughness]
high: -m MFP -nt 1 -alrt 1000 -fast [ as jModelTest]
medium: -mset K2P,HKY,TN,TNe,TIM,TIMe,TIM2,TIM2e,TIM3,TIM3e,TVM,TVMe,GTR -m MFP -nt 1 -alrt 1000 -fast
low: -mset K2P,HKY,TN,TNe,TVM,TVMe,TIM,TIMe,GTR -m MFP -nt 1 -alrt 1000 -fast
lowest: -mset K2P,HKY,TN,TNe,TVM,TIM,GTR -m MFP -nt 1 -alrt 1000 -fast
all gene trees are run in parallel under the modelset with the following parameters:
-mset X,Y,Z -m MFP -nt 1 -alrt 1000 -fast
-IQT species-tree searches on the supermatrix:
-S <string> comma-separated list of base models to be evaluated by IQ-TREE
when estimating the species tree from the concatenated supermatrix.
If no -S is passed, then sinlge default models are used, as shown below
<'JC,F81,K2P,HKY,TrN,TNe,K3P,K81u,TPM2,TPM2u,TPM3,TPM3u,
TIM,TIMe,TIM2,TIM2e,TIM3,TIM3e,TVM,TVMe,SYM,GTR'> for DNA alignments [default: $IQT_DNA_models]
<'BLOSUM62,cpREV,Dayhoff,DCMut,FLU,HIVb,HIVw,JTT,JTTDCMut,LG,
mtART,mtMAM,mtREV,mtZOA,Poisson,PMB,rtREV,VT,WAG'> for PROT alignments [default: $IQT_PROT_models]
In addition, if -T high, $progname will launch -N <integer> [default: $nrep_IQT_searches] independent IQT searches
on the supermatrix of concatenated top-scoring markers.
v) Running the pipeline in population-genetics mode (-R 2 -t DNA):
When invoked in popGen mode (-R 2), the pipeline will perform the same 4 initial steps as in phylo mode (-R 1):
1. generate codon alginments
2. chech them for the presence of recombinant sequences
3. estimate phylogenetic trees from non-recombinant alignments
4. filter gene trees for outliers with kdetrees test
The filtered alignments (non-recombinant and non-outlier) will then enter into the DNA polymorphims analysis,
which involves computing basic descriptive statistics from the alignments as well as performing the popular
Tajima\'s D and Fu and Li\'s D* neutrality tests. The results are summarized in a table with the following
fifteen columns:
Alignment_name
no_seqs
aln_len
avg_perc_identity
pars_info_sites
consistency_idx
homoplasy_idx
segregating_sites
singletons
pi_per_gene
pi_per_site
theta_per_gene
theta_per_site
tajimas_D
fu_and_li_D_star
This allows the user to identify neutral markers with desirable properties for standard population genetic analyses.
INVOCATION EXAMPLES:
1. default on DNA sequences (uses IQ-TREE evaluating a subset of models specified in the detailed help)
$progname -R 1 -t DNA
2. thorough FastTree searching and molecular clock analysis on DNA sequences using 10 cores:
$progname -R 1 -t DNA -A F -k 1.2 -m 0.7 -s 20 -l 12 -T high -K -M HKY -q 0.95 -n 10
3. FastTree searching on a huge protein dataset for fast inspection
$progname -R 1 -A F -t PROT -m 0.6 -k 1.0 -T lowest
4. To run the pipeline on a remote server, we recommend using the nohup command upfront, as shown below:
nohup $progname -R 1 -t DNA -S 'TNe,TVM,TVMe,GTR' -k 1.0 -m 0.75 -T high -N 5 &> /dev/null &
5. Run in population-genetics mode (generates a table with descritive statistics for DNA-polymorphisms
and the results of diverse neutrality tests)
$progname -R 2 -t DNA
NOTES
1: run from within the directory holding core gene clusters generated by get_homologues.pl -e or
compare_clusters.pl with -t no_genome_gbk files (core genome: all clusters with a single gene copy per genome)
2: If you encounter any problems, please run the script with the -D -V flags added at the end of the command line,
redirect STOUT to a file and send us the output, so that we can better diagnose the problem.
e.g. $progname -R 1 -t DNA -k 1.0 -m 0.7 -s 8 -l 10 -T high -K -D &> ERROR.log
USAGE
exit 0
}
#-----------------------------------------------------------------------------------------
function print_help()
{
cat <<EoH
$progname v.$VERSION OPTIONS:
REQUIRED:
-R <integer> RUNMODE
1 select optimal markers for phylogenetics/phylogenomics (genomes from different species)
2 select optimal markers for population genetics (genomes from the same species)
-t <string> type of input sequences: DNA|PROT
OPTIONAL:
-h|--help flag, print this short help notes
-H flag, print extensive Help and details about search modes and models
-A <string> Algorithm for tree searching: <F|I> [FastTree|IQ-TREE] [default:$search_algorithm]
-c <integer> NCBI codontable number (1-23) for pal2nal.pl to generate codon alignment [default:$codontable]
-C flag to print codontables
-D <int [1|2|3] to print debugging messages of low, high (set -v) and very high (set -vx)
verbosity, respectively. Please use -D <1|2> if you encounter problems [default: $DEBUG]
-e <integer> select gene trees with at least (min. = $min_no_ext_branches) external branches [default: $min_no_ext_branches]
-f <string> GET_HOMOLOGUES cluster format <STD|EST> [default: $cluster_format]
-I|--IQT_threads <integer> threads/cores for IQTree2 searching [default: $IQT_threads]
-k <real> kde stringency (0.7-1.6 are reasonable values; less is more stringent) [default: $kde_stringency]
-K flag to run molecular clock test on codon alignments [default: $eval_clock]
-l <integer> max. spr length (7-12 are reasonable values) [default: $spr_length]
-m <real> min. average support value (0.6-0.8 are reasonable values) for trees to be selected [default: $min_supp_val]
-M <string> base Model for clock test (use one of: GTR|TrN|HKY|K2P|F81); uses +G in all cases [default: $base_mod]
-n <integer> number of cores/threads to use for parallel computations except IQT searches [default: all cores]
-N <integer> number of IQ-TREE searches to run [only active with -T high] [default: $nrep_IQT_searches]
-q <real> quantile (0.95|0.99) of Chi-square distribution for computing molec. clock p-value [default: $q]
-r <string> root method (midpoint|outgroup) [default: $root_method]
-s <integer> number of spr rounds (4-20 are reasonable values) for FastTree tree searching [default: $spr]
-S <string> quoted 'comma-separated list' of base models to be evaluated by IQ-TREE
when estimating the species tree from the concatenated supermatrix (see -H for details).
If no -S is passed, then sinlge default models are used, as shown below
<'JC,F81,K2P,HKY,TrN,TNe,K3P,K81u,TPM2,TPM2u,TPM3,TPM3u,
TIM,TIMe,TIM2,TIM2e,TIM3,TIM3e,TVM,TVMe,SYM,GTR'> for DNA alignments [default: $IQT_DNA_models]
<'BLOSUM62,cpREV,Dayhoff,DCMut,FLU,HIVb,HIVw,JTT,JTTDCMut,LG,
mtART,mtMAM,mtREV,mtZOA,Poisson,PMB,rtREV,VT,WAG'> for PROT alignments [default: $IQT_PROT_models]
-T <string> tree search Thoroughness: high|medium|low|lowest (see -H for details) [default: $search_thoroughness]
-v|--version flag, print version and exit
-V|--Versions flag, print software versions
INVOCATION EXAMPLES:
1. default on DNA sequences (uses IQ-TREE evaluating a subset of models specified in the detailed help)
$progname -R 1 -t DNA -I 4
2. thorough FastTree searching and molecular clock analysis on DNA sequences using 10 cores:
$progname -R 1 -t DNA -A F -k 1.2 -m 0.7 -s 20 -l 12 -T high -K -M HKY -q 0.95 -n 10
3. FastTree searching on a huge protein dataset for fast inspection
$progname -R 1 -A F -t PROT -m 0.6 -k 1.0 -T lowest
4. To run the pipeline on a remote server, we recommend using the nohup command upfront, as shown below:
nohup $progname -R 1 -t DNA -S 'TNe,TVM,TVMe,GTR' -k 1.0 -m 0.75 -T high -N 5 -I 8 &> /dev/null &
5. Run in population-genetics mode (generates a table with descritive statistics for DNA-polymorphisms
and the results of diverse neutrality test)
$progname -R 2 -t DNA
NOTES
1: run from within the directory holding core gene clusters generated by get_homologues.pl -e or
compare_clusters.pl with -t no_genome_gbk files (core genome: all clusters with a single gene copy per genome)
2: If you encounter any problems, please run the script with the -D -V flags added at the end of the command line,
redirect STOUT to a file and send us the output, so that we can better diagnose the problem.
e.g. $progname -R 1 -t DNA -k 1.0 -m 0.7 -s 8 -l 10 -T high -K -D &> ERROR.log
EoH
exit 0
}
#-----------------------------------------------------------------------------------------
#------------------------------------#
#----------- GET OPTIONS ------------#
#------------------------------------#
# Required
runmode=0
mol_type=''
# Optional, with defaults
cluster_format=STD
search_thoroughness='high'
IQT_threads=2 # used only with concatenated supermatrix 2|INTEGER; AUTO is too slow
kde_stringency=1.5
min_supp_val=0.65
min_no_ext_branches=4
n_cores=''
#VERBOSITY=0
spr_length=8
spr=4
codontable=11 # bacterial by default, sorry for the bias ;)
base_mod=GTR
eval_clock=0
root_method=midpoint
tree_prefix=concat
q=0.99
search_algorithm=I
IQT_DNA_models=GTR
IQT_PROT_models=LG
IQT_models=''
nrep_IQT_searches=5
software_versions=0
declare -a args
args=("$@")
while getopts ':-:c:D:e:f:I:k:l:m:M:n:N:p:q:r:s:t:A:T:R:S:hCHKvV' VAL
do
case $VAL in
h) print_help
;;
A) search_algorithm=$OPTARG
;;
k) kde_stringency=$OPTARG
;;
c) codontable=$OPTARG
;;
C) print_codontables
;;
D) DEBUG=$OPTARG
;;
e) min_no_ext_branches=$OPTARG
;;
f) cluster_format=$OPTARG
;;
H) print_usage_notes
;;
I) IQT_threads=$OPTARG
;;
K) eval_clock=1
;;
l) spr_length=$OPTARG
;;
m) min_supp_val=$OPTARG
;;
M) base_mod=$OPTARG
;;
n) n_cores=$OPTARG
;;
N) nrep_IQT_searches=$OPTARG
;;
p) tree_prefix=$OPTARG
;;
q) q=$OPTARG
;;
R) runmode=$OPTARG
;;
r) root_method=$OPTARG
;;
s) spr=$OPTARG
;;
S) IQT_models=$OPTARG
;;
t) mol_type=$OPTARG
;;
T) search_thoroughness=$OPTARG
;;
v) echo "$progname v$VERSION" && exit 0
;;
V) software_versions=1
;;
#----------- add support for parsing long argument names-----------------------
- )
case $OPTARG in
h* ) print_help ;;
v* ) { echo "$progname v$VERSION" && exit 0 ; } ;;
V* ) software_versions=1 ;;
* ) echo "# ERROR: invalid long options --$OPTARG" && print_help ;;
esac >&2 # print the ERROR MESSAGES to STDERR
;;
#------------------------------------------------------------------------------
:) printf "argument missing from -%s option\n" "-$OPTARG"
print_help
;;
*) echo "invalid option: -$OPTARG"
echo
print_help
;;
esac >&2 # print the ERROR MESSAGES to STDERR
done
shift $((OPTIND - 1))
#-------------------------------------------------------#
# >>>BLOCK 0.1 SET THE ENVIRONMENT FOR THE PIPELINE <<< #
#-------------------------------------------------------#
# logdir=$(pwd) # set on top of script
# 0. Set the distribution base directory and OS-specific (linux|darwin) bindirs
env_vars=$(set_pipeline_environment) # returns: $distrodir $bindir $OS $no_proc
(( DEBUG > 0 )) && echo "env_vars:$env_vars"
distrodir=$(echo "$env_vars"|awk '{print $1}')
bindir=$(echo "$env_vars"|awk '{print $2}')
OS=$(echo "$env_vars"|awk '{print $3}')
no_proc=$(echo "$env_vars"|awk '{print $4}')
# source get_phylomarkers_fun_lib into the script to get access to reminder of the functions
source "${distrodir}"/lib/get_phylomarkers_fun_lib || { echo "ERROR: could not source ${distrodir}/lib/get_phylomarkers_fun_lib" && exit 1 ; }
if (( DEBUG == 2 )); then
# Print shell input lines as they are read.
set -v
elif (( DEBUG == 3 )); then # hysteric DEBUG level
# Print shell input lines as they are read.
set -v
# Print a trace of simple commands, for commands, case commands, select commands, and arithmetic for commands
# and their arguments or associated word lists after they are expanded and before they are executed.
set -x
fi
# check check_bash_version >= min_bash_version
check_bash_version
#-----------------------------------------------------------------------------------------
(( DEBUG > 0 )) && msg "distrodir:$distrodir|bindir:$bindir|OS:$OS|no_proc:$no_proc" DEBUG LBLUE
# 0.1 Determine if pipeline scripts are in $PATH;
# if not, add symlinks from ~/bin, if available
check_scripts_in_path "$distrodir" "$USER" || { msg "ERROR: check_scripts_in_path" ERROR RED && exit 1; }
# 0.2 Determine the bindir and add prepend it to PATH and export
set_bindirs "$bindir"
# 0.3 append the $distrodir/lib/R to R_LIBS and export
# NOTE: set -o nounset complained with the code below line 739: R_LIBS: unbound variable
#if [[ -n "$R_LIBS" ]]; then
# export R_LIBS="$R_LIBS:$distrodir/lib/R"
#else
# export R_LIBS="$distrodir/lib/R"
#fi
export R_LIBS="$distrodir/lib/R"
# 0.4 append the $distrodir/lib/perl to PERL5LIB and export
export PERL5LIB="${distrodir}/lib/perl:${distrodir}/lib/perl/bioperl-1.5.2_102"
# 0.5 check all dependencies are in place
check_dependencies 1
(( software_versions == 1 )) && print_software_versions
#--------------------------------------#
# >>> BLOCK 0.2 CHECK USER OPTIONS <<< #
#--------------------------------------#
# check for bare minimum dependencies: bash R perl awk cut grep sed sort uniq Rscript
if (( runmode < 1 )) || (( runmode > 2 ))
then
msg "# ERROR: need to define a runmode <int> [1|2]!" HELP RED
print_help
fi
if [ "$search_algorithm" != "I" ] && [ "$search_algorithm" != "F" ]
then
msg "# ERROR: search_algorithm $search_algorithm is not recognized!" ERROR RED
print_help
fi
if (( min_no_ext_branches < 4 ))
then
msg 'ERROR: -e has to be >= 4' HELP RED
print_help
fi
if [ -z "$n_cores" ]
then
n_cores="$no_proc"
fi
# make sure that the user does not request more cores than those available on host (default: AUTO)
((n_cores > no_proc)) && n_cores="$no_proc"
[[ "$IQT_threads" != 'AUTO' ]] && ((IQT_threads > no_proc)) && IQT_threads="$no_proc"
if [ "$mol_type" != "DNA" ] && [ "$mol_type" != "PROT" ] # "$mol_type" == "BOTH" not implemented yet
then
msg "ERROR: -t must be DNA or PROT" ERROR RED
print_help
fi
if [ -z "$IQT_models" ]
then
[ "$mol_type" == "DNA" ] && IQT_models="$IQT_DNA_models"
[ "$mol_type" == "PROT" ] && IQT_models="$IQT_PROT_models"
fi
if [ "$search_algorithm" == "I" ] && [ "$mol_type" == "DNA" ]
then
check_IQT_DNA_models "$IQT_models"
fi
if [ "$search_algorithm" == "I" ] && [ "$mol_type" == "PROT" ]
then
check_IQT_PROT_models "$IQT_models"
fi
if [ "$search_thoroughness" != "high" ] && [ "$search_thoroughness" != "medium" ] \
&& [ "$search_thoroughness" != "low" ] \
&& [ "$search_thoroughness" != "lowest" ]
then
msg "ERROR: -T must be lowest|low|medium|high" HELP RED
print_help
fi
if [ "$base_mod" != "GTR" ] && [ "$base_mod" != "TrN" ] && [ "$base_mod" != "HKY" ] \
&& [ "$base_mod" != "K2P" ] && [ "$base_mod" != "F81" ]
then
msg "ERROR: -M must be one of: GTR|TrN|HKY|K2P|F81" HELP RED
print_help
fi
if (( eval_clock > 0 )) && [ "$mol_type" != "DNA" ] # MolClock currently only with DNA
then
msg "-K 1 (evaluate clock) must be run on codon alignments with -t DNA" HELP RED
print_help
fi
if (( runmode > 1 )) && [[ "$mol_type" != "DNA" ]] # PopGen analysis currently only with DNA
then
msg "ERROR: runmode $runmode must be run on codon alignments with -t DNA" HELP RED
print_help
fi
#---------------------#
# >>>> MAIN CODE <<<< #
#---------------------#
(( DEBUG > 0 )) && msg "running on $OSTYPE" DEBUG LBLUE && echo "path contains: " && echo -e "${PATH//:/'\n'}"
start_time=$(date +%s)
parent_PID=$(get_script_PID "$USER" "$progname")
(( DEBUG > 0 )) && msg "parent_PID:$parent_PID" DEBUG LBLUE
if (( eval_clock == 1 )) && [[ "$search_algorithm" == "F" ]]
then
dir_suffix="A${search_algorithm}R${runmode}t${mol_type}_k${kde_stringency}_m${min_supp_val}_s${spr}_l${spr_length}_T${search_thoroughness}_K"
elif (( eval_clock != 1 )) && [[ "$search_algorithm" == "F" ]]
then
dir_suffix="A${search_algorithm}R${runmode}t${mol_type}_k${kde_stringency}_m${min_supp_val}_s${spr}_l${spr_length}_T${search_thoroughness}"
elif (( eval_clock == 1 )) && [[ "$search_algorithm" == "I" ]]
then
dir_suffix="A${search_algorithm}R${runmode}t${mol_type}_k${kde_stringency}_m${min_supp_val}_T${search_thoroughness}_K"
elif (( eval_clock != 1 )) && [ "$search_algorithm" == "I" ]
then
dir_suffix="A${search_algorithm}R${runmode}t${mol_type}_k${kde_stringency}_m${min_supp_val}_T${search_thoroughness}"
fi
msg "$progname vers. $VERSION run with the following parameters:" PROGR CYAN
# <FIXME> # SC2124: $* Explicitly concatenates all the array elements into a single string <SOLVED>
lmsg=("Run started on $TIMESTAMP_SHORT_HMS under $OSTYPE on $HOSTNAME with $n_cores cores
wkdir:$wkdir
distrodir:$distrodir
bindir:$bindir
> General run settings:
runmode:$runmode|mol_type:$mol_type|search_algorithm:$search_algorithm|cluster_format:$cluster_format
> Filtering parameters:
kde_stringency:$kde_stringency|min_supp_val:$min_supp_val
> FastTree parameters:
spr:$spr|spr_length:$spr_length|search_thoroughness:$search_thoroughness
> IQ-TREE parameters:
IQT_models:$IQT_models|search_thoroughness:$search_thoroughness
nrep_IQT_searches:$nrep_IQT_searches|IQT_threads:$IQT_threads
> Molecular Clock parmeters:
eval_clock:$eval_clock|root_method:$root_method|base_model:$base_mod|ChiSq_quantile:$q
> DEBUG=$DEBUG
# script invocation: $progname ${args[@]}")
msg "${lmsg[*]}" PROGR YELLOW
#----------------------------------------------------------------------------------------------------------------
#>>>BLOCK 1. make a new subdirectory within the one holding core genome clusters generated by compare_clusters.pl
# and generate symlinks to the faa and fna files (NOTE: BOTH REQUIRED). Fix fasta file names and headers.
# Check that we have fna faa input FASTA file pairs and that they contain the same number of sequences and
# instances of each taxon. Mark dir as top_dir
#----------------------------------------------------------------------------------------------------------------
if [ -d "get_phylomarkers_run_${dir_suffix}_${TIMESTAMP_SHORT}" ]
then
msg " >>> ERROR Found and older get_phylomarkers_run_${dir_suffix}_${TIMESTAMP_SHORT}/ directory. Please remove or rename and re-run!" ERROR RED
exit 2
fi
msg "" PROGR NC
msg " >>>>>>>>>>>>>> running input data sanity checks <<<<<<<<<<<<<< " PROGR YELLOW
msg "" PROGR NC
# populate the alnIDs2names_h hash
# 2332_hypothetical_protein_xzf.fna; key:2332 => val:2332_hypothetical_protein_xzf
while read -r a; do
alnIDs2names["${a%%_*}"]="${a%.*}"
done < <(find . -maxdepth 1 -name "*.fna")
# make sure we have *.faa and *.fna file pairs to work on
if ! nfna=$(find . -maxdepth 1 -name "*.fna" | wc -l)
then
msg " ERROR: there are no input fna files to work on!\n\tPlease check input FASTA files: [you may need to run compare_clusters.pl with -t NUM_OF_INPUT_GENOMES -n]\n\tPlease check the GET_HOMOLOGUES manual" ERROR RED
msg "http://eead-csic-compbio.github.io/get_homologues/manual/" ERROR BLUE
exit 2
fi
if ! nfaa=$(find . -maxdepth 1 -name "*.faa" | wc -l)
then
msg "ERROR: there are no input faa files to work on!\n\tPlease check input FASTA files: [you may need to run compare_clusters.pl with -t NUM_OF_INPUT_GENOMES]\n\tPlease check the GET_HOMOLOGUES manual" ERROR RED