-
Notifications
You must be signed in to change notification settings - Fork 8
/
exampleVEP.sh
executable file
·116 lines (88 loc) · 3.12 KB
/
exampleVEP.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/bin/bash
# Copyright [2020-21] EMBL-European Bioinformatics Institute
# documentation about Ensembl VEP can be found at
# http://www.ensembl.org/info/docs/tools/vep/index.html
# set Ensembl Plants release number, check it
# at the bottom of http://plants.ensembl.org
# EG stands for Ensembl Genomes
# example call: VEPATH=/path/to/ensembl-vep EGRELEASE=50 ./exampleVEP.sh
if [ -z "${EGRELEASE}" ]; then
EGRELEASE=49
fi
# edit if needed to point to ensembl-vep
if [ -z "${VEPATH}" ]; then
VEPATH=./
fi
# check VEP
if [ ! -f "${VEPATH}/ensembl-vep/vep" ]; then
echo "# ERROR: Cannot find ${VEPATH}/ensembl-vep/vep not found, please set VEPATH accordingly"
exit 1
fi
# work out Ensembl release, do not change
RELEASE=$((EGRELEASE + 53));
echo "EGRELEASE=${EGRELEASE}"
echo
## V1) Download, install and update VEP
# Fresh install
#git clone https://github.com/Ensembl/ensembl-vep.git
#cd ensembl-vep
#perl INSTALL.pl
# To update from a previous version:
#cd ensembl-vep
#git pull
#git checkout release/$RELEASE
#perl INSTALL.pl
## V2) Unpack downloaded cache file & check SIFT support
# Note: cache downloaded in recipe F8
# Note: look for "sift b"
SPECIES=arabidopsis_thaliana
VEPCACHE="${SPECIES}*.tar.gz*"
if [ ! -f ${VEPCACHE} ]; then
echo "# ERROR: Cache file ${VEPCACHE} not found, get it with recipe F8"
exit 1
else
tar xfz $VEPCACHE
pattern="${SPECIES}/${EGRELEASE}_*/info.txt"
files=( $pattern )
INFOFILE="${files[0]}"
if [ -f "${INFOFILE}" ]; then
grep sift "${INFOFILE}"
echo "${INFOFILE}"
else
echo "# ERROR: Cannot find file ${INFOFILE}, please correct/set variable EGRELEASE"
exit 1
fi
fi
## V3) Predict effect of variants
# See more options and examples at
# http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html
# http://www.ensembl.org/info/docs/tools/vep/script/vep_example.html
VCFILE="${VEPATH}/ensembl-vep/examples/arabidopsis_thaliana.TAIR10.vcf"
OUTFILE='arabidopsis_thaliana.vep.output'
VEPOPTIONS=(
--genomes # Ensembl Genomes, for Plants
--species $SPECIES
--cache # use local cache file, opposed to --database
--dir_cache ./ # location of unpacked cache $SPECIES folder
--cache_version $EGRELEASE
--check_existing # co-located known variants
--distance 5000 # max dist between variant and transcript
--biotype # show biotype of neighbor transcript
--input_file $VCFILE
--output_file $OUTFILE
)
# --sift b # only some species have SIFT precomputed
${VEPATH}/ensembl-vep/vep "${VEPOPTIONS[@]}"
## V4) Predict effect of variants for species not in Ensembl
# GFF file must be sorted and indexed with BGZIP and TABIX, see
# http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#gff
FASTAGZFILE= # GZIP-compressed file of genome FASTA file
GFFILE= # gene models matching sequences in FASTAGZFILE
GZGFFILE=$GFFILE.sorted.gz
if [[ -f $GFFILE && -f $FASTAGZFILE ]]; then
# sort and index
grep -v "#" $GFFILE | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > $GZGFFILE
tabix -p gff $GZGFFILE
# actually call vep
${VEPATH}/ensembl-vep/vep -i $VCFILE -gff $GZGFFILE -fasta $FASTAGZFILE
fi