-
Notifications
You must be signed in to change notification settings - Fork 8
/
exampleBiomart.R
executable file
·134 lines (95 loc) · 2.77 KB
/
exampleBiomart.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env Rscript
# Examples of R queries to Ensembl Plants Biomart
# Your usage of the data returned by the Biomart service is
# subject to same conditions as laid out on the Ensembl website.
#
# See documentation at
# https://www.ensembl.org/info/data/biomart/how_to_use_biomart.html
# https://www.ensembl.org/info/data/biomart/biomart_r_package.html
#
# Copyright [2020-21] EMBL-European Bioinformatics Institute
# To install R package biomaRt run: Rscript install_R_deps.R
local_lib = "./lib/R/"
.libPaths( c( .libPaths(), local_lib) )
library("biomaRt")
args = commandArgs(trailingOnly=TRUE)
## B1) Check plant marts and select dataset
listMarts( host="plants.ensembl.org" )
EPgenes = useEnsembl( biomart="plants_mart",
host="plants.ensembl.org")
dsets = listDatasets(EPgenes)
dsets[grep("Triticum aestivum", dsets$description),]
# dataset description version
# 69 taestivum_eg_gene Triticum aestivum genes (IWGSC) IWGSC
# take a note of the dataset name 'taestivum_eg_gene'
## B2) Check available filters and attributes
EPgenes = useMart(
biomart="plants_mart",
host="plants.ensembl.org",
dataset="taestivum_eg_gene")
head( listFilters(EPgenes) )
head( listAttributes(EPgenes) )
# stop here if just a test
if(length(args)==1 && args[1]=="test"){
q("no",1)
}
## B3) Download GO terms associated to genes
# Note genes might appear in several rows
go = getBM(
attributes=c("ensembl_gene_id", "go_id"),
mart=EPgenes)
head(go)
## B4) Get Pfam domains annotated in genes
EPgenes = useMart(
biomart="plants_mart",
host="plants.ensembl.org",
dataset="hannuus_eg_gene")
pfam = getBM(
attributes=c("ensembl_gene_id", "pfam"),
mart=EPgenes)
head(pfam)
## B5) Get SNP consequences from a selected variation source
# Note this requires connecting to a different mart (snp)
# Note this query takes a few minutes to run
EPvar = useMart( biomart="plants_variations",
host="plants.ensembl.org",
dataset="taestivum_eg_snp")
snp_source = c("EMS-induced mutation")
chrs = listFilterValues(mart=EPvar,
filter="chr_name")
attribs = c(
"refsnp_id",
"refsnp_source",
"ensembl_gene_stable_id",
"consequence_type_tv",
"sift_prediction",
"sift_score")
filts = c(
"variation_source",
"chr_name",
"sift_prediction")
preds = c(
"tolerated", # comment if unwanted
"deleterious")
snps <- NULL
for(chr in chrs){
print(chr) # show progress
for(pred in preds){
print(pred) # show progress
tmp_s <- getBM(
attributes=attribs,
filters=filts,
values=list(
variation_source=snp_source,
chr_name=chr,
sift_prediction=c(pred)),
mart=EPvar)
# append SNP batches to object snps
if(is.null(snps)){
snps<-tmp_s
}else{
snps<-rbind(snps,tmp_s)
}
}
}
head(snps)