-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeProbeAnno.R
183 lines (153 loc) · 8.37 KB
/
makeProbeAnno.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
##--------------------------------------------------------------------------------
## Use NimbleGen's POS file or the output from MUMmer or BLAT or Exonerate
## (querying all probes on the chip vs genome)
## to construct probe annotaton datastructures
##
## IMPORTANT: After running MUMmer or BLAT condense their various output files
## into one table using the scripts 'condenseMUMmerOutput.pl' or
## 'condenseBlatOutput.pl' respectively. This condensed table is simlilar to
## a NimbleGen POS file and usable by this script!
##
## This script has the following parts
## 1. Read the table containing the POS file or MUMmer or BLAT hits
## 2. Construct along-chromosome vectors for probeAnno environment:
## start, end: (integer)
## index: index (1...6553600) of the PM on the chip (integer)
## unique: whether the probe hits multiple places (logical)
## 3. Construct GFF dataframe
## 4. optional: compute probes' GC content from NDF file supplied by NimbleGen
## 5. Write 'probeAnno.rda'
##--------------------------------------------------------------------------------
## Note: This script has been adopted from the 'davidTiling' package.
## However, since ChIP-chip is not strand-specific,
## the strand details were omitted.
## a. path to the result table after running the condense*Output.pl script
## FORMAT of that position file should be in NimbleGen's pos format:
# here's an example of their typical head (comment char and one whitespace at start of each line have been added only just here)
# SEQ_ID CHROMOSOME PROBE_ID POSITION LENGTH COUNT
# chr1:4483138-4484138 CHR01 CHR01P004483140 4483140 50 1
# chr1:4483138-4484138 CHR01 CHR01P004483245 4483245 50 1
# ...
# what to use:
hitResultFile <- "allChrom_Output.txt"
#hitResultFile2 <- "MOD_2003-12-05_SUZ12_2in2.pos"
# what to build:
gffFile <- "cel_ensembl_gff.rda"
probeAnnoFile <- "celegansProbeAnno.rda"
# how are the chromosomes referred to?
allChr <- c(1:5,"X") # C. elegans
# allChr <- "9" # the example data only contains information concerning reporters mapped to chromosome 9
# what can this script do:
possibleProducts <- c("probeAnno","gff","probegc")
# IMPORTANT what do we want to do now:
what <- possibleProducts[c(1,2,3)]
### READ IN ###################################################
hits <- read.delim(hitResultFile, header=TRUE, as.is=TRUE)
# do some sorting again to be on the safe side:
if (exists("hitResultFile2")){
hits2 <- read.delim(hitResultFile2, header=TRUE, as.is=TRUE)
hits <- rbind(hits, hits2)
}
if (!all(c("SEQ_ID","CHROMOSOME","PROBE_ID","POSITION","LENGTH")%in%names(hits)))
stop(paste("File",hitResultFile,"does not seem to be a POS file, since at least one of the
required column headers (SEQ_ID,CHROMOSOME,PROBE_ID,POSITION,LENGTH) is missing.\n"))
### if all the probes have the same length, one could do the sorting on the start positions alone and indeed in versions prior to 1.1.18 this was the case; for arrays with varying probe lengths, sorting on the probe's match middle position is preferable
hits$MIDDLE <- hits$POSITION + round((hits$LENGTH - 1)/2)
hitOrder <- order(hits$CHROMOSOME, hits$MIDDLE)
hits <- hits[hitOrder,]
if (length(grep("chr",hits$CHROMOSOME, ignore.case=TRUE)))
hits$CHROMOSOME <- gsub("chr(0)?","", hits$CHROMOSOME, ignore.case=TRUE)
probeindex <- 1:nrow(hits)
probeMultiplicity <- table(hits$PROBE_ID)
##--------------------------------------------------
## The along-chromosome data in 'probeAnno'
##--------------------------------------------------
## group PMs by chromosome
if("probeAnno" %in% what) {
cat("Making probeanno: ")
probeAnno = new.env()
sp <- split(probeindex, as.factor(hits$CHROMOSOME))
# for every chromosome, assign the hiting probes to the corresponding vector:
for(i in seq(along=sp)) {
chromprobes = sp[[i]]
nm = names(sp)[i]
cat(nm, "")
assign(paste(nm, "start", sep="."), as.integer(hits[chromprobes, "POSITION"]), envir=probeAnno)
assign(paste(nm, "end", sep="."), as.integer(hits[chromprobes, "POSITION"])+as.integer(hits[chromprobes, "LENGTH"])-1 , envir=probeAnno)
assign(paste(nm, "index", sep="."), hits[chromprobes, "PROBE_ID"], envir=probeAnno)
assign(paste(nm, "unique", sep="."), as.integer(probeMultiplicity[hits$PROBE_ID[chromprobes]]>1)*3,
envir=probeAnno)
## uniqueness codes: 0 = probe has one unique hit; 3= probe has multiple hits
}
cat("\n")
if (probegc %in% what) {
### add the GC content of each probe on the used arrays to the probeAnno environment
list.files(pattern="\\.ndf")
ndf.file.name <- "MOD_2003-12-05_SUZ12_1in2.ndf"
ndf <- read.delim(ndf.file.name, header=TRUE, as.is=TRUE)
stopifnot(all(c("PROBE_ID","PROBE_SEQUENCE") %in% names(ndf)))
are.unique <- !duplicated(ndf[["PROBE_ID"]])
probes.gc <- compute.gc(ndf[["PROBE_SEQUENCE"]][are.unique])
# my take some minutes
names(probes.gc) <- ndf[["PROBE_ID"]][are.unique]
assign("probes.gc", value=probes.gc, env=probeAnno)
rm(ndf, ndf2, ndf.file.name1, ndf.file.name2);gc()
}# if (probegc %in% what)
save(probeAnno, file=probeAnnoFile)
}# if("probeAnno" %in% what)
##--------------------------------------------------
## gff
##--------------------------------------------------
if("gff" %in% what) {
# augment list of transcript details with further info and save as gff
library("biomaRt")
ensembl <- useMart("ensembl", dataset="celegans_gene_ensembl")
gene.ids <- unique(unlist(lapply(as.list(allChr), function(this.chr) getFeature(type="ensembl_gene_id", chr=this.chr, mart=ensembl)[,2]), use.names=FALSE))
# what information to get for each transcript:
sel.attributes=c("ensembl_gene_id", "ensembl_transcript_id", "hgnc_symbol", "chromosome_name", "strand", "start_position","end_position", "transcript_start", "transcript_end", "description")
# if 'hgnc_symbol' is not a defined attribute for gene symbols in your data species, try to find an equivalent, using commands such as this one:
grep("symbol",listAttributes(ensembl)[,1], value=TRUE)
# retreive information:
gff <- getBM(attributes=sel.attributes, filters="ensembl_gene_id", value=gene.ids, mart=ensembl)
martDisconnect(ensembl)
## replace attribute names by standardized names
gff$gene <- gff$"ensembl_gene_id"
gff$name <- gff$ensembl_transcript_id
gff$chr <- gff$chromosome_name
gff$symbol <- gff$"hgnc_symbol"
gff$start <- gff$transcript_start
gff$end <- gff$transcript_end
gff$"gene_start" <- gff$"start_position"
gff$"gene_end" <- gff$"end_position"
gff$feature <- rep("transcript",nrow(gff))
gff$chromosome_name <- gff$markersymbol <- NULL
gff$transcript_start <- gff$transcript_end <- NULL
gff$start_position <- gff$end_position <- NULL
gff$ensembl_transcript_id <- NULL
gff$ensembl_gene_id <- NULL
### some transcripts names my occurr in multiples, usually this is because an Ensembl transcript can have more than one Symbol defined for it:
if (any(duplicated(gff$name))){
dupl.trans <- unique(gff$name[duplicated(gff$name)])
G <- lapply(as.list(dupl.trans), function(this.trans){
this.gff <- subset(gff,name == this.trans)
if (nrow(unique(this.gff[,c("gene","name","chr","start","end","description")]))>1) return(this.gff[1,,drop=FALSE])
non.zero.gff <- subset(this.gff, nchar(symbol)>0)
this.other.sym <- NULL
if (nrow(non.zero.gff)> 0){
this.new.sym <- non.zero.gff$symbol[which.min(nchar(non.zero.gff$symbol))]
if (nrow(non.zero.gff)>1)
this.other.sym <- paste("Synonyms",paste(non.zero.gff$symbol[-which.min(nchar(non.zero.gff$symbol))],collapse=","),sep=":")
} else { this.new.sym <- "" }
this.gff$symbol[1] <- this.new.sym
if (!is.null(this.other.sym))
this.gff$description[1] <- paste(this.gff$description[1],this.other.sym,sep=";")
return(this.gff[1,,drop=FALSE])
})
gff <- rbind(gff[-which(gff$name %in% dupl.trans),],do.call("rbind",G))
rm(G, dupl.trans);gc()
}#if (any(duplicated(gff$name)))
# reorder by chromosome and start position
gff <- gff[order(gff$chr, gff$gene_start, gff$start),c("name","gene","chr","strand","gene_start","gene_end","start","end","symbol", "description","feature")]
rownames(gff) <- NULL
save(gff, file=gffFile, compress=TRUE)
} # make gff