Skip to content

Commit ba6cdf9

Browse files
committed
allow aliases to be used - respect sample order when making final heatmap. resolves #6
1 parent fd24aa5 commit ba6cdf9

File tree

2 files changed

+156
-120
lines changed

2 files changed

+156
-120
lines changed

genomicHeatmapMaker.R

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,36 @@ library(pipeUtils)
99

1010
referenceGenome <- "hg18"
1111
heat_map_result_dir <- "./heatmap"
12-
sampleName <- c("pool1-1", "E9")
12+
sampleNameInput <- c("GTSP0308%", "GTSP0309%")
13+
#order should be preserved in final heatmap
14+
names(sampleNameInput) <- c("Tcells", "Monocytes")
1315

1416
# should have at least two samples
15-
stopifnot(length(sampleName) != 1)
17+
stopifnot(length(sampleNameInput) != 1)
18+
19+
#if no names are given, just duplicate the sampleNames
20+
if(is.null(names(sampleNameInput))){
21+
names(sampleNameInput) <- sampleNameInput
22+
}
23+
24+
stopifnot(length(sampleNameInput) == length(unique(names(sampleNameInput))))
25+
26+
#dereplicating wildcards and changing 'originalNames' to represent the
27+
#user-given alias
28+
sampleNames_originalNames <- getSampleNamesLike(sampleNameInput)
29+
sampleNames_originalNames$originalNames <-
30+
names(sampleNameInput)[match(sampleNames_originalNames$originalNames,
31+
sampleNameInput)]
32+
33+
sampleNames <- sampleNames_originalNames$sampleNames
34+
names(sampleNames) <- sampleNames_originalNames$originalNames
35+
1636
# check that all samples processed with the same reference genome
17-
stopifnot(unique(getRefGenome(sampleName)$refGenome) == referenceGenome)
18-
stopifnot(all(setNameExists(sampleName)))
37+
stopifnot(unique(getRefGenome(sampleNames)$refGenome) == referenceGenome)
38+
stopifnot(all(setNameExists(sampleNames)))
1939

2040
reference_genome_sequence <- get_reference_genome(referenceGenome)
21-
sites_mrcs <- get_integration_sites_with_mrcs(sampleName, reference_genome_sequence)
41+
sites_mrcs <- get_integration_sites_with_mrcs(sampleNames, reference_genome_sequence)
2242

2343
# TODO: populate from local database, at present pulled from UCSC web-site
2444
refSeq_genes <- getRefSeq_genes(referenceGenome)
@@ -31,15 +51,15 @@ oncogenes <- get_oncogene_from_file(oncogene_file)
3151
# END annotation loading
3252

3353
sites_mrcs <- getSitesInFeature(
34-
sites_mrcs, refSeq_genes, "within_refSeq_gene", asBool=TRUE)
54+
sites_mrcs, refSeq_genes, "within_refSeq_gene", asBool=TRUE)
3555

3656
# is there oncogene closer than 50k
3757
refSeq_gene_symbols <- refSeq_genes$name2
3858
is_refSeq_oncogene <- is_onco_gene(refSeq_gene_symbols, oncogenes)
3959
refSeq_oncogene <- refSeq_genes[is_refSeq_oncogene]
4060
sites_mrcs <- getNearestFeature(
41-
sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco")
42-
#sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco.100k")
61+
sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco")
62+
#sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco.100k")
4363
sites_mrcs$onco.100k <- abs(sites_mrcs$oncoDist) <= 50000
4464
sites_mrcs$oncoDist <- NULL
4565
# end oncogene
@@ -48,35 +68,39 @@ sites_mrcs <- getPositionalValuesOfFeature(sites_mrcs, refSeq_genes)
4868

4969
window_size_refSeq <- c("10k"=1e4, "100k"=1e5, "1M"=1e6)
5070
sites_mrcs <- getFeatureCounts(sites_mrcs, refSeq_genes, "refSeq_counts",
51-
width=window_size_refSeq)
71+
width=window_size_refSeq)
5272

5373
window_size_GC <- c("50"=50, "100"=100, "250"=250,
54-
"500"=500, "1k"=1000, "2k"=2000, "5k"=5000,
55-
"10k"=1e4, "25k"=2.5e4, "50k"=5e4, "100k"=1e5,
56-
"250k"=2.5e5, "500k"=5e5, "1M"=1e6)
74+
"500"=500, "1k"=1000, "2k"=2000, "5k"=5000,
75+
"10k"=1e4, "25k"=2.5e4, "50k"=5e4, "100k"=1e5,
76+
"250k"=2.5e5, "500k"=5e5, "1M"=1e6)
5777
sites_mrcs <- getGCpercentage(
58-
sites_mrcs, "GC", window_size_GC, reference_genome_sequence)
78+
sites_mrcs, "GC", window_size_GC, reference_genome_sequence)
5979

6080
window_size_CpG_counts <- c("2k"=2e3, "10k"=1e4)
6181
sites_mrcs <- getFeatureCounts(sites_mrcs, CpG_islands, "CpG_counts",
62-
width=window_size_CpG_counts)
82+
width=window_size_CpG_counts)
6383

6484
window_size_CpG_density <- c("10k"=1e4, "100k"=1e5, "1M"=1e6)
6585
sites_mrcs <- getFeatureCounts(sites_mrcs, CpG_islands, "CpG_density",
66-
width=window_size_CpG_density)
86+
width=window_size_CpG_density)
6787
sites_mrcs <- from_counts_to_density(sites_mrcs,
68-
"CpG_density", window_size_CpG_density)
88+
"CpG_density", window_size_CpG_density)
6989

7090
window_size_DNaseI <- c("1k"=1e3, "10k"=1e4, "100k"=1e5, "1M"=1e6)
7191
sites_mrcs <- getFeatureCounts(sites_mrcs, DNaseI, "DNaseI_count",
72-
width=window_size_DNaseI)
92+
width=window_size_DNaseI)
7393

7494
sites_mrcs <- as.data.frame(sites_mrcs)
7595

7696
annotation_columns <- get_annotation_columns(sites_mrcs)
7797

98+
#restore ordering of values in sites_mrcs$sampleName column so that heatmap
99+
#order reflects sample input order
100+
sites_mrcs$sampleName <- factor(sites_mrcs$sampleName, levels=names(sampleNameInput))
101+
78102
rset <- with(sites_mrcs, ROC.setup(
79-
rep(TRUE, nrow(sites_mrcs)), type, siteID, sampleName))
103+
rep(TRUE, nrow(sites_mrcs)), type, siteID, sampleName))
80104
roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites_mrcs)
81105
ROCSVG(roc.res, heat_map_result_dir)
82106

utils.R

Lines changed: 114 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,54 @@
11
getRefSeq_genes <- function(reference_genome) {
2-
refSeq <- makeGRanges(
3-
getUCSCtable("refGene", "RefSeq Genes", freeze=reference_genome),
4-
freeze=reference_genome
5-
)
2+
refSeq <- makeGRanges(
3+
getUCSCtable("refGene", "RefSeq Genes", freeze=reference_genome),
4+
freeze=reference_genome
5+
)
66
}
77

88
getCpG_islands <- function(reference_genome) {
9-
cpg <- getUCSCtable("cpgIslandExt", "CpG Islands", freeze=reference_genome)
10-
cpg$strand <- "*" # either strand
11-
makeGRanges(cpg, freeze=reference_genome, chromCol='chrom')
9+
cpg <- getUCSCtable("cpgIslandExt", "CpG Islands", freeze=reference_genome)
10+
cpg$strand <- "*" # either strand
11+
makeGRanges(cpg, freeze=reference_genome, chromCol='chrom')
1212
}
1313

1414
getDNaseI <- function(reference_genome) {
15-
DNaseI <- getUCSCtable("wgEncodeRegDnaseClustered",
16-
"DNase Clusters", freeze=reference_genome)
17-
DNaseI$strand <- "*" # either strand
18-
makeGRanges(DNaseI, freeze=reference_genome, chromCol='chrom')
15+
DNaseI <- getUCSCtable("wgEncodeRegDnaseClustered",
16+
"DNase Clusters", freeze=reference_genome)
17+
DNaseI$strand <- "*" # either strand
18+
makeGRanges(DNaseI, freeze=reference_genome, chromCol='chrom')
1919
}
2020

21-
get_integration_sites_with_mrcs <- function(sampleName, refGenomeSeq) {
22-
sites <- getUniqueSites(sampleName)
23-
sites$type <- "insertion"
24-
25-
mrcs <- getMRCs(sampleName)
26-
mrcs$type <- "match"
21+
get_integration_sites_with_mrcs <- function(sampleNames, refGenomeSeq) {
2722

28-
sites_mrcs <- rbind(sites, mrcs)
23+
sampleNames <- split(sampleNames, names(sampleNames))
2924

30-
sites_mrcs <- makeGRanges(sites_mrcs, soloStart=TRUE,
31-
chromCol='chr', strandCol='strand', startCol='position')
25+
sites_mrcs <- lapply(seq(length(sampleNames)), function(x){
26+
samplesToGet <- sampleNames[[x]] #only for this scope
27+
alias <- names(sampleNames)[x]
3228

33-
#seqinfo needs to be exact here or trimming will be wrong
34-
newSeqInfo <- seqinfo(refGenomeSeq)
35-
seqInfo.new2old <- match(seqnames(newSeqInfo),
36-
seqnames(seqinfo(sites_mrcs)))
37-
seqinfo(sites_mrcs, new2old=seqInfo.new2old) <- newSeqInfo
29+
sites <- getUniqueSites(samplesToGet)
30+
sites$type <- "insertion"
31+
sites$sampleName <- alias
3832

39-
sites_mrcs
33+
mrcs <- getMRCs(samplesToGet)
34+
mrcs$type <- "match"
35+
mrcs$sampleName <- alias
36+
37+
rbind(sites, mrcs)
38+
})
39+
40+
sites_mrcs <- do.call(rbind, sites_mrcs)
41+
42+
sites_mrcs <- makeGRanges(sites_mrcs, soloStart=TRUE,
43+
chromCol='chr', strandCol='strand', startCol='position')
44+
45+
#seqinfo needs to be exact here or trimming will be wrong
46+
newSeqInfo <- seqinfo(refGenomeSeq)
47+
seqInfo.new2old <- match(seqnames(newSeqInfo),
48+
seqnames(seqinfo(sites_mrcs)))
49+
seqinfo(sites_mrcs, new2old=seqInfo.new2old) <- newSeqInfo
50+
51+
sites_mrcs
4052
}
4153

4254
#' return genome seq for human readable UCSC format
@@ -51,86 +63,86 @@ get_reference_genome <- function(reference_genome) {
5163
}
5264

5365
get_annotation_columns <- function(sites) {
54-
granges_column_names <- c("seqnames", "start", "end", "width", "strand")
55-
int_site_column_names <- c("siteID", "sampleName", "chr", "strand", "position")
56-
required_columns <- unique(c(
57-
granges_column_names, int_site_column_names, "type"))
58-
stopifnot(all(required_columns %in% names(sites)))
59-
setdiff(names(sites), required_columns)
66+
granges_column_names <- c("seqnames", "start", "end", "width", "strand")
67+
int_site_column_names <- c("siteID", "sampleName", "chr", "strand", "position")
68+
required_columns <- unique(c(
69+
granges_column_names, int_site_column_names, "type"))
70+
stopifnot(all(required_columns %in% names(sites)))
71+
setdiff(names(sites), required_columns)
6072
}
6173

6274
from_counts_to_density <- function(sites, column_prefix, window_size) {
63-
metadata <- mcols(sites)
64-
sapply(seq(window_size), function(i) {
65-
val <- window_size[i]
66-
name <- names(window_size)[i]
67-
column_name <- paste0(column_prefix, ".", name)
68-
metadata[[column_name]] <<- metadata[[column_name]]/val
69-
})
70-
mcols(sites) <- metadata
71-
sites
75+
metadata <- mcols(sites)
76+
sapply(seq(window_size), function(i) {
77+
val <- window_size[i]
78+
name <- names(window_size)[i]
79+
column_name <- paste0(column_prefix, ".", name)
80+
metadata[[column_name]] <<- metadata[[column_name]]/val
81+
})
82+
mcols(sites) <- metadata
83+
sites
7284
}
7385

7486
getPositionalValuesOfFeature <- function(sites, genomicData) {
75-
#### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
76-
## (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
77-
## when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
78-
## when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand
79-
80-
## prepare the new columns ##
81-
colnam <- paste("ref", c("left.pos", "right.pos", "left.strand", "right.strand",
82-
"start.pos", "end.pos", "gene.strand"), sep=".")
83-
mcols(sites)[colnam] <- NA
84-
85-
## add the respective columns as needed ##
86-
## beware: precede returns range which is following the query and
87-
## follow returns the range which is preceding the query!
88-
## so do a switcheroo in terms of extracting the start & stop ##
89-
left <- follow(sites, genomicData, ignore.strand=TRUE)
90-
left[is.na(left) | sites$within_refSeq_gene] <- NA
91-
rows <- na.omit(left)
92-
sites$ref.left.pos[!is.na(left)] <- end(genomicData[rows])
93-
sites$ref.left.strand[!is.na(left)] <- as.character(strand(genomicData[rows]))
94-
95-
right <- precede(sites, genomicData, ignore.strand=TRUE)
96-
right[is.na(right) | sites$within_refSeq_gene] <- NA
97-
rows <- na.omit(right)
98-
sites$ref.right.pos[!is.na(right)] <- start(genomicData[rows])
99-
sites$ref.right.strand[!is.na(right)] <- as.character(strand(genomicData[rows]))
100-
101-
inIt <- findOverlaps(sites, genomicData, ignore.strand=TRUE, select="arbitrary")
102-
inIt[is.na(inIt) | !sites$within_refSeq_gene] <- NA
103-
rows <- na.omit(inIt)
104-
sites$ref.start.pos[!is.na(inIt)] <- start(genomicData[rows])
105-
sites$ref.end.pos[!is.na(inIt)] <- end(genomicData[rows])
106-
sites$ref.gene.strand[!is.na(inIt)] <- as.character(strand(genomicData[rows]))
107-
108-
sites$boundary.dist <-
109-
eval(expression(pmin((ref.end.pos-position)/(ref.end.pos-ref.start.pos),
110-
(position-ref.start.pos)/(ref.end.pos-ref.start.pos),
111-
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
112-
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
113-
na.rm=T)), mcols(sites))
114-
115-
sites$start.dist <-
116-
eval(expression(pmin(ifelse(ref.gene.strand=="-",
117-
(ref.end.pos-position)/(ref.end.pos-ref.start.pos),
118-
(position-ref.start.pos)/(ref.end.pos-ref.start.pos)),
119-
ifelse(ref.right.strand=="-",
120-
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
121-
NA),
122-
ifelse(ref.left.strand=="+",
123-
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
124-
NA),na.rm=T)), mcols(sites))
125-
126-
sites$general.width <- eval(expression(pmin(ref.end.pos-ref.start.pos,
127-
ref.right.pos-ref.left.pos,na.rm=T)),
128-
mcols(sites))
129-
sites$gene.width <- eval(expression(ref.end.pos-ref.start.pos ), mcols(sites))
130-
131-
meta <- mcols(sites)
132-
meta <- meta[ , ! (names(meta) %in% colnam)]
133-
mcols(sites) <- meta
134-
135-
sites
87+
#### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
88+
## (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
89+
## when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
90+
## when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand
91+
92+
## prepare the new columns ##
93+
colnam <- paste("ref", c("left.pos", "right.pos", "left.strand", "right.strand",
94+
"start.pos", "end.pos", "gene.strand"), sep=".")
95+
mcols(sites)[colnam] <- NA
96+
97+
## add the respective columns as needed ##
98+
## beware: precede returns range which is following the query and
99+
## follow returns the range which is preceding the query!
100+
## so do a switcheroo in terms of extracting the start & stop ##
101+
left <- follow(sites, genomicData, ignore.strand=TRUE)
102+
left[is.na(left) | sites$within_refSeq_gene] <- NA
103+
rows <- na.omit(left)
104+
sites$ref.left.pos[!is.na(left)] <- end(genomicData[rows])
105+
sites$ref.left.strand[!is.na(left)] <- as.character(strand(genomicData[rows]))
106+
107+
right <- precede(sites, genomicData, ignore.strand=TRUE)
108+
right[is.na(right) | sites$within_refSeq_gene] <- NA
109+
rows <- na.omit(right)
110+
sites$ref.right.pos[!is.na(right)] <- start(genomicData[rows])
111+
sites$ref.right.strand[!is.na(right)] <- as.character(strand(genomicData[rows]))
112+
113+
inIt <- findOverlaps(sites, genomicData, ignore.strand=TRUE, select="arbitrary")
114+
inIt[is.na(inIt) | !sites$within_refSeq_gene] <- NA
115+
rows <- na.omit(inIt)
116+
sites$ref.start.pos[!is.na(inIt)] <- start(genomicData[rows])
117+
sites$ref.end.pos[!is.na(inIt)] <- end(genomicData[rows])
118+
sites$ref.gene.strand[!is.na(inIt)] <- as.character(strand(genomicData[rows]))
119+
120+
sites$boundary.dist <-
121+
eval(expression(pmin((ref.end.pos-position)/(ref.end.pos-ref.start.pos),
122+
(position-ref.start.pos)/(ref.end.pos-ref.start.pos),
123+
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
124+
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
125+
na.rm=T)), mcols(sites))
126+
127+
sites$start.dist <-
128+
eval(expression(pmin(ifelse(ref.gene.strand=="-",
129+
(ref.end.pos-position)/(ref.end.pos-ref.start.pos),
130+
(position-ref.start.pos)/(ref.end.pos-ref.start.pos)),
131+
ifelse(ref.right.strand=="-",
132+
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
133+
NA),
134+
ifelse(ref.left.strand=="+",
135+
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
136+
NA),na.rm=T)), mcols(sites))
137+
138+
sites$general.width <- eval(expression(pmin(ref.end.pos-ref.start.pos,
139+
ref.right.pos-ref.left.pos,na.rm=T)),
140+
mcols(sites))
141+
sites$gene.width <- eval(expression(ref.end.pos-ref.start.pos ), mcols(sites))
142+
143+
meta <- mcols(sites)
144+
meta <- meta[ , ! (names(meta) %in% colnam)]
145+
mcols(sites) <- meta
146+
147+
sites
136148
}

0 commit comments

Comments
 (0)