1
1
getRefSeq_genes <- function (reference_genome ) {
2
- refSeq <- makeGRanges(
3
- getUCSCtable(" refGene" , " RefSeq Genes" , freeze = reference_genome ),
4
- freeze = reference_genome
5
- )
2
+ refSeq <- makeGRanges(
3
+ getUCSCtable(" refGene" , " RefSeq Genes" , freeze = reference_genome ),
4
+ freeze = reference_genome
5
+ )
6
6
}
7
7
8
8
getCpG_islands <- function (reference_genome ) {
9
- cpg <- getUCSCtable(" cpgIslandExt" , " CpG Islands" , freeze = reference_genome )
10
- cpg $ strand <- " *" # either strand
11
- makeGRanges(cpg , freeze = reference_genome , chromCol = ' chrom' )
9
+ cpg <- getUCSCtable(" cpgIslandExt" , " CpG Islands" , freeze = reference_genome )
10
+ cpg $ strand <- " *" # either strand
11
+ makeGRanges(cpg , freeze = reference_genome , chromCol = ' chrom' )
12
12
}
13
13
14
14
getDNaseI <- function (reference_genome ) {
15
- DNaseI <- getUCSCtable(" wgEncodeRegDnaseClustered" ,
16
- " DNase Clusters" , freeze = reference_genome )
17
- DNaseI $ strand <- " *" # either strand
18
- makeGRanges(DNaseI , freeze = reference_genome , chromCol = ' chrom' )
15
+ DNaseI <- getUCSCtable(" wgEncodeRegDnaseClustered" ,
16
+ " DNase Clusters" , freeze = reference_genome )
17
+ DNaseI $ strand <- " *" # either strand
18
+ makeGRanges(DNaseI , freeze = reference_genome , chromCol = ' chrom' )
19
19
}
20
20
21
- get_integration_sites_with_mrcs <- function (sampleName , refGenomeSeq ) {
22
- sites <- getUniqueSites(sampleName )
23
- sites $ type <- " insertion"
24
-
25
- mrcs <- getMRCs(sampleName )
26
- mrcs $ type <- " match"
21
+ get_integration_sites_with_mrcs <- function (sampleNames , refGenomeSeq ) {
27
22
28
- sites_mrcs <- rbind( sites , mrcs )
23
+ sampleNames <- split( sampleNames , names( sampleNames ) )
29
24
30
- sites_mrcs <- makeGRanges(sites_mrcs , soloStart = TRUE ,
31
- chromCol = ' chr' , strandCol = ' strand' , startCol = ' position' )
25
+ sites_mrcs <- lapply(seq(length(sampleNames )), function (x ){
26
+ samplesToGet <- sampleNames [[x ]] # only for this scope
27
+ alias <- names(sampleNames )[x ]
32
28
33
- # seqinfo needs to be exact here or trimming will be wrong
34
- newSeqInfo <- seqinfo(refGenomeSeq )
35
- seqInfo.new2old <- match(seqnames(newSeqInfo ),
36
- seqnames(seqinfo(sites_mrcs )))
37
- seqinfo(sites_mrcs , new2old = seqInfo.new2old ) <- newSeqInfo
29
+ sites <- getUniqueSites(samplesToGet )
30
+ sites $ type <- " insertion"
31
+ sites $ sampleName <- alias
38
32
39
- sites_mrcs
33
+ mrcs <- getMRCs(samplesToGet )
34
+ mrcs $ type <- " match"
35
+ mrcs $ sampleName <- alias
36
+
37
+ rbind(sites , mrcs )
38
+ })
39
+
40
+ sites_mrcs <- do.call(rbind , sites_mrcs )
41
+
42
+ sites_mrcs <- makeGRanges(sites_mrcs , soloStart = TRUE ,
43
+ chromCol = ' chr' , strandCol = ' strand' , startCol = ' position' )
44
+
45
+ # seqinfo needs to be exact here or trimming will be wrong
46
+ newSeqInfo <- seqinfo(refGenomeSeq )
47
+ seqInfo.new2old <- match(seqnames(newSeqInfo ),
48
+ seqnames(seqinfo(sites_mrcs )))
49
+ seqinfo(sites_mrcs , new2old = seqInfo.new2old ) <- newSeqInfo
50
+
51
+ sites_mrcs
40
52
}
41
53
42
54
# ' return genome seq for human readable UCSC format
@@ -51,86 +63,86 @@ get_reference_genome <- function(reference_genome) {
51
63
}
52
64
53
65
get_annotation_columns <- function (sites ) {
54
- granges_column_names <- c(" seqnames" , " start" , " end" , " width" , " strand" )
55
- int_site_column_names <- c(" siteID" , " sampleName" , " chr" , " strand" , " position" )
56
- required_columns <- unique(c(
57
- granges_column_names , int_site_column_names , " type" ))
58
- stopifnot(all(required_columns %in% names(sites )))
59
- setdiff(names(sites ), required_columns )
66
+ granges_column_names <- c(" seqnames" , " start" , " end" , " width" , " strand" )
67
+ int_site_column_names <- c(" siteID" , " sampleName" , " chr" , " strand" , " position" )
68
+ required_columns <- unique(c(
69
+ granges_column_names , int_site_column_names , " type" ))
70
+ stopifnot(all(required_columns %in% names(sites )))
71
+ setdiff(names(sites ), required_columns )
60
72
}
61
73
62
74
from_counts_to_density <- function (sites , column_prefix , window_size ) {
63
- metadata <- mcols(sites )
64
- sapply(seq(window_size ), function (i ) {
65
- val <- window_size [i ]
66
- name <- names(window_size )[i ]
67
- column_name <- paste0(column_prefix , " ." , name )
68
- metadata [[column_name ]] <<- metadata [[column_name ]]/ val
69
- })
70
- mcols(sites ) <- metadata
71
- sites
75
+ metadata <- mcols(sites )
76
+ sapply(seq(window_size ), function (i ) {
77
+ val <- window_size [i ]
78
+ name <- names(window_size )[i ]
79
+ column_name <- paste0(column_prefix , " ." , name )
80
+ metadata [[column_name ]] <<- metadata [[column_name ]]/ val
81
+ })
82
+ mcols(sites ) <- metadata
83
+ sites
72
84
}
73
85
74
86
getPositionalValuesOfFeature <- function (sites , genomicData ) {
75
- # ### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
76
- # # (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
77
- # # when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
78
- # # when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand
79
-
80
- # # prepare the new columns ##
81
- colnam <- paste(" ref" , c(" left.pos" , " right.pos" , " left.strand" , " right.strand" ,
82
- " start.pos" , " end.pos" , " gene.strand" ), sep = " ." )
83
- mcols(sites )[colnam ] <- NA
84
-
85
- # # add the respective columns as needed ##
86
- # # beware: precede returns range which is following the query and
87
- # # follow returns the range which is preceding the query!
88
- # # so do a switcheroo in terms of extracting the start & stop ##
89
- left <- follow(sites , genomicData , ignore.strand = TRUE )
90
- left [is.na(left ) | sites $ within_refSeq_gene ] <- NA
91
- rows <- na.omit(left )
92
- sites $ ref.left.pos [! is.na(left )] <- end(genomicData [rows ])
93
- sites $ ref.left.strand [! is.na(left )] <- as.character(strand(genomicData [rows ]))
94
-
95
- right <- precede(sites , genomicData , ignore.strand = TRUE )
96
- right [is.na(right ) | sites $ within_refSeq_gene ] <- NA
97
- rows <- na.omit(right )
98
- sites $ ref.right.pos [! is.na(right )] <- start(genomicData [rows ])
99
- sites $ ref.right.strand [! is.na(right )] <- as.character(strand(genomicData [rows ]))
100
-
101
- inIt <- findOverlaps(sites , genomicData , ignore.strand = TRUE , select = " arbitrary" )
102
- inIt [is.na(inIt ) | ! sites $ within_refSeq_gene ] <- NA
103
- rows <- na.omit(inIt )
104
- sites $ ref.start.pos [! is.na(inIt )] <- start(genomicData [rows ])
105
- sites $ ref.end.pos [! is.na(inIt )] <- end(genomicData [rows ])
106
- sites $ ref.gene.strand [! is.na(inIt )] <- as.character(strand(genomicData [rows ]))
107
-
108
- sites $ boundary.dist <-
109
- eval(expression(pmin((ref.end.pos - position )/ (ref.end.pos - ref.start.pos ),
110
- (position - ref.start.pos )/ (ref.end.pos - ref.start.pos ),
111
- (ref.right.pos - position )/ (ref.right.pos - ref.left.pos ),
112
- (position - ref.left.pos )/ (ref.right.pos - ref.left.pos ),
113
- na.rm = T )), mcols(sites ))
114
-
115
- sites $ start.dist <-
116
- eval(expression(pmin(ifelse(ref.gene.strand == " -" ,
117
- (ref.end.pos - position )/ (ref.end.pos - ref.start.pos ),
118
- (position - ref.start.pos )/ (ref.end.pos - ref.start.pos )),
119
- ifelse(ref.right.strand == " -" ,
120
- (ref.right.pos - position )/ (ref.right.pos - ref.left.pos ),
121
- NA ),
122
- ifelse(ref.left.strand == " +" ,
123
- (position - ref.left.pos )/ (ref.right.pos - ref.left.pos ),
124
- NA ),na.rm = T )), mcols(sites ))
125
-
126
- sites $ general.width <- eval(expression(pmin(ref.end.pos - ref.start.pos ,
127
- ref.right.pos - ref.left.pos ,na.rm = T )),
128
- mcols(sites ))
129
- sites $ gene.width <- eval(expression(ref.end.pos - ref.start.pos ), mcols(sites ))
130
-
131
- meta <- mcols(sites )
132
- meta <- meta [ , ! (names(meta ) %in% colnam )]
133
- mcols(sites ) <- meta
134
-
135
- sites
87
+ # ### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
88
+ # # (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
89
+ # # when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
90
+ # # when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand
91
+
92
+ # # prepare the new columns ##
93
+ colnam <- paste(" ref" , c(" left.pos" , " right.pos" , " left.strand" , " right.strand" ,
94
+ " start.pos" , " end.pos" , " gene.strand" ), sep = " ." )
95
+ mcols(sites )[colnam ] <- NA
96
+
97
+ # # add the respective columns as needed ##
98
+ # # beware: precede returns range which is following the query and
99
+ # # follow returns the range which is preceding the query!
100
+ # # so do a switcheroo in terms of extracting the start & stop ##
101
+ left <- follow(sites , genomicData , ignore.strand = TRUE )
102
+ left [is.na(left ) | sites $ within_refSeq_gene ] <- NA
103
+ rows <- na.omit(left )
104
+ sites $ ref.left.pos [! is.na(left )] <- end(genomicData [rows ])
105
+ sites $ ref.left.strand [! is.na(left )] <- as.character(strand(genomicData [rows ]))
106
+
107
+ right <- precede(sites , genomicData , ignore.strand = TRUE )
108
+ right [is.na(right ) | sites $ within_refSeq_gene ] <- NA
109
+ rows <- na.omit(right )
110
+ sites $ ref.right.pos [! is.na(right )] <- start(genomicData [rows ])
111
+ sites $ ref.right.strand [! is.na(right )] <- as.character(strand(genomicData [rows ]))
112
+
113
+ inIt <- findOverlaps(sites , genomicData , ignore.strand = TRUE , select = " arbitrary" )
114
+ inIt [is.na(inIt ) | ! sites $ within_refSeq_gene ] <- NA
115
+ rows <- na.omit(inIt )
116
+ sites $ ref.start.pos [! is.na(inIt )] <- start(genomicData [rows ])
117
+ sites $ ref.end.pos [! is.na(inIt )] <- end(genomicData [rows ])
118
+ sites $ ref.gene.strand [! is.na(inIt )] <- as.character(strand(genomicData [rows ]))
119
+
120
+ sites $ boundary.dist <-
121
+ eval(expression(pmin((ref.end.pos - position )/ (ref.end.pos - ref.start.pos ),
122
+ (position - ref.start.pos )/ (ref.end.pos - ref.start.pos ),
123
+ (ref.right.pos - position )/ (ref.right.pos - ref.left.pos ),
124
+ (position - ref.left.pos )/ (ref.right.pos - ref.left.pos ),
125
+ na.rm = T )), mcols(sites ))
126
+
127
+ sites $ start.dist <-
128
+ eval(expression(pmin(ifelse(ref.gene.strand == " -" ,
129
+ (ref.end.pos - position )/ (ref.end.pos - ref.start.pos ),
130
+ (position - ref.start.pos )/ (ref.end.pos - ref.start.pos )),
131
+ ifelse(ref.right.strand == " -" ,
132
+ (ref.right.pos - position )/ (ref.right.pos - ref.left.pos ),
133
+ NA ),
134
+ ifelse(ref.left.strand == " +" ,
135
+ (position - ref.left.pos )/ (ref.right.pos - ref.left.pos ),
136
+ NA ),na.rm = T )), mcols(sites ))
137
+
138
+ sites $ general.width <- eval(expression(pmin(ref.end.pos - ref.start.pos ,
139
+ ref.right.pos - ref.left.pos ,na.rm = T )),
140
+ mcols(sites ))
141
+ sites $ gene.width <- eval(expression(ref.end.pos - ref.start.pos ), mcols(sites ))
142
+
143
+ meta <- mcols(sites )
144
+ meta <- meta [ , ! (names(meta ) %in% colnam )]
145
+ mcols(sites ) <- meta
146
+
147
+ sites
136
148
}
0 commit comments