-
Notifications
You must be signed in to change notification settings - Fork 4
/
LSDDbiclustering.R
137 lines (116 loc) · 4.87 KB
/
LSDDbiclustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Author: Ricardo Cachucho
# Edited by Kai on March 2016
#
# Update: add progressBar option
###########################################################################################################
LSDDbiclustering <- function(data, segments, delta, k, progressBar=FALSE) {
# use information from segmented data
aSegStart <- segments$segStart
aSegEnd <- segments$segEnd
LSDDsigma <- segments$sigma
LSDDlambda <- segments$lambda
# initialization
segIndex <- 1:length(aSegStart)
colIndex <- 1:ncol(data)
rowIndex <- 1:nrow(data)
newSegIndex <- segIndex
newColIndex <- colIndex
newRowIndex <- rowIndex
nClusters <- 1
scores <- c()
Bicluster <- setClass(
# Set the name for the class
"bicluster",
# Define the slots
slots = c(
Number = "numeric",
RowxNumber = "matrix",
NumberxCol = "matrix",
rRemoved = "list",
newRowIndex = "list"
),
# Set the default values for the slots. (optional)
prototype=list(
Number = c(0),
RowxNumber = matrix(data=FALSE, nrow = length(segIndex), ncol = k),
NumberxCol = matrix(data=FALSE, nrow = k, ncol = length(colIndex))
)
)
output <- Bicluster()
# first iteration
LSDDscores <- matrix(data=NA, ncol=ncol(data), nrow=length(aSegStart))
for(j in colIndex)
LSDDscores[segIndex,j] <- sapply(segIndex, function(i) LSDDfast(matrix(data[aSegStart[i]:aSegEnd[i],j], nrow=1, ncol=length(aSegStart[i]:aSegEnd[i])), matrix(data[,j], nrow=1, ncol=length(data[,j])), sigma=LSDDsigma[j], lambda=LSDDlambda[j]))
# check for stop policy
score <- mean(LSDDscores[segIndex,colIndex], na.rm=FALSE)
#print(score)
while(length(newSegIndex) != 0 && nClusters <= k) {
# bookeeping
sRemoved <- c()
cRemoved <- c()
rRemoved <- c()
# calculate LSDD for all segments and variables part of bicluster
LSDDscores <- matrix(data=NA, ncol=ncol(data), nrow=length(aSegStart))
for(j in newColIndex)
LSDDscores[newSegIndex,j] <- sapply(newSegIndex, function(i) LSDDfast(matrix(data[aSegStart[i]:aSegEnd[i],j], nrow=1, ncol=length(aSegStart[i]:aSegEnd[i])), matrix(data[newRowIndex,j], nrow=1, ncol=length(data[newRowIndex,j])), sigma=LSDDsigma[j], lambda=LSDDlambda[j]))
score <- mean(LSDDscores[newSegIndex,newColIndex], na.rm=FALSE)
print(score)
# delta is between 0 and 1
delta <- delta * score
# an estimation of all work
if(progressBar){
allProgress <- min(length(newSegIndex), k)
nowProgress <- (nClusters - 1)/allProgress
}
# iterations to remove columns and segments
while(length(newSegIndex) != 0 && score > delta) {
# bookeeping scores
scores <- c(scores, score)
# find the row or columns with the largest LSDD average
rMeans <- rowMeans(LSDDscores, na.rm = TRUE)
sIndex <- which(rMeans == max(rMeans, na.rm=TRUE))
cMeans <- colMeans(LSDDscores, na.rm = TRUE)
cIndex <- which(cMeans == max(cMeans, na.rm=TRUE))
if(length(newColIndex) < 2 || rMeans[sIndex] >= cMeans[cIndex]) {
sRemoved <- c(sRemoved, sIndex)
rRemoved <- c(rRemoved, c(aSegStart[sIndex]:aSegEnd[sIndex]))
newSegIndex <- newSegIndex[-which(newSegIndex == sIndex)]
newRowIndex <- newRowIndex[-sapply(c(aSegStart[sIndex]:aSegEnd[sIndex]), function(i) which(newRowIndex == i))]
} else {
cRemoved <- c(cRemoved, cIndex)
newColIndex <- colIndex[-cRemoved]
}
# calculate LSDD for all segments and variables part of bicluster
LSDDscores[] <- NA
for(j in newColIndex)
LSDDscores[newSegIndex,j] <- sapply(newSegIndex, function(i) LSDDfast(matrix(data[aSegStart[i]:aSegEnd[i],j], nrow=1, ncol=length(aSegStart[i]:aSegEnd[i])), matrix(data[newRowIndex,j], nrow=1, ncol=length(data[newRowIndex,j])), sigma=LSDDsigma[j], lambda=LSDDlambda[j]))
score <- mean(LSDDscores[newSegIndex,newColIndex], na.rm=FALSE)
print(score)
# estimate stop time
if(progressBar){
scoreL <- length(scores)
if(scoreL > 1){
done <- scores[1] - scores[scoreL]
distance <- scores[1] - delta
value <- nowProgress + (1/allProgress)*(done/distance)
setProgress(value = value, detail = paste(percent(value),"done..."))
}
}
}
# output
output@RowxNumber[newSegIndex,nClusters] <- TRUE
output@NumberxCol[nClusters,newColIndex] <- TRUE
output@rRemoved[[nClusters]] <- rRemoved
output@newRowIndex[[nClusters]] <- newRowIndex
# book keeping
print(paste('number of clusters found:', nClusters))
nClusters <- nClusters + 1
newSegIndex <- sort(segIndex[unique(sRemoved)])
print(paste("number of segments remaining to be biclustered:", length(newSegIndex)))
newRowIndex <- sort(rowIndex[unique(rRemoved)])
newColIndex <- colIndex
}
# return
output@Number <- nClusters - 1
return(output)
}