-
Notifications
You must be signed in to change notification settings - Fork 4
/
1.StemIndexTrain.R
114 lines (93 loc) · 3.31 KB
/
1.StemIndexTrain.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library(gelnet)
library(dplyr)
library(gdata)
library(biomaRt)
library(synapseClient)
#mRNAsi
## Section: train the RNA-seq based signature (see main.train below).
## Log in to Synapse first: main.train() pulls sample metadata from a Synapse
## table via synTableQuery(). Presumably credentials come from the local
## Synapse configuration, since no arguments are passed -- confirm.
synapseLogin()
## Maps gene identifiers to HUGO (HGNC) symbols through the Ensembl BioMart.
##
## v       - vector of gene identifiers to look up
## srcType - BioMart attribute/filter name describing the type of IDs in v
##
## Returns the two-column table produced by getBM(): source ID first, HGNC
## symbol second, with rows lacking a symbol removed. Stops with an error
## when the query yields no rows at all.
genes2hugo <- function( v, srcType = "ensembl_gene_id" )
{
  ## Open the human gene dataset of the Ensembl mart
  mart <- biomaRt::useMart( "ENSEMBL_MART_ENSEMBL", host="www.ensembl.org", dataset="hsapiens_gene_ensembl" )

  ## Ask for (source ID, HGNC symbol) pairs, filtering on the supplied IDs
  mapping <- biomaRt::getBM( attributes=c(srcType, "hgnc_symbol"), filters=srcType, values=v, mart=mart )

  ## An empty result means nothing could be translated
  if( nrow(mapping) < 1 ) stop( "No IDs mapped successfully" )

  ## Discard rows whose symbol is the empty string; %in% never yields NA,
  ## so rows with a missing symbol are retained
  has.symbol <- !( mapping[,2] %in% "" )
  if( !all(has.symbol) ) mapping <- mapping[has.symbol,]

  ## Sanity check: every returned source ID must be one we asked for
  stopifnot( all( mapping[,1] %in% v ) )
  mapping
}
## Trains the one-class stemness signature on the PCBC RNA-seq data, writes the
## model weights to disk and evaluates the model by leave-one-out
## cross-validation over the stem cell ("SC") samples.
##
## fnOut   - path of the tab-delimited file that receives the model weights
## fnGenes - optional path to a headerless file of gene IDs; when supplied the
##           expression matrix is restricted to the genes they map to
##
## Returns a numeric vector holding one AUC value per SC sample.
## Stops with an error when fewer than two SC samples are available.
main.train <- function( fnOut = "pcbc-stemsig.tsv", fnGenes = NULL )
{
  ## Load RNAseq data
  # synRNA <- synGet( "syn2701943", downloadLocation = "/data/PCBC" )
  # X <- read.delim( synRNA@filePath ) %>%
  X <- read.delim( "PCBC/rnaseq_norm.tsv" ) %>%
    tibble::column_to_rownames( "tracking_id" ) %>%
    as.matrix()

  ## Retrieve metadata; UIDs use "-" where the matrix column names use "."
  synMeta <- synTableQuery( "SELECT UID, Diffname_short FROM syn3156503" )
  Y <- synMeta@values %>%
    mutate( UID = gsub("-", ".", UID) ) %>%
    tibble::column_to_rownames( "UID" )

  ## Retrieve the labels from the metadata, ordered like the columns of X.
  ## Presumably Y holds only Diffname_short at this point, so the row lookup
  ## collapses to a plain vector -- confirm against the Synapse table.
  y <- Y[colnames(X),]
  names(y) <- colnames(X)

  ## Fix the missing labels by hand
  y["SC11.014BEB.133.5.6.11"] <- "EB"
  y["SC12.039ECTO.420.436.92.16"] <- "ECTO"

  ## Drop the splice form ID from the gene names (keep the part before the
  ## first dot)
  rownames(X) <- vapply( strsplit( rownames(X), "\\." ), "[[", character(1), 1 )

  ## Map Ensembl IDs to HUGO; drop = FALSE keeps X a matrix even if only a
  ## single gene survives the mapping
  V <- genes2hugo( rownames(X) )
  X <- X[V[,1], , drop = FALSE]
  rownames(X) <- V[,2]

  ## Reduce the gene set to the provided list (if applicable)
  if( !is.null( fnGenes ) )
  {
    vGenes <- read.delim( fnGenes, header=FALSE ) %>% as.matrix() %>% drop()
    ## NOTE(review): newer Ensembl releases appear to name this attribute
    ## "entrezgene_id" -- confirm against the BioMart version in use
    VE <- genes2hugo( vGenes, "entrezgene" )
    X <- X[intersect( rownames(X), VE[,2] ), , drop = FALSE]
  }

  ## Mean-center the data: m has one entry per gene (row), and R's
  ## column-major recycling subtracts m[i] from every value in row i
  m <- apply( X, 1, mean )
  X <- X - m

  ## Identify stem cell samples. A logical mask (NA labels count as non-SC)
  ## avoids the X[,-integer(0)] pitfall, which would select no columns at all
  is.sc <- !is.na(y) & y == "SC"
  n.sc <- sum(is.sc)
  if( n.sc < 2 )
    stop( "At least two SC samples are required for training and leave-one-out cross-validation" )
  X.tr <- X[, is.sc, drop = FALSE]
  X.bk <- X[, !is.sc, drop = FALSE]

  ## Train a one-class model (NULL response; the 0 and 1 are presumably the
  ## L1 and L2 penalty weights -- see the gelnet documentation)
  mm <- gelnet( t(X.tr), NULL, 0, 1 )

  ## Store the signature to a file
  write.table(mm$w, file = fnOut, sep = "\t", quote = FALSE, col.names = FALSE)

  ## Perform leave-one-out cross-validation
  auc <- rep( NA_real_, n.sc )
  for( i in seq_len(n.sc) )
  {
    ## Train a model on non-left-out data
    X1 <- X.tr[, -i, drop = FALSE]
    m1 <- gelnet( t(X1), NULL, 0, 1 )

    ## Score the left-out sample against the background
    s.bk <- apply( X.bk, 2, function(z) {cor( m1$w, z, method="spearman" )} )
    s1 <- cor( m1$w, X.tr[,i], method="spearman" )

    ## AUC = P( left-out sample is scored above the background )
    auc[i] <- sum( s1 > s.bk ) / length(s.bk)
    cat( "Current AUC: ", auc[i], "\n" )
    ## Running average over the folds completed so far
    cat( "Average AUC: ", mean(auc[seq_len(i)]), "\n" )
  }

  auc
}
## Run the mRNA training with its default arguments: the signature is written
## to "pcbc-stemsig.tsv" and the returned AUC vector is not assigned.
main.train()
#mDNAsi
## Second signature: same one-class gelnet training, applied to the data
## stored in the two .Rda files below. The code relies on them providing the
## objects pcbc.pd.f (sample annotation) and pcbc.data (feature-by-sample data).
load(file = "pcbc.pd.f.Rda")
load(file = "pcbc.data.Rda")

## Center each row of the data on its own mean
m <- apply(pcbc.data, MARGIN = 1, FUN = mean)
pcbc.data.2 <- pcbc.data - m

## Partition the annotation rows into stem cell (SC) and all other samples
M1_smp <- pcbc.pd.f[is.element(pcbc.pd.f$Diffname_short, "SC"), ]   # SC
M2_smp <- pcbc.pd.f[!is.element(pcbc.pd.f$Diffname_short, "SC"), ]  # non-SC

## Pick the matching data columns by sample UID
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)]  # 44 samples (per original note)
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)]  # 55 samples (per original note)

## Fit the model on the SC samples only; a NULL response selects the
## one-class task
mm <- gelnet(t(X.tr), NULL, 0, 1)

## Persist the fitted model object under the name "mm"
save(list = "mm", file = "pcbc-stemsig.p219.Rda")