-
Notifications
You must be signed in to change notification settings - Fork 0
/
RunMultiSPCARandomSampling.R
executable file
·168 lines (139 loc) · 8.75 KB
/
RunMultiSPCARandomSampling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
###################################################################################################################################
# Authors: Emilie Mathian
# This script runs the multi-sample spatial PCA developed by L. Shang and X. Zhou and adapted to whole slide images (WSI).
# This method takes as input a matrix of coded vectors of tiles from a WSI, generated by a trained neural network.
# These projections are then associated with a matrix of coordinates indicating the position of each coded vector,
# and therefore each tile, in the WSI, allowing spatial information to be incorporated into the model.
# This method has a quadratic cost in terms of memory and time, so the coded vectors are selected randomly to create the SpatialPCA object.
# The number of tiles selected can be specified using the argline ntiles command. Warning = from experience, if ntiles = 100 000,
# 300GB of RAM will be required; we know from experience that 50 000 elements are sufficient to produce a coherent latent space.
# This script produces the R SpatialPCA object that allows users to access the latent space, which can then be used to project new coded vectors.
# As well as a csv file of the projections obtained on the selected coded vectors.
#
# Contacts: [email protected]
# Rare cancer genomics teams, International Agency of Research on Cancer - WHO
####################################################################################################
print("#-------------------------------------- Install package ------------------------------------------------------------------------")
print(.libPaths())
.libPaths("/home/mathiane/R/x86_64-pc-linux-gnu-library/4.0")
packages <- c("stringr", "cluster", "Matrix","SpatialPCA", "optparse" , "tictoc" )
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
print("# --------------------------------------- Load libraies ------------------------------------------------------------------------")
library(rlang)
library(stringr)
library(cluster)
library(Matrix)
library(SpatialPCA)
library("optparse")
library(tictoc)
source('ImgSpatialPCAMultipleSamles.R')
environment(SpatialPCA_Multiple_Sample) <- asNamespace('SpatialPCA')
assignInNamespace("SpatialPCA_Multiple_Sample", SpatialPCA_Multiple_Sample, ns = "SpatialPCA")
source('ImgSpatialPCA.R')
environment(CreateSpatialPCAObject) <- asNamespace('SpatialPCA')
assignInNamespace("CreateSpatialPCAObject", CreateSpatialPCAObject, ns = "SpatialPCA")
#setwd("~/script_spatial_pca")
tic("total")
print("# --------------------------------- Command line arguments ------------------------------------------------------------------------")
option_list = list(
make_option(c("-n", "--n_tiles"), type="numeric",
help="Number of tiles selected for the PCA", metavar="number"),
make_option(c("-o", "--output_folder"), type="character",
help="Folder where to store the results", metavar="character"),
make_option(c("-p", "--path2projectors"), type="character",
help="Path to the csv file containing all the encoded vectors", metavar="character"),
make_option(c("-d", "--dimension_enc_vectors"), type="integer", default=128,
help="Dimension of encoded vectors")
)
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
n_tiles = opt$n_tiles
output_folder = opt$output_folder
dir.create(output_folder, showWarnings = FALSE)
path2projectors = opt$path2projectors
d_enc_vectors = opt$dimension_enc_vectors
# -------------------- My function -----------------------
correct_tiles_id <- function(df_enc_vector_c){
# Tiles are expected to have an id such as patientID_coordsX_coordsY.jpg
# This function garantee that tile ID respect the pattern mentionned above.
tiles_id <- unlist(lapply(df_enc_vector_c$img_id, function(x) paste(substr(x,1,7),
str_split(x, pattern ='_')[[1]][2],
str_split(x, pattern ='_')[[1]][3], sep= '_') ))
return(tiles_id)
}
print("# --------------------------------- Read scales encoded vectors ------------------------------------------------------------------------")
df_enc_vector <- read.csv(path2projectors)
colnames(df_enc_vector)[colnames(df_enc_vector) == "tne_id"] ="sample_id"
df_enc_vector <- df_enc_vector[!duplicated(df_enc_vector$img_id),]
df_enc_vector <- df_enc_vector[,2:ncol(df_enc_vector)]
print("#--------------------- Samples n number of tiles to build the PCA ---------------------------------------------------------")
select_tiles_v <- c(rep(0,nrow(df_enc_vector)-n_tiles), rep(1,n_tiles))
select_tiles_v <- sample(select_tiles_v)
df_enc_vector$TilesForPCA <- select_tiles_v
df_enc_vectors_to_pca <- df_enc_vector[df_enc_vector$TilesForPCA == 1,]
df_enc_vectors_to_proj <- df_enc_vector[df_enc_vector$TilesForPCA == 0,]
df_enc_vectors_to_pca$x <- unlist(lapply(df_enc_vectors_to_pca$img_id, function(x) str_split(x, pattern ='_')[[1]][2]))
df_enc_vectors_to_pca$y <- unlist(lapply(df_enc_vectors_to_pca$img_id, function(x) str_split(x, pattern ='_')[[1]][3]))
df_enc_vectors_to_pca$img_id_c <- correct_tiles_id(df_enc_vectors_to_pca)
rm(df_enc_vector)
print(head(df_enc_vectors_to_pca))
print("# -----------------------List of tables --------------------------------------------")
# Get all sample name
Samples_from_PCA <- df_enc_vectors_to_pca$sample_id
# Remove Samples with less than 3 tiles from spatial PCA
for (sampl in unique(Samples_from_PCA)){
if(nrow(df_enc_vectors_to_pca[df_enc_vectors_to_pca$sample_id == sampl,]) < 3){
c_sampl <- df_enc_vectors_to_pca[df_enc_vectors_to_pca$sample_id == sampl, ]
df_enc_vectors_to_proj <- rbind(df_enc_vectors_to_proj, c_sampl)
df_enc_vectors_to_pca <- df_enc_vectors_to_pca[df_enc_vectors_to_pca$sample_id != sampl, ]
}
}
# Create lists of data frames for coordinates and encoded vectors
Samples_from_PCA <- df_enc_vectors_to_pca$sample_id
coords_df_list_for_pca <- list()
encoded_vectors_df_list_for_pca <- list()
c = 1
for (ele in unique(Samples_from_PCA)){
# Coords data frame
df_enc_vector_c_for_PCA <- df_enc_vectors_to_pca[df_enc_vectors_to_pca$sample_id == ele,]
# Corrds matrices
coords_df_c_for_PCA <- data.frame(x_coord=as.numeric(df_enc_vector_c_for_PCA$x))
coords_df_c_for_PCA$y_coord <- as.numeric(df_enc_vector_c_for_PCA$y)
rownames(coords_df_c_for_PCA) = df_enc_vector_c_for_PCA$img_id_c
coords_df_c_for_PCA = as.matrix(coords_df_c_for_PCA)
coords_df_list_for_pca[[c]] <- coords_df_c_for_PCA
# Encoded vecotrs data frame
enc_val_for_pca = df_enc_vector_c_for_PCA[,1:d_enc_vectors]
enc_val_t_for_pca <- t(enc_val_for_pca)
enc_val_t_for_pca <- Matrix(nrow = nrow(enc_val_t_for_pca), ncol = ncol(enc_val_t_for_pca), data = enc_val_t_for_pca, sparse = TRUE)
colnames(enc_val_t_for_pca) <- df_enc_vector_c_for_PCA$img_id_c
rownames(enc_val_t_for_pca) <- colnames(enc_val_for_pca)[1:d_enc_vectors]
encoded_vectors_df_list_for_pca[[c]] <- enc_val_t_for_pca
c = c + 1
}
print("--------------------------------------- encoded_vectors_df_list_for_pca ------------------------------------------------------")
print(head(encoded_vectors_df_list_for_pca[[1]]))
print("--------------------------------------- coords_df_list_for_pca ------------------------------------------------------")
print(head(coords_df_list_for_pca[[1]]))
print("# -------------------------------------- Compute spatial PCA -------------------------------------")
LIBD_multi = SpatialPCA_Multiple_Sample(encoded_vectors_df_list_for_pca, coords_df_list_for_pca ,gene.type="spatial",sparkversion="spark",
numCores_spark=5, gene.number=d_enc_vectors, customGenelist=NULL, min.loctions = 20, min.features=20, bandwidth_common=0.1)
# Save spatial PCA in a R obj
save(LIBD_multi, file = paste(output_folder, "SpatialPCA_forWSI_Object.RData", sep='/'))
multi_spatial_pcs <- LIBD_multi$MultipleSample_SpatialPCA_object@SpatialPCs
multi_spatial_pcs <- t(multi_spatial_pcs)
colnames(multi_spatial_pcs)<-unlist(lapply(1:20, function(x) paste('axis', x, sep='_')))
multi_spatial_pcs <- data.frame(multi_spatial_pcs)
img_id <- unlist(lapply( rownames(multi_spatial_pcs), function(x) substr( x, str_locate(x, 'TNE')[[1]], nchar(x))))
rownames(multi_spatial_pcs) <- img_id
samples_id <- unlist(lapply(rownames(multi_spatial_pcs), function(x) str_split(x, '_')[[1]][1] ))
unique(samples_id)
multi_spatial_pcs$sample_id <- samples_id
multi_spatial_pcs <- merge(multi_spatial_pcs, df_enc_vectors_to_pca[,129:ncol(df_enc_vectors_to_pca)], by.x="row.names", by.y="img_id_c")
write.csv(multi_spatial_pcs, paste(output_folder, "spatial_pca_coords.csv", sep='/'))
rm(multi_spatial_pcs)
rm(df_enc_vectors_to_pca)
toc()