-
Notifications
You must be signed in to change notification settings - Fork 0
/
Test01_BeatAML_sc2.py
145 lines (124 loc) · 5.04 KB
/
Test01_BeatAML_sc2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 28 03:32:29 2020
@author: SRDhruba
"""
# In[ ]:
# %pip install scikit-survival synapseclient
# In[ ]:
# import getpass
# import synapseclient
# import synapseutils
# syn = synapseclient.Synapse()
# syn.login(input(prompt="Enter Synapse Username"), getpass.getpass("Enter Synapse Password"))
# downloaded_files = synapseutils.syncFromSynapse(syn, 'syn21212904', path='training')
# Now, load the data, and train a model!
# In[ ]:
# Auto-reload the custom python modules, for easy development.
# %load_ext autoreload
# %aimport input_manager
# %aimport model
# %autoreload 1
import os
os.chdir("C:\\Users\\SRDhruba\\Google Drive\\Study\\ECE 5332-009 - Topics in EE, Data Science\\BeatAML\\")
from beataml_example2_master.input_manager import RawInputs
raw_inputs = RawInputs('training')
raw_inputs.load()
# In[ ]:
from beataml_example2_master.input_manager import InputManager
im = InputManager(raw_inputs)
im.prepInputs()
im.printStats()
# In[ ]:
most_variant_genes = im.rnaseq_by_spec.var().nlargest(1000).index
inhibitors = im.aucs.columns
# In[ ]:
import numpy as np
import pandas as pd
from beataml_example2_master.model import makeFullFeatureVector
lab_ids = im.getAllSpecimens()
feature_matrix = pd.DataFrame()
for lab_id in lab_ids:
feature_vector = makeFullFeatureVector(im, most_variant_genes, inhibitors, lab_id)
feature_series = pd.Series(data = feature_vector, name = lab_id)
feature_matrix = feature_matrix.append(feature_series)
# In[ ]:
feature_means = feature_matrix.mean()
feature_stds = feature_matrix.std()
normed_features = (feature_matrix - feature_means) / feature_stds
normed_features = normed_features.fillna(0.0)
# In[ ]:
from sksurv.datasets import get_x_y
full_dataset = pd.read_csv('training/response.csv').set_index('lab_id').join(normed_features)
X, Y = get_x_y(full_dataset, ['vitalStatus', 'overallSurvival'], pos_label = 'Dead')
# In[ ]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# This package allows general elastic net tuning, but by setting
# l1_ratio=1, we restrict to LASSO.
regr = CoxnetSurvivalAnalysis(l1_ratio = 1, alpha_min_ratio = 0.05, max_iter = 3e5)
n_folds = 10
alphas = np.logspace(-1.3, 0, num = 100)
cv = KFold(n_splits = 5, shuffle = True, random_state = 328)
gcv = GridSearchCV(regr, {"alphas": [[v] for v in alphas]}, cv = cv).fit(X, Y)
#In[ ]:
import matplotlib.pyplot as plt
scores = gcv.cv_results_['mean_test_score']
scores_std = gcv.cv_results_['std_test_score']
std_error = scores_std / np.sqrt(n_folds)
plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)
plt.fill_between(alphas, scores + std_error, scores - std_error, alpha = 0.2)
plt.xlabel('alpha')
plt.ylabel('Concordance Index')
plt.axvline(gcv.best_params_['alphas'][0], color = 'r', ls = '--', label = ('Best alpha, CI = %0.3f' % gcv.best_score_))
plt.legend()
plt.title('Cross Validation Concordance Index, LASSO')
# In[ ]:
# TODO: Investigate which features were chosen...
pd.Series(gcv.best_estimator_.coef_[:, 0]).to_numpy().nonzero()
# Pickle the data for evaluation.
# We have:
#
# feature mean and variance (for computing z-score)
# feature weights
# most variable genes
# In[ ]:
np.save('beataml_example2_master/model/feature_means.npy', feature_means.to_numpy())
np.save('beataml_example2_master/model/feature_stds.npy', feature_stds.to_numpy())
np.save('beataml_example2_master/model/estimator_coef.npy', gcv.best_estimator_.coef_[:, 0])
np.save('beataml_example2_master/model/most_variant_genes.npy', most_variant_genes.to_numpy())
np.save('beataml_example2_master/model/inhibitors.npy', np.array(inhibitors))
# Run the model
# We run the model with
# SYNAPSE_PROJECT_ID=<your project ID>
# docker build -t docker.synapse.org/$SYNAPSE_PROJECT_ID/sc2_model .
# docker run \
# -v "$PWD/training/:/input/" \
# -v "$PWD/output:/output/" \
# docker.synapse.org/$SYNAPSE_PROJECT_ID/sc2_model
# Maybe push to Synapse.
# docker login docker.synapse.org
# docker push docker.synapse.org/$SYNAPSE_PROJECT_ID/sc2_model
# Note: you must pass the Synapse certified user test before you can push to the Synapse docker hub.
# Look at predictions vs goldstandard for training data
# Assumes predictions are in output/predictions.csv. Note that the performance on training dataset is going to be better than on the leaderboard data. Therefore, this is a good test of formatting / sanity check, but not of predictive performance.
# In[ ]:
from sksurv.metrics import concordance_index_censored
groundtruth = pd.read_csv('training/response.csv').set_index('lab_id')
predictions = pd.read_csv('beataml_example2_master/output/predictions.csv').set_index('lab_id')
data = groundtruth.join(predictions)
# data = data[data.vitalStatus == 'Dead']
cindex = concordance_index_censored(
data.vitalStatus == 'Dead', data.overallSurvival, -data.survival)[0]
print(cindex)
# In[ ]:
import seaborn
seaborn.scatterplot(
x='overallSurvival',
y='survival',
data=data,
hue='vitalStatus',
alpha=1)
plt.title('SC2 baseline predictor')