forked from chl8856/Dynamic-DeepHit
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8c6a803
Showing
11 changed files
with
5,944 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
1,946 changes: 1,946 additions & 0 deletions
1,946
data/.ipynb_checkpoints/pbc2_cleaned-checkpoint.csv
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
|
||
##### USER-DEFINED FUNCTIONS | ||
def f_get_Normalization(X, norm_mode): | ||
num_Patient, num_Feature = np.shape(X) | ||
|
||
if norm_mode == 'standard': #zero mean unit variance | ||
for j in range(num_Feature): | ||
if np.nanstd(X[:,j]) != 0: | ||
X[:,j] = (X[:,j] - np.nanmean(X[:, j]))/np.nanstd(X[:,j]) | ||
else: | ||
X[:,j] = (X[:,j] - np.nanmean(X[:, j])) | ||
elif norm_mode == 'normal': #min-max normalization | ||
for j in range(num_Feature): | ||
X[:,j] = (X[:,j] - np.nanmin(X[:,j]))/(np.nanmax(X[:,j]) - np.nanmin(X[:,j])) | ||
else: | ||
print("INPUT MODE ERROR!") | ||
|
||
return X | ||
|
||
|
||
def f_get_fc_mask1(meas_time, num_Event, num_Category): | ||
''' | ||
mask3 is required to get the contional probability (to calculate the denominator part) | ||
mask3 size is [N, num_Event, num_Category]. 1's until the last measurement time | ||
''' | ||
mask = np.zeros([np.shape(meas_time)[0], num_Event, num_Category]) # for denominator | ||
for i in range(np.shape(meas_time)[0]): | ||
mask[i, :, :int(meas_time[i, 0]+1)] = 1 # last measurement time | ||
|
||
return mask | ||
|
||
|
||
def f_get_fc_mask2(time, label, num_Event, num_Category): | ||
''' | ||
mask4 is required to get the log-likelihood loss | ||
mask4 size is [N, num_Event, num_Category] | ||
if not censored : one element = 1 (0 elsewhere) | ||
if censored : fill elements with 1 after the censoring time (for all events) | ||
''' | ||
mask = np.zeros([np.shape(time)[0], num_Event, num_Category]) # for the first loss function | ||
for i in range(np.shape(time)[0]): | ||
if label[i,0] != 0: #not censored | ||
mask[i,int(label[i,0]-1),int(time[i,0])] = 1 | ||
else: #label[i,2]==0: censored | ||
mask[i,:,int(time[i,0]+1):] = 1 #fill 1 until from the censoring time (to get 1 - \sum F) | ||
return mask | ||
|
||
|
||
def f_get_fc_mask3(time, meas_time, num_Category): | ||
''' | ||
mask5 is required calculate the ranking loss (for pair-wise comparision) | ||
mask5 size is [N, num_Category]. | ||
- For longitudinal measurements: | ||
1's from the last measurement to the event time (exclusive and inclusive, respectively) | ||
denom is not needed since comparing is done over the same denom | ||
- For single measurement: | ||
1's from start to the event time(inclusive) | ||
''' | ||
mask = np.zeros([np.shape(time)[0], num_Category]) # for the first loss function | ||
if np.shape(meas_time): #lonogitudinal measurements | ||
for i in range(np.shape(time)[0]): | ||
t1 = int(meas_time[i, 0]) # last measurement time | ||
t2 = int(time[i, 0]) # censoring/event time | ||
mask[i,(t1+1):(t2+1)] = 1 #this excludes the last measurement time and includes the event time | ||
else: #single measurement | ||
for i in range(np.shape(time)[0]): | ||
t = int(time[i, 0]) # censoring/event time | ||
mask[i,:(t+1)] = 1 #this excludes the last measurement time and includes the event time | ||
return mask | ||
|
||
|
||
|
||
##### TRANSFORMING DATA | ||
def f_construct_dataset(df, feat_list): | ||
''' | ||
id : patient indicator | ||
tte : time-to-event or time-to-censoring | ||
- must be synchronized based on the reference time | ||
times: time at which observations are measured | ||
- must be synchronized based on the reference time (i.e., times start from 0) | ||
label: event/censoring information | ||
- 0: censoring | ||
- 1: event type 1 | ||
- 2: event type 2 | ||
... | ||
''' | ||
|
||
grouped = df.groupby(['id']) | ||
id_list = pd.unique(df['id']) | ||
max_meas = np.max(grouped.count())[0] | ||
|
||
data = np.zeros([len(id_list), max_meas, len(feat_list)+1]) | ||
pat_info = np.zeros([len(id_list), 5]) | ||
|
||
for i, tmp_id in enumerate(id_list): | ||
tmp = grouped.get_group(tmp_id).reset_index(drop=True) | ||
|
||
pat_info[i,4] = tmp.shape[0] #number of measurement | ||
pat_info[i,3] = np.max(tmp['times']) #last measurement time | ||
pat_info[i,2] = tmp['label'][0] #cause | ||
pat_info[i,1] = tmp['tte'][0] #time_to_event | ||
pat_info[i,0] = tmp['id'][0] | ||
|
||
data[i, :int(pat_info[i, 4]), 1:] = tmp[feat_list] | ||
data[i, :int(pat_info[i, 4]-1), 0] = np.diff(tmp['times']) | ||
|
||
return pat_info, data | ||
|
||
|
||
def import_dataset(norm_mode = 'standard'): | ||
|
||
df_ = pd.read_csv('./data/pbc2_cleaned.csv') | ||
|
||
bin_list = ['drug', 'sex', 'ascites', 'hepatomegaly', 'spiders'] | ||
cont_list = ['age', 'edema', 'serBilir', 'serChol', 'albumin', 'alkaline', 'SGOT', 'platelets', 'prothrombin', 'histologic'] | ||
feat_list = cont_list + bin_list | ||
df_ = df_[['id', 'tte', 'times', 'label']+feat_list] | ||
df_org_ = df_.copy(deep=True) | ||
|
||
df_[cont_list] = f_get_Normalization(np.asarray(df_[cont_list]).astype(float), norm_mode) | ||
|
||
pat_info, data = f_construct_dataset(df_, feat_list) | ||
_, data_org = f_construct_dataset(df_org_, feat_list) | ||
|
||
data_mi = np.zeros(np.shape(data)) | ||
data_mi[np.isnan(data)] = 1 | ||
data_org[np.isnan(data)] = 0 | ||
data[np.isnan(data)] = 0 | ||
|
||
x_dim = np.shape(data)[2] # 1 + x_dim_cont + x_dim_bin (including delta) | ||
x_dim_cont = len(cont_list) | ||
x_dim_bin = len(bin_list) | ||
|
||
last_meas = pat_info[:,[3]] #pat_info[:, 3] contains age at the last measurement | ||
label = pat_info[:,[2]] #two competing risks | ||
time = pat_info[:,[1]] #age when event occurred | ||
|
||
num_Category = int(np.max(pat_info[:, 1]) * 1.2) #or specifically define larger than the max tte | ||
num_Event = len(np.unique(label)) - 1 | ||
|
||
if num_Event == 1: | ||
label[np.where(label!=0)] = 1 #make single risk | ||
|
||
mask1 = f_get_fc_mask1(last_meas, num_Event, num_Category) | ||
mask2 = f_get_fc_mask2(time, label, num_Event, num_Category) | ||
mask3 = f_get_fc_mask3(time, -1, num_Category) | ||
|
||
DIM = (x_dim, x_dim_cont, x_dim_bin) | ||
DATA = (data, time, label) | ||
# DATA = (data, data_org, time, label) | ||
MASK = (mask1, mask2, mask3) | ||
|
||
return DIM, DATA, MASK, data_mi |
Oops, something went wrong.