GeorgeBatch · Dec 17, 2020
diff --git a/‎README.md
+14-5 b/‎README.md
+14-5
diff --git a/‎thumbnails/bag.png
229 KB b/‎thumbnails/bag.png
229 KB
diff --git a/‎thumbnails/bags.png
130 KB b/‎thumbnails/bags.png
130 KB
diff --git a/‎train_tcga.py
+5-2 b/‎train_tcga.py
+5-2
@@ -70,11 +70,20 @@ Train DSMIL on TCGA Lung Cancer dataset (precomputed features):
 ```
 
 ## Training on your own datasets
-You could modify train_tcga.py to easily let it work with your datasets. You will need to:  
-1. For each bag, generate a .csv file where each row contains the feature of an instance. The .csv file should be named as "_bagID_.csv" and put into a folder named "_dataset-name_".  
-2. Generate a "_dataset-name_.csv" file with two columns where the first column contains _bagID_, and the second column contains the class label.
-3. Replace the corresponding file path in the script with the file path of "_dataset_.csv" file, and change the data directory path in the dataloader to the path of the folder "_dataset-name_"
-4. Configure the number of class for creating the DSMIL model.
+You can easily modify train_tcga.py to work with your own datasets. After you have trained your embedder, you will need to compute the features and organize them as follows:  
+1. For each bag, generate a .csv file where each row contains the feature vector of one instance. The .csv file should be named "_bagID_.csv" and placed in a folder named "_dataset-name_".
+<div align="center">
+  <img src="thumbnails/bag.png" width="400px" />
+</div>  
+2. Generate a "_dataset-name_.csv" file with two columns where the first column contains the paths to all _bagID_.csv files, and the second column contains the bag labels.
+<div align="center">
+  <img src="thumbnails/bags.png" width="400px" />
+</div>  
+3. Replace the corresponding file path in the script with the path to your "_dataset-name_.csv" file.
+```
+  bags_path = pd.read_csv(PATH_TO_[_dataset-name_.csv])
+```
+4. Set the number-of-classes argument to match your dataset when creating the DSMIL model.
 
 ## Citation
 If you use the code or results in your research, please use the following BibTeX entry.  
 
@@ -19,7 +19,7 @@ def get_bag_feats(csv_file_df, args):
     if args.simclr == 0:
         feats_csv_path = 'datasets/tcga-dataset/tcga_lung_data_feats/' + csv_file_df.iloc[0].split(os.sep)[1] + '.csv'
     else:
-        feats_csv_path = 'datasets/wsi-tcga-lung/' + os.path.join(csv_file_df.iloc[0].split(os.sep)[-2], csv_file_df.iloc[0].split(os.sep)[-1])
+        feats_csv_path = csv_file_df.iloc[0]
     df = pd.read_csv(feats_csv_path)
     feats = shuffle(df).reset_index(drop=True)
     feats = feats.to_numpy()
@@ -127,7 +127,7 @@ def main():
     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epoch, 0)
 
     if args.simclr == 0:
-        bags_path = pd.read_csv('datasets'+os.sep+'tcga-dataset'+os.sep+'TCGA.csv')
+        bags_csv = 'datasets/tcga-dataset/TCGA.csv'
     else:
         luad_list = glob.glob('datasets'+os.sep+'wsi-tcga-lung'+os.sep+'LUAD'+os.sep+'*.csv')
         lusc_list = glob.glob('datasets'+os.sep+'wsi-tcga-lung'+os.sep+'LUSC'+os.sep+'*.csv')
@@ -140,6 +140,9 @@ def main():
         bags_path = luad_df.append(lusc_df, ignore_index=True)
         bags_path = shuffle(bags_path)
         bags_path.to_csv('datasets/wsi-tcga-lung/TCGA.csv', index=False)
+        bags_csv = 'datasets/wsi-tcga-lung/TCGA.csv'
+        
+    bags_path = pd.read_csv(bags_csv)
     train_path = bags_path.iloc[0:int(len(bags_path)*0.8), :]
     test_path = bags_path.iloc[int(len(bags_path)*0.8):, :]