Fix dataloading efficiency and store model with min val loss
ange1a-j14 committed Aug 12, 2024
1 parent 2117900 commit d99a9ed
Showing 4 changed files with 66 additions and 42 deletions.
6 changes: 3 additions & 3 deletions decoder.py
@@ -76,9 +76,9 @@ def validation_step(self, batch, batch_idx):
def validation_step(self, batch, batch_idx):
# validation_step defines the validation loop.
x, y = batch
print(type(x))
print(f"x size {x.size()}")
print(f"y size {y.size()}")
# print(type(x))
# print(f"x size {x.size()}")
# print(f"y size {y.size()}")
preds = self.model(x)
loss = self.loss_function(preds, y)
acc = (preds == y).float().mean()
9 changes: 6 additions & 3 deletions main.py
@@ -6,15 +6,18 @@
import decoder
import models

import datetime

if __name__ == '__main__':
np.random.seed(0x5EED+3)
if len(sys.argv) == 1:
"""Run functions in this scratch area.
"""
valid_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\valid_data.h5py'
train_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\training_data.h5py'
test_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\test_data.h5py'
valid_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\valid_30to1kHz_2kshots_dec=256_randampl.h5py'
train_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\training_30to1kHz_10kshots_dec=256_randampl.h5py'
test_file = 'C:\\Users\\aj14\\Desktop\\SMI\\data\\test_30to1kHz_2kshots_dec=256_randampl.h5py'

print('begin main', datetime.datetime.now())
runner = train.TrainingRunner(train_file, valid_file, test_file)
runner.scan_hyperparams()

24 changes: 11 additions & 13 deletions models.py
@@ -6,22 +6,22 @@
act_fn_by_name = {'Tanh': nn.Tanh(), 'LeakyReLU': nn.LeakyReLU(), 'ReLU': nn.ReLU()}

class CNN(nn.Module):
def __init__(self, input_size, output_size, activation='ReLU'):
def __init__(self, input_size, output_size, activation='LeakyReLU'):
super(CNN, self).__init__()
self.conv_layers = nn.Sequential(
nn.Conv1d(input_size, 16, kernel_size=7), # Lout = 250, given L = 256
nn.Conv1d(input_size, 16, kernel_size=7), # Lout = 250, given L = 256
act_fn_by_name[activation],
nn.MaxPool1d(2), # Lout = 125, given L = 250
nn.Conv1d(16, 32, kernel_size=7), # Lout = 119, given L = 125
nn.MaxPool1d(2), # Lout = 125, given L = 250
nn.Conv1d(16, 32, kernel_size=7), # Lout = 119, given L = 125
act_fn_by_name[activation],
nn.MaxPool1d(2), # Lout = 59, given L = 119
nn.Conv1d(32, 64, kernel_size=7), # Lout = 53, given L = 59
nn.MaxPool1d(2), # Lout = 59, given L = 119
nn.Conv1d(32, 64, kernel_size=7), # Lout = 53, given L = 59
act_fn_by_name[activation],
nn.MaxPool1d(2), # Lout = 26, given L = 53
nn.MaxPool1d(2), # Lout = 26, given L = 53
nn.Dropout(0.1),
nn.Conv1d(64, 64, kernel_size=7), # Lout = 20, given L = 26
nn.Conv1d(64, 64, kernel_size=7), # Lout = 20, given L = 26
act_fn_by_name[activation],
nn.MaxPool1d(2) # Lout = 10, given L = 20
nn.MaxPool1d(2) # Lout = 10, given L = 20
)
self.fc_layers = nn.Sequential(
nn.Linear(10, 16),
@@ -30,9 +30,7 @@ def __init__(self, input_size, output_size, activation='ReLU'):
)

def forward(self, x):
out = self.conv_layers(x)
print(f"out size after conv: {out.size()}") # expect [128, 64, 10]
out = self.fc_layers(out)
print(f"out size after fc: {out.size()}") # expect [128, 64, 1]
out = self.conv_layers(x) # expect out [128, 64, 10]
out = self.fc_layers(out) # expect out [128, 64, 1]
return out
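
The Lout comments above follow the standard 1-D shape arithmetic: an unpadded, stride-1 Conv1d gives L_out = L_in - (kernel_size - 1), and MaxPool1d(2) gives L_out = floor(L_in / 2). As a quick sanity check (a sketch, not part of this commit), the chain 256 -> 250 -> 125 -> 119 -> 59 -> 53 -> 26 -> 20 -> 10 can be reproduced with:

# Sketch (not part of this commit): reproduce the Lout comments in CNN.conv_layers.
def conv1d_out(length, kernel_size):
    # unpadded, stride-1 Conv1d: L_out = L_in - (kernel_size - 1)
    return length - (kernel_size - 1)

def maxpool1d_out(length, pool=2):
    # MaxPool1d(pool): L_out = floor(L_in / pool)
    return length // pool

length = 256
for op in ("conv", "pool", "conv", "pool", "conv", "pool", "conv", "pool"):
    length = conv1d_out(length, 7) if op == "conv" else maxpool1d_out(length)
    print(op, length)   # 250, 125, 119, 59, 53, 26, 20, 10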

69 changes: 46 additions & 23 deletions train.py
@@ -13,15 +13,19 @@
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from itertools import product
import datetime
import time

# Define a custom Dataset class
class VelocityDataset(Dataset):
def __init__(self, h5_file):
self.h5_file = h5_file
with h5py.File(self.h5_file, 'r') as f:
self.length = len(f['Time (s)']) # num shots
print(self.h5_file)
self.opened_flag = False

def open_hdf5(self, group_size=256, step=1):
def open_hdf5(self, group_size=256, step=1, num_groups=64):
"""Set up inputs and targets. For each shot, buffer is split into rolling data.
Inputs include grouped photodiode trace of 'group_size', spaced interval 'step' apart.
Targets include average velocity of each group.
@@ -37,15 +41,24 @@ def open_hdf5(self, group_size=256, step=1):
# solves issue where hdf5 file opened in __init__ prevents multiple
# workers: https://github.com/pytorch/pytorch/issues/11929
self.file = h5py.File(self.h5_file, 'r')
pds = self.file['PD (V)'] # [num_shots, buffer_size]
vels = self.file['Speaker (Microns/s)'] # [num_shots, buffer_size]

grouped_pds = np.array(np.hsplit(self.file['PD (V)'], num_groups)) # [num_groups, num_shots, group_size]
self.inputs = np.transpose(grouped_pds, [1, 0, 2]) # [num_shots, num_groups, group_size]
grouped_vels = np.array(np.hsplit(self.file['Speaker (Microns/s)'], num_groups)) # [num_groups, num_shots, group_size]
grouped_vels = np.transpose(grouped_vels, [1, 0, 2]) # [num_shots, num_groups, group_size]
grouped_vels = np.average(grouped_vels, axis=2) # store average velocity per group per shot: [num_shots, num_groups]
self.targets = np.expand_dims(grouped_vels, axis=2) # [num_shots, num_groups, 1]
self.length = len(self.file['Time (s)']) # num shots
# print(torch.cuda.get_device_name(0))
pds = torch.Tensor(np.array(self.file['PD (V)'])) # [num_shots, buffer_size]
vels = torch.Tensor(np.array(self.file['Speaker (Microns/s)'])) # [num_shots, buffer_size]

grouped_pds = torch.stack(torch.split(pds, group_size, dim=1))
self.inputs = torch.transpose(grouped_pds, dim0=0, dim1=1)
grouped_vels = torch.stack(torch.split(vels, group_size, dim=1))
grouped_vels = torch.transpose(grouped_vels, dim0=0, dim1=1)
self.targets = torch.unsqueeze(torch.mean(grouped_vels, dim=2), dim=2)
# print(self.inputs.size()) # [2k, 64, 256]
# print(self.targets.size()) # [2k, 64, 1]
# grouped_pds = np.array(np.hsplit(self.file['PD (V)'], num_groups)) # [num_groups, num_shots, group_size]
# self.inputs = np.transpose(grouped_pds, [1, 0, 2]) # [num_shots, num_groups, group_size]
# grouped_vels = np.array(np.hsplit(self.file['Speaker (Microns/s)'], num_groups)) # [num_groups, num_shots, group_size]
# grouped_vels = np.transpose(grouped_vels, [1, 0, 2]) # [num_shots, num_groups, group_size]
# grouped_vels = np.average(grouped_vels, axis=2) # store average velocity per group per shot: [num_shots, num_groups]
# self.targets = np.expand_dims(grouped_vels, axis=2) # [num_shots, num_groups, 1]

## FOR ROLLING INPUT
# grouped_pds = np.array([pds[:, i:i+n] for i in range(0, len(pds[0])-n+1, m)]) # [num_groups, num_shots, group_size]
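
The new tensor path above replaces the per-shot numpy hsplit/transpose with torch.split followed by torch.stack. A minimal sketch on synthetic data (shapes only; the arrays stand in for the real 'PD (V)' and 'Speaker (Microns/s)' datasets):

# Sketch with synthetic data (not part of the commit): the torch.split/stack
# grouping used in open_hdf5, applied to a dummy [num_shots, buffer_size] array.
import torch

num_shots, num_groups, group_size = 4, 64, 256
pds  = torch.randn(num_shots, num_groups * group_size)   # stand-in for 'PD (V)'
vels = torch.randn(num_shots, num_groups * group_size)   # stand-in for 'Speaker (Microns/s)'

grouped_pds = torch.stack(torch.split(pds, group_size, dim=1))      # [num_groups, num_shots, group_size]
inputs = torch.transpose(grouped_pds, 0, 1)                         # [num_shots, num_groups, group_size]
grouped_vels = torch.stack(torch.split(vels, group_size, dim=1))
grouped_vels = torch.transpose(grouped_vels, 0, 1)
targets = torch.unsqueeze(torch.mean(grouped_vels, dim=2), dim=2)   # [num_shots, num_groups, 1]

print(inputs.shape, targets.shape)   # torch.Size([4, 64, 256]) torch.Size([4, 64, 1])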
@@ -59,10 +72,18 @@ def __len__(self):
return self.length

def __getitem__(self, idx):
if not hasattr(self, self.h5_file):
# print("getitem entered")
if not self.opened_flag: #not hasattr(self, 'h5_file'):
self.open_hdf5()
self.opened_flag = True
# print("open_hdf5 finished")
return FloatTensor(self.inputs[idx]), FloatTensor(self.targets[idx])

# def __getitems__(self, indices: list):
# if not hasattr(self, 'h5_file'):
# self.open_hdf5()
# return FloatTensor(self.inputs[indices]), FloatTensor(self.targets[indices])
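
Deferring h5py.File to the first __getitem__ call (guarded by opened_flag) is the usual workaround for the multi-worker issue referenced above: each DataLoader worker must open its own handle rather than inherit one from __init__. A minimal sketch of the pattern, with hypothetical dataset names:

# Minimal sketch of the lazy-open pattern (hypothetical field names);
# each DataLoader worker opens its own h5py handle on first access.
import h5py
import torch
from torch.utils.data import Dataset

class LazyH5Dataset(Dataset):
    def __init__(self, h5_path):
        self.h5_path = h5_path
        with h5py.File(self.h5_path, 'r') as f:
            self.length = len(f['inputs'])   # handle closed before workers fork
        self.file = None                     # opened lazily, once per worker

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if self.file is None:                # first access in this worker process
            self.file = h5py.File(self.h5_path, 'r')
        x = torch.as_tensor(self.file['inputs'][idx])
        y = torch.as_tensor(self.file['targets'][idx])
        return x, y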

class TrainingRunner:
def __init__(self, training_h5, validation_h5, testing_h5,
velocity_only=False, num_groups=64):
@@ -73,30 +94,32 @@ def __init__(self, training_h5, validation_h5, testing_h5,

# get dataloaders
self.set_dataloaders()

print("dataloaders set:", datetime.datetime.now())
# dimensions
input_ref = next(iter(self.train_loader))
output_ref = next(iter(self.train_loader))
# print("loaded next(iter", datetime.datetime.now())
self.input_size = num_groups #input_ref[0].size(-1) #** 2
self.output_size = num_groups # output_ref[1].size(-1)
self.output_size = num_groups # output_ref[1].size(-1)
print(f"input ref {len(input_ref)} , {input_ref[0].size()}")
print(f"output ref {len(output_ref)} , {output_ref[1].size()}")
print(f"train.py input_size {self.input_size}")
print(f"train.py output_size {self.output_size}")
# print(f"train.py input_size {self.input_size}")
# print(f"train.py output_size {self.output_size}")

# directories
self.checkpoint_dir = "./checkpoints"
print('TrainingRunner initialized', datetime.datetime.now())

def get_custom_dataloader(self, h5_file, batch_size=128, shuffle=True,
velocity_only=True):
# if velocity_only:
dataset = VelocityDataset(h5_file)

print("dataset initialized")
# We can use DataLoader to get batches of data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
num_workers=16, persistent_workers=True,
pin_memory=True)

print("dataloader initialized")
return dataloader

def set_dataloaders(self, batch_size=128):
@@ -129,7 +152,7 @@ def train_model(self, model_name, save_name=None, **kwargs):
verbose=True,
mode="min")
checkpoint_callback = ModelCheckpoint(save_weights_only=True,
mode="max", monitor="val_acc")
mode="min", monitor="val_loss")
# Save the best checkpoint based on the minimum val_loss recorded.
# Saves only weights and not optimizer

@@ -138,9 +161,9 @@ def train_model(self, model_name, save_name=None, **kwargs):
default_root_dir=os.path.join(self.checkpoint_dir, save_name),
accelerator="gpu",
devices=[0],
max_epochs=180,
max_epochs=1000,
callbacks=[early_stop_callback, checkpoint_callback],
check_val_every_n_epoch=1, #10,
check_val_every_n_epoch=10,
logger=logger
)
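
With monitor="val_loss" and mode="min", the checkpoint callback now keeps the weights from the lowest validation loss, and validation runs every 10 epochs over a maximum of 1000. A hedged sketch of how these pieces fit together (illustrative values; "val_loss" must match the key logged in the LightningModule):

# Sketch (illustrative, not the commit itself): keep the weights with the
# lowest validation loss; "val_loss" must match the key logged via self.log().
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

checkpoint_cb = ModelCheckpoint(save_weights_only=True,
                                monitor="val_loss", mode="min")
early_stop_cb = EarlyStopping(monitor="val_loss", mode="min", patience=10)  # patience is illustrative

trainer = L.Trainer(max_epochs=1000,
                    check_val_every_n_epoch=10,   # validate (and checkpoint) every 10 epochs
                    callbacks=[checkpoint_cb, early_stop_cb])
# After trainer.fit(model, ...), checkpoint_cb.best_model_path points to the
# checkpoint with the smallest val_loss seen at a validation epoch.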

@@ -165,7 +188,7 @@ def train_model(self, model_name, save_name=None, **kwargs):
return model, result

def scan_hyperparams(self):
for lr in [1e-3]:#, 1e-2, 3e-2]:
for lr in [1e-4]:#[1e-3]: #, 1e-2, 3e-2]:

model_config = {"input_size": self.input_size,
"output_size": self.output_size}
@@ -175,11 +198,11 @@ def scan_hyperparams(self):

self.train_model(model_name="CNN",
model_hparams=model_config,
optimizer_name="SGD",
optimizer_name="Adam",
optimizer_hparams=optimizer_config,
misc_hparams=misc_config)

def load_model(self, model_name='CNN', model_tag):
def load_model(self, model_tag, model_name='CNN'):
# Check whether pretrained model exists. If yes, load it and skip training
pretrained_filename = os.path.join(self.checkpoint_dir, model_name, "SMI", model_tag,
"checkpoints", "*" + ".ckpt")
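
scan_hyperparams currently sweeps only the learning rate (now 1e-4 with Adam); since itertools.product is already imported in train.py, a grid over several settings is the natural extension. A sketch with hypothetical grid values (not from this commit):

# Hedged sketch (hypothetical values): a grid sweep using the already-imported
# itertools.product, calling train_model once per (lr, activation) combination.
from itertools import product

lrs = [1e-4, 1e-3]
activations = ['LeakyReLU', 'ReLU']

for lr, act in product(lrs, activations):
    model_config = {"input_size": 64, "output_size": 64, "activation": act}
    optimizer_config = {"lr": lr}
    # runner.train_model(model_name="CNN",
    #                    model_hparams=model_config,
    #                    optimizer_name="Adam",
    #                    optimizer_hparams=optimizer_config)
    print(lr, act, model_config, optimizer_config)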
