Bert encoder #5

Open
wants to merge 57 commits into base: master

Changes from all commits (57 commits)
5391b2d
inital
hudaAlamri Nov 12, 2020
cd4047c
gitignore
hudaAlamri Nov 12, 2020
ceb4c4b
Adds s3d feature extraction changes to dataloader
Nov 12, 2020
92e0833
add video extention mp4 to the keys
Nov 14, 2020
1fef60e
Adds S3D features and model into the network
Nov 14, 2020
31daa5f
Resolves conflict
Nov 14, 2020
32f6594
Fixes some argument
Nov 14, 2020
be118d2
Adds provision for freezing all but few layers
Nov 14, 2020
7ce14ca
update evaluate
Nov 15, 2020
164521e
Merge branch 'apoorva' of https://github.com/hudaAlamri/avsd into apo…
Nov 15, 2020
354267b
adds naming conentions for saving checkpoints and args.txt
Nov 15, 2020
a752a3c
update evaluation
Nov 15, 2020
ffb4f61
merge training
Nov 15, 2020
67c6c9c
Adds finetune to the naming convention
Nov 15, 2020
7ac71af
Adds writing video files as npy for easy reading
Nov 15, 2020
29c1737
minor changes
Nov 16, 2020
99c5887
deletes data folder
Nov 16, 2020
e6b2863
minor bug fix
Nov 16, 2020
c6d3c60
add input type ot the dir name
Nov 19, 2020
53ff2fc
update input_type to add it to the savd dir
Nov 19, 2020
e2cf045
update train
Nov 19, 2020
3077e3b
update training parameters
Nov 23, 2020
772f90d
add viz
hudaAlamri Nov 23, 2020
4a299e0
Fixes code for dataparallel
Nov 24, 2020
7d88d15
Cleans the code, as dataprallel can create dict chunks
Nov 24, 2020
bd555d4
Tried to fix flatten_paramter warning
Nov 24, 2020
d92bef9
Adds loss calculation into the model class
Nov 24, 2020
57fc90a
ad num_workers to the dataloader
Nov 24, 2020
e0f4e93
update
Nov 27, 2020
e2f6f27
update evaluation
Nov 27, 2020
e5277b2
mid, unchecked commit for multiGPU
Nov 28, 2020
8867b6b
removes checkpoint from git
Nov 28, 2020
1f292e8
update the dataloader for test set
hudaAlamri Nov 28, 2020
3f8125a
minor bug
Nov 29, 2020
e3a7606
add eval to train
hudaAlamri Nov 29, 2020
b61f10c
add eval
hudaAlamri Nov 30, 2020
4c07a2f
print rank results for each save point
hudaAlamri Nov 30, 2020
7b09d73
update train
Nov 30, 2020
d24eda2
update eval for all epcohs
hudaAlamri Nov 30, 2020
f9078bd
fix log
hudaAlamri Nov 30, 2020
727ee2d
minor bug
Dec 1, 2020
0257351
fix eval
Dec 3, 2020
49d3731
Moves arguments into args.py
Dec 4, 2020
bca63e9
Merge branch 'apoorva' of github.com:hudaAlamri/avsd into apoorva
Dec 4, 2020
e2c7bc7
Adds minor changes to evaluate.py, not tested
Dec 4, 2020
e083a9d
merges conflict
Dec 4, 2020
da62920
bert_encoder
hudaAlamri Dec 4, 2020
1ed639b
add dataparallel with bert
hudaAlamri Dec 4, 2020
092736b
freeze all bert layers
hudaAlamri Dec 4, 2020
559507b
add the option_embed to the current device
Dec 4, 2020
e7b7d52
unfreeze bert layers
hudaAlamri Dec 4, 2020
ca6ba06
freeze bert.encoder 10 layers, bert.embedding
hudaAlamri Dec 4, 2020
e604d16
freeze all layers
Dec 4, 2020
520eee8
update for bert
Dec 7, 2020
044f1c5
evaluate
hudaAlamri Dec 7, 2020
6be2f14
fix a minor bug
hudaAlamri Jan 10, 2021
f1b7a38
update train
hudaAlamri Feb 1, 2021
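Several of the later commits ("freeze all bert layers", "freeze bert.encoder 10 layers, bert.embedding", "unfreeze bert layers") toggle which BERT parameters are trainable. The following is a minimal sketch, not taken from this PR, of that kind of selective freezing using the HuggingFace transformers BertModel; the 12-layer bert-base-uncased checkpoint and the variable names are assumptions.

from transformers import BertModel

# Illustrative only: freeze the BERT embeddings and the first 10 encoder
# layers, leaving the last two encoder layers trainable.
bert = BertModel.from_pretrained("bert-base-uncased")

for param in bert.embeddings.parameters():
    param.requires_grad = False
for layer in bert.encoder.layer[:10]:
    for param in layer.parameters():
        param.requires_grad = False

trainable = sum(p.numel() for p in bert.parameters() if p.requires_grad)
print("trainable parameters:", trainable)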
Binary file added .args.py.swn (not shown)
Binary file added .args.py.swo (not shown)
51 changes: 51 additions & 0 deletions .gitignore
@@ -0,0 +1,51 @@
# Byte-compiled / optimized / DLL files
__pycache__
**/__pycache__
*.py[cod]
*$py.class
.idea
*.swp
# C extensions
*.so
*.pyc
*._
*.png
__pycache__
/venv
/.idea
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Datasets, pretrained models, checkpoints and preprocessed files
data/
!visdialch/data/
checkpoints
checkpoints/*
logs/
results/
log/
launcher.sh
data
# IPython Notebook
.ipynb_checkpoints
data
# virtualenv
venv/
.vscode
.swp
91 changes: 91 additions & 0 deletions args.py
@@ -0,0 +1,91 @@
import argparse


def get_args(parser, description='MILNCE'):
if parser is None:
parser = argparse.ArgumentParser(description=description)

    parser.add_argument_group('Input modalities arguments')

parser.add_argument('-input_type', default='Q_DH_V',
choices=['Q_only', 'Q_DH', 'Q_A', 'Q_I', 'Q_V', 'Q_C_I', 'Q_DH_V', 'Q_DH_I', 'Q_V_A', 'Q_DH_V_A'], help='Specify the inputs')

parser.add_argument_group('Encoder Decoder choice arguments')
parser.add_argument('-encoder', default='lf-ques-im-hist',
choices=['lf-ques-im-hist'], help='Encoder to use for training')
parser.add_argument('-concat_history', default=True,
help='True for lf encoding')
parser.add_argument('-decoder', default='disc',
choices=['disc'], help='Decoder to use for training')
parser.add_argument('-finetune_textEncoder', default=0, type=int,
help= 'Finetune the text encoder')
parser.add_argument_group('Optimization related arguments')
parser.add_argument('-num_epochs', default=45, type=int, help='Epochs')
parser.add_argument('-batch_size', default=12, type=int, help='Batch size')
parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate')
parser.add_argument('-lr_decay_rate', default=0.9,
type=float, help='Decay for lr')
parser.add_argument('-min_lr', default=5e-5, type=float,
help='Minimum learning rate')
parser.add_argument('-weight_init', default='xavier',
choices=['xavier', 'kaiming'], help='Weight initialization strategy')
parser.add_argument('-weight_decay', default=5e-4,
help='Weight decay for l2 regularization')
parser.add_argument('-overfit', action='store_true',
help='Overfit on 5 examples, meant for debugging')
parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use')

parser.add_argument_group('Checkpointing related arguments')
parser.add_argument('-load_path', default='',
help='Checkpoint to load path from')
parser.add_argument('-save_path', default='checkpoints/',
help='Path to save checkpoints')
parser.add_argument('-save_step', default=4, type=int,
help='Save checkpoint after every save_step epochs')
parser.add_argument('-eval_step', default=100, type=int,
help='Run validation after every eval_step iterations')
parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled",
help=".h5 file path for the charades s3d features.")
parser.add_argument('-finetune', default=0, type=int,
help="When set true, the model finetunes the s3dg model for video")

# S3DG parameters and dataloader
    parser.add_argument('-num_frames', type=int, default=40,
                        help='Number of frames sampled per video')
    parser.add_argument('-video_size', type=int, default=224,
                        help='Spatial size (height and width) of the video frames')
    parser.add_argument('-fps', type=int, default=16,
                        help='Frames per second used when decoding the videos')
    parser.add_argument('-crop_only', type=int, default=1,
                        help='If 1, crop a video_size patch and rescale if needed; otherwise crop the largest square and rescale')
    parser.add_argument('-center_crop', type=int, default=0,
                        help='If 1, use a center crop; otherwise crop at a random position')
    parser.add_argument('-random_flip', type=int, default=0,
                        help='If 1, apply a random horizontal flip with probability 0.5')
parser.add_argument('-video_root', default='data/videos')
parser.add_argument('-unfreeze_layers', default=1, type=int,
help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers")
parser.add_argument("-text_encoder", default="lstm",
help="lstm or transformer", type=str)
parser.add_argument("-use_npy", default=1, type=int,
help="Uses npy instead of reading from videos")
parser.add_argument("-numpy_path", default="data/charades")
parser.add_argument("-num_workers", default=8, type=int)

    parser.add_argument_group('Visualizing related arguments')
parser.add_argument('-enableVis', type=int, default=1)
parser.add_argument('-visEnvName', type=str, default='s3d_Nofinetune')
parser.add_argument('-server', type=str, default='127.0.0.1')
parser.add_argument('-serverPort', type=int, default=8855)
parser.add_argument('-set_cuda_device', type=str, default='')
parser.add_argument("-seed", type=int, default=1,
help="random seed for initialization")

parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks')
    parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks')
parser.add_argument('--split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on')
# ----------------------------------------------------------------------------
# input arguments and options
# ----------------------------------------------------------------------------

args = parser.parse_args()
return args
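get_args builds (or reuses) an ArgumentParser and returns the parsed namespace. A minimal usage sketch follows, assuming it is called from a training script; no such caller is shown in this diff.

# Hypothetical caller; the training entry point itself is not part of this diff.
from args import get_args

args = get_args(None)  # passing None makes get_args create its own ArgumentParser
print(args.input_type, args.batch_size, args.text_encoder, args.use_npy)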
208 changes: 208 additions & 0 deletions create_npy.py
@@ -0,0 +1,208 @@
import argparse
import os
import random

import cv2
import ffmpeg
import h5py
import numpy as np
import pandas as pd
import torch
import torch as th
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision import io, transforms
from tqdm import tqdm

random.seed(42)
np.random.seed(42)


class Transform(object):

def __init__(self):
self.mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32)
self.std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32)

def __call__(self, add_jitter=False, crop_size=224):
transform = transforms.Compose([
self.random_crop(crop_size),
])
return transform

def to_tensor(self):
return transforms.ToTensor()

def random_crop(self, size):
return transforms.RandomCrop(size, pad_if_needed=True)

def colorJitter(self):
return transforms.ColorJitter(0.4, 0.2, 0.2, 0.1)


class CustomDataset(Dataset):

def __init__(self, args, path):
"""Initialize the dataset with splits given by 'subsets', where
subsets is taken from ['train', 'val', 'test']
"""
super(CustomDataset, self).__init__()
self.args = args
self.path = path
self.fl_list = self.get_filenames(
os.path.join(args.video_root, path))
self.transform = Transform()

def __len__(self):
return len(self.fl_list)

def _get_opencv_video(self, video_path):
cap = cv2.VideoCapture(video_path)
cap.set(cv2.CAP_PROP_FPS, 30)
ret, frame = cap.read()
frames = [frame]
while ret:
ret, frame = cap.read()
if frame is not None:
frames.append(frame)
cap.release()
        frames_array = np.stack(frames, axis=0)  # (num_frames, H, W, C)
return frames_array

def get_filenames(self, path):
results = []
results += [each for each in os.listdir(path) if each.endswith('.mp4')]
return results

def _get_video_torch(self, video_path):
vframes, _, vmeta = io.read_video(video_path)
vframes = vframes.permute(0, 3, 1, 2)
        vframes = self.transform(crop_size=self.args.video_size)(vframes)
        if vframes.shape[0] < self.args.num_frames:
            # Pad along the time dimension so every clip has at least num_frames frames.
            zeros = th.zeros(
                (self.args.num_frames - vframes.shape[0], 3, self.args.video_size, self.args.video_size), dtype=th.uint8)
            vframes = th.cat((vframes, zeros), axis=0)
        # Gets num_frames frames from the entire video, linearly spaced
vid_indices = np.linspace(
0, vframes.shape[0] - 1, self.args.num_frames, dtype=int)
vid = vframes[vid_indices, :].permute(1, 0, 2, 3)
for i in range(3):
for j in range(vid.shape[1]):
if vid[i, j, :, :].sum() == 0:
print(i, j)
return vid

def _get_video(self, video_path, start=0, end=0):
        '''
        :param video_path: Path of the video file
        :param start: Start time for the video
        :param end: End time for the video
        :return: video frames as a (3, num_frames, video_size, video_size) tensor
        '''
# start_seek = random.randint(start, int(max(start, end - self.num_sec)))
start_seek = 0
cmd = (
ffmpeg
.input(video_path)
.filter('fps', fps=self.args.fps)
)
if self.args.center_crop:
aw, ah = 0.5, 0.5
else:
aw, ah = random.uniform(0, 1), random.uniform(0, 1)
if self.args.crop_only:
'''
            Changed from the original code: a few videos have a resolution below 224, so the crop must respect the frame size (which the original code did not) and the result must be scaled up after cropping.
cmd = (cmd.crop('(iw - {})*{}'.format(self.args.video_size, aw),
'(ih - {})*{}'.format(self.args.video_size, ah),
str(self.args.video_size), str(self.args.video_size))
)'''
cmd = (
cmd.crop('max(0, (iw-{}))*{}'.format(self.args.video_size, aw),
'max(0, (ih-{}))*{}'.format(self.args.video_size, ah),
'min(iw, {})'.format(self.args.video_size),
'min(ih, {})'.format(self.args.video_size))
.filter('scale', self.args.video_size, self.args.video_size)
)
else:
cmd = (
cmd.crop('(iw - max(0, min(iw,ih)))*{}'.format(aw),
'(ih - max(0, min(iw,ih)))*{}'.format(ah),
'min(iw,ih)',
'min(iw,ih)')
.filter('scale', self.args.video_size, self.args.video_size)
)
if self.args.random_flip and random.uniform(0, 1) > 0.5:
cmd = cmd.hflip()
out, _ = (
cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24')
.run(capture_stdout=True, quiet=True)
)

video = np.frombuffer(out, np.uint8).reshape(
[-1, self.args.video_size, self.args.video_size, 3])
video = th.from_numpy(video)
video = video.permute(3, 0, 1, 2)
if video.shape[1] < self.args.num_frames:
zeros = th.zeros(
(3, self.args.num_frames - video.shape[1], self.args.video_size, self.args.video_size), dtype=th.uint8)
video = th.cat((video, zeros), axis=1)
        # Gets num_frames frames from the entire video, linearly spaced
vid_indices = np.linspace(
0, video.shape[1]-1, self.args.num_frames, dtype=int)
return video[:, vid_indices]

def __getitem__(self, idx):
video_file = self.fl_list[idx]
write_file = os.path.join(
self.args.write_path, video_file.replace(".mp4", ".npy"))
video_path = os.path.join(
self.args.video_root, self.path, video_file)
vid = self._get_video_torch(video_path)
np.save(write_file, vid)
return video_file


def main(args):
dataloader = torch.utils.data.DataLoader(
CustomDataset(args, args.train_val_path),
batch_size=1,
shuffle=False, drop_last=True)

dataloader_val = torch.utils.data.DataLoader(
CustomDataset(args, args.test_path),
batch_size=1,
shuffle=False, drop_last=True)

if args.train:
for i, batch in tqdm(enumerate(dataloader)):
print("train ", batch)
if args.test:
for i, batch in tqdm(enumerate(dataloader_val)):
print("val ", batch)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
    parser.add_argument('--num_frames', type=int, default=40,
                        help='Number of frames sampled per video')
    parser.add_argument('--video_root', default='./data/charades/videos')

    parser.add_argument('--write_path', default="./data/charades")
    parser.add_argument('--video_size', type=int, default=224,
                        help='Spatial size (height and width) of the video frames')
    parser.add_argument('--fps', type=int, default=16,
                        help='Frames per second used when decoding the videos')
    parser.add_argument('--crop_only', type=int, default=1,
                        help='If 1, crop a video_size patch and rescale if needed; otherwise crop the largest square and rescale')
    parser.add_argument('--center_crop', type=int, default=0,
                        help='If 1, use a center crop; otherwise crop at a random position')
    parser.add_argument('--random_flip', type=int, default=0,
                        help='If 1, apply a random horizontal flip with probability 0.5')
parser.add_argument('--train', default=1)
parser.add_argument('--test', default=1)
args = parser.parse_args()
args.train_val_path = "train_val"
args.test_path = "test"
args.write_path += "/num_frames_{}".format(args.num_frames)
os.makedirs(args.write_path, exist_ok=True)
main(args)
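create_npy.py writes one <video_id>.npy array of shape (3, num_frames, video_size, video_size) per clip. A minimal sketch of reading one of these files back, e.g. from the dataloader when -use_npy is set; the clip id below is hypothetical, and the directory follows the default --write_path plus the num_frames suffix.

import os

import numpy as np
import torch

# Hypothetical clip id; real ids come from the Charades .mp4 file names.
npy_path = os.path.join("data/charades/num_frames_40", "ABC12.npy")
clip = torch.from_numpy(np.load(npy_path)).float()  # (3, num_frames, H, W)
print(clip.shape)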