From 5391b2d5a4a9431f2339d46115684c3f2514777d Mon Sep 17 00:00:00 2001 From: hudaAlamri Date: Thu, 12 Nov 2020 10:25:35 -0500 Subject: [PATCH 01/49] inital --- .gitignore | 44 +++++++++ data | 1 + decoders/__pycache__/__init__.cpython-36.pyc | Bin 348 -> 344 bytes decoders/__pycache__/disc.cpython-36.pyc | Bin 1817 -> 1813 bytes encoders/__pycache__/__init__.cpython-36.pyc | Bin 343 -> 339 bytes encoders/__pycache__/lf.cpython-36.pyc | Bin 3733 -> 3729 bytes env.yml | 89 +++++++++---------- train.py | 10 +-- utils/__pycache__/__init__.cpython-36.pyc | Bin 270 -> 266 bytes utils/__pycache__/dynamic_rnn.cpython-36.pyc | Bin 2380 -> 2376 bytes utils/__pycache__/eval_utils.cpython-36.pyc | Bin 1514 -> 1510 bytes 11 files changed, 90 insertions(+), 54 deletions(-) create mode 100644 .gitignore create mode 120000 data diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45220cc --- /dev/null +++ b/.gitignore @@ -0,0 +1,44 @@ +# Byte-compiled / optimized / DLL files +__pycache__ +**/__pycache__ +*.py[cod] +*$py.class +.idea + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Datasets, pretrained models, checkpoints and preprocessed files +data/ +!visdialch/data/ +checkpoints/ +logs/ +results/ + +# IPython Notebook +.ipynb_checkpoints + +# virtualenv +venv/ +HowTo100/weights +.vscode +.swp diff --git a/data b/data new file mode 120000 index 0000000..b6b39ef --- /dev/null +++ b/data @@ -0,0 +1 @@ +/srv/share/halamri3/data/avsd \ No newline at end of file diff --git a/decoders/__pycache__/__init__.cpython-36.pyc b/decoders/__pycache__/__init__.cpython-36.pyc index c38bdf3900d37a6e7d181c67e75636b885ecc67a..69b5d9f7debe72c31bb2a2c34ec46174daf1fc3b 100644 GIT binary patch delta 43 zcmcb^bc2b*n3tF9mFL>{i5%y|we|B-OEU6vQ}r_va}sllGL7{U%ZgJbe#`~{N-+>A delta 47 zcmcb?bccz66@;i5%yo_4PCIb5r%pGV_vC^GfuC6Z2A%@+{jT{dc#ntumQcE)Ob5r#*5_1xBi!zP%6U&NIHZwDcu>t^I Cw+_$% delta 50 zcmbQrH66@;jT{dcrM2}l@^e%5%QEwlQ}asngA?;olJYC`9m9fMHveZ7 GV+8=PD-lBg diff --git a/encoders/__pycache__/__init__.cpython-36.pyc b/encoders/__pycache__/__init__.cpython-36.pyc index 0661356ac38d1d2fe4371cbffc7ebeaf64171206..ef6d01c592949b4aa57b7d2d09de2e1b6785ddf0 100644 GIT binary patch delta 43 zcmcc4beW06n3tF9mFL>{i5#cIwe|B-OEU6vQ}r_va}sllGL7{U%ZgJbzRdyvNE8qj delta 47 zcmcc2be)OAn3tF9>66@;i5#b-_4PCIb5r%pGV_vC^GfuC6Z2A%@+{jT{Ng;;Q<2sU;ctxvBaYi8+b6MVZF>iDktpo12-1xB*oB B4(R{@ delta 50 zcmbOzJ5`p$n3tF9>66@;jT{Ng(wh1i`MIh3Wtn-&sd**(!HIb(N% 0.0: - running_loss = 0.95 * running_loss + 0.05 * cur_loss.data[0] + running_loss = 0.95 * running_loss + 0.05 * cur_loss.item() else: - running_loss = cur_loss.data[0] + running_loss = cur_loss.item() if optimizer.param_groups[0]['lr'] > args.min_lr: scheduler.step() @@ -201,7 +201,7 @@ dec_out = decoder(enc_out, val_batch) cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) - validation_losses.append(cur_loss.data[0]) + validation_losses.append(cur_loss.item()) validation_loss = np.mean(validation_losses) diff --git a/utils/__pycache__/__init__.cpython-36.pyc b/utils/__pycache__/__init__.cpython-36.pyc index 6e557463e053497dbb279b12a29ee08fb505a134..0bbfee287e61caf64bdc2e804e20414e78aacbef 100644 GIT binary patch delta 43 ycmeBU>SE$B=H=yj<+(O~BF7PN4gI{-l8pS^RQ-&^oW$IsOk@4Tvf`A9&(#4g+Yd4T delta 47 zcmeBT>SN+C=H=yj`Xo1IBF7PF9sP{_+*JLt%)I2(yb}H3#JrTG{0e=?uwa*okJSNp CZxE&c diff --git 
a/utils/__pycache__/dynamic_rnn.cpython-36.pyc b/utils/__pycache__/dynamic_rnn.cpython-36.pyc index daab02884c6457728a48ceecd828bc2a2cc12e67..8d5efd782081c9cc215dd4457c88a98822591fc7 100644 GIT binary patch delta 46 zcmX>jbV7*3n3tF9wddOSjT}Xc;@bLosU;ctxvBaYi8+b6MVZF>iDktpo2N2vX9oai C(GUp$ delta 50 zcmX>hbVi87n3tF9>66@;jT}Xc()#)t`MIh3Wtn-&sd**(!HIb(N%iDktpo2?nwvj6~O Ccn@~~ delta 50 zcmaFH{fe8zn3tF9>66@;jT{n;(t7$C`MIh3Wtn-&sd**(!HIb(N% Date: Thu, 12 Nov 2020 10:37:05 -0500 Subject: [PATCH 02/49] gitignore --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 45220cc..9ef21ee 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ __pycache__ *.py[cod] *$py.class .idea - +*.swp # C extensions *.so @@ -33,7 +33,8 @@ data/ checkpoints/ logs/ results/ - +log/ +launcher.sh # IPython Notebook .ipynb_checkpoints From ceb4c4bf446c9e2e5a0f8ebc9431a320654b8c24 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Thu, 12 Nov 2020 18:25:16 -0500 Subject: [PATCH 03/49] Adds s3d feature extraction changes to dataloader Signed-off-by: Apoorva Beedu --- .gitignore | 1 - data | 2 +- dataloader.py | 83 ++++++++++++------ decoders/__pycache__/__init__.cpython-35.pyc | Bin 400 -> 0 bytes decoders/__pycache__/__init__.cpython-36.pyc | Bin 344 -> 0 bytes decoders/__pycache__/disc.cpython-35.pyc | Bin 2002 -> 0 bytes decoders/__pycache__/disc.cpython-36.pyc | Bin 1813 -> 0 bytes .../disc_realdialogs.cpython-36.pyc | Bin 1820 -> 0 bytes encoders/__pycache__/__init__.cpython-36.pyc | Bin 339 -> 0 bytes encoders/__pycache__/lf.cpython-36.pyc | Bin 3729 -> 0 bytes encoders/lf.py | 60 +++++++------ train.py | 74 ++++++++++------ utils/__pycache__/__init__.cpython-36.pyc | Bin 266 -> 0 bytes utils/__pycache__/dynamic_rnn.cpython-36.pyc | Bin 2376 -> 0 bytes utils/__pycache__/eval_utils.cpython-36.pyc | Bin 1510 -> 0 bytes 15 files changed, 135 insertions(+), 85 deletions(-) delete mode 100644 decoders/__pycache__/__init__.cpython-35.pyc delete mode 100644 decoders/__pycache__/__init__.cpython-36.pyc delete mode 100644 decoders/__pycache__/disc.cpython-35.pyc delete mode 100644 decoders/__pycache__/disc.cpython-36.pyc delete mode 100644 decoders/__pycache__/disc_realdialogs.cpython-36.pyc delete mode 100644 encoders/__pycache__/__init__.cpython-36.pyc delete mode 100644 encoders/__pycache__/lf.cpython-36.pyc delete mode 100644 utils/__pycache__/__init__.cpython-36.pyc delete mode 100644 utils/__pycache__/dynamic_rnn.cpython-36.pyc delete mode 100644 utils/__pycache__/eval_utils.cpython-36.pyc diff --git a/.gitignore b/.gitignore index 9ef21ee..ba74f36 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,5 @@ launcher.sh # virtualenv venv/ -HowTo100/weights .vscode .swp diff --git a/data b/data index b6b39ef..75dea0c 120000 --- a/data +++ b/data @@ -1 +1 @@ -/srv/share/halamri3/data/avsd \ No newline at end of file +/home/apoorva/hdd1/avsd \ No newline at end of file diff --git a/dataloader.py b/dataloader.py index 9e0e61e..15c6c00 100644 --- a/dataloader.py +++ b/dataloader.py @@ -4,6 +4,7 @@ from random import shuffle import h5py +import hdfdict import numpy as np from tqdm import tqdm @@ -17,12 +18,18 @@ class VisDialDataset(Dataset): @staticmethod def add_cmdline_args(parser): parser.add_argument_group('Dataloader specific arguments') - parser.add_argument('-input_img', default='data/data_img.h5', help='HDF5 file with image features') - parser.add_argument('-input_vid', default='data/data_video.h5', help='HDF5 file with image features') - 
parser.add_argument('-input_audio', default='data/data_audio.h5', help='HDF5 file with audio features') - parser.add_argument('-input_ques', default='data/dialogs.h5', help='HDF5 file with preprocessed questions') - parser.add_argument('-input_json', default='data/params.json', help='JSON file with image paths and vocab') - parser.add_argument('-img_norm', default=1, choices=[1, 0], help='normalize the image feature. 1=yes, 0=no') + parser.add_argument( + '-input_img', default='data/data_img.h5', help='HDF5 file with image features') + # parser.add_argument( + # '-input_vid', default='data/data_video.h5', help='HDF5 file with video features') + parser.add_argument( + '-input_audio', default='data/data_audio.h5', help='HDF5 file with audio features') + parser.add_argument('-input_ques', default='data/dialogs.h5', + help='HDF5 file with preprocessed questions') + parser.add_argument('-input_json', default='data/params.json', + help='JSON file with image paths and vocab') + parser.add_argument( + '-img_norm', default=1, choices=[1, 0], help='normalize the image feature. 1=yes, 0=no') return parser def __init__(self, args, subsets): @@ -60,14 +67,13 @@ def __init__(self, args, subsets): print("Dataloader loading h5 file: {}".format(args.input_ques)) ques_file = h5py.File(args.input_ques, 'r') - if 'image' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_img)) img_file = h5py.File(args.input_img, 'r') if 'video' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_vid)) - vid_file = h5py.File(args.input_vid, 'r') + vid_file = args.input_vid if 'audio' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_audio)) @@ -104,16 +110,23 @@ def __init__(self, args, subsets): if 'video' in args.input_type: print("Reading video features...") - vid_feats = torch.from_numpy(np.array(vid_file['images_' + dtype])) + # Charades dataset features are all saved in one h5 file as a key, feat dictionary + vid_feats = hdfdict.load( + args.input_vid + "_{0}.h5".format(dtype)) + # If this throws an error because it cannot find the video filename,uncomment below + # vid_feats = hdfdict.load( + # args.input_vid + "_{0}.h5".format("train")) + # vid_feats.update(hdfdict.load( + # args.input_vid + "_{0}.h5".format("test"))) img_fnames = getattr(self, 'unique_img_' + dtype) self.data[dtype + '_img_fnames'] = img_fnames self.data[dtype + '_vid_fv'] = vid_feats - if 'image' in args.input_type: print("Reading image features...") - img_feats = torch.from_numpy(np.array(img_file['images_' + dtype])) + img_feats = torch.from_numpy( + np.array(img_file['images_' + dtype])) if args.img_norm: print("Normalizing image features...") @@ -125,7 +138,8 @@ def __init__(self, args, subsets): if 'audio' in args.input_type: print("Reading audio features...") - audio_feats = torch.from_numpy(np.array(audio_file['images_' + dtype])) + audio_feats = torch.from_numpy( + np.array(audio_file['images_' + dtype])) audio_feats = F.normalize(audio_feats, dim=1, p=2) self.data[dtype + '_audio_fv'] = audio_feats @@ -139,14 +153,15 @@ def __init__(self, args, subsets): self.max_ans_len = self.data[dtype + '_ans'].size(2) # reduce amount of data for preprocessing in fast mode - #TODO + # TODO if args.overfit: print('\n \n \n ---------->> NOT IMPLEMENTED OVERFIT CASE <-----\n \n \n ') self.num_data_points = {} for dtype in subsets: self.num_data_points[dtype] = len(self.data[dtype + '_ques']) - print("[{0}] no. 
of threads: {1}".format(dtype, self.num_data_points[dtype])) + print("[{0}] no. of threads: {1}".format( + dtype, self.num_data_points[dtype])) print("\tMax no. of rounds: {}".format(self.max_ques_count)) print("\tMax ques len: {}".format(self.max_ques_len)) print("\tMax ans len: {}".format(self.max_ans_len)) @@ -191,8 +206,11 @@ def __getitem__(self, idx): # get video features if 'video' in self.args.input_type: - item['vid_feat'] = self.data[dtype + '_vid_fv'][idx] + # item['img_fnames'] is as train_val/vid_id.jpg hence the splits item['img_fnames'] = self.data[dtype + '_img_fnames'][idx] + vid_id = item['img_fnames'].split("/")[-1].split(".")[0] + item['vid_feat'] = torch.from_numpy( + self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) # get image features if 'image' in self.args.input_type: @@ -226,20 +244,20 @@ def __getitem__(self, idx): item['opt'] = option_in item['opt_len'] = opt_len - #if dtype != 'test': + # if dtype != 'test': ans_ind = self.data[dtype + '_ans_ind'][idx] item['ans_ind'] = ans_ind.view(-1) # convert zero length sequences to one length # this is for handling empty rounds of v1.0 test, they will be dropped anyway - #if dtype == 'test': + # if dtype == 'test': item['ques_len'][item['ques_len'] == 0] += 1 item['opt_len'][item['opt_len'] == 0] += 1 return item - #------------------------------------------------------------------------- + # ------------------------------------------------------------------------- # collate function utilized by dataloader for batching - #------------------------------------------------------------------------- + # ------------------------------------------------------------------------- def collate_fn(self, batch): dtype = self._split @@ -254,15 +272,18 @@ def collate_fn(self, batch): out[key] = torch.stack(merged_batch[key], 0) # Dynamic shaping of padded batch if 'hist' in out: - out['hist'] = out['hist'][:, :, :torch.max(out['hist_len'])].contiguous() - out['ques'] = out['ques'][:, :, :torch.max(out['ques_len'])].contiguous() - out['opt'] = out['opt'][:, :, :, :torch.max(out['opt_len'])].contiguous() + out['hist'] = out['hist'][:, :, :torch.max( + out['hist_len'])].contiguous() + out['ques'] = out['ques'][:, :, :torch.max( + out['ques_len'])].contiguous() + out['opt'] = out['opt'][:, :, :, :torch.max( + out['opt_len'])].contiguous() return out - #------------------------------------------------------------------------- + # ------------------------------------------------------------------------- # preprocessing functions - #------------------------------------------------------------------------- + # ------------------------------------------------------------------------- def _process_history(self, dtype): """Process caption as well as history. 
Optionally, concatenate history @@ -278,10 +299,13 @@ def _process_history(self, dtype): num_convs, num_rounds, max_ans_len = answers.size() if self.args.concat_history: - self.max_hist_len = min(num_rounds * (max_ques_len + max_ans_len), 400) - history = torch.zeros(num_convs, num_rounds, self.max_hist_len).long() + self.max_hist_len = min( + num_rounds * (max_ques_len + max_ans_len), 400) + history = torch.zeros(num_convs, num_rounds, + self.max_hist_len).long() else: - history = torch.zeros(num_convs, num_rounds, max_ques_len + max_ans_len).long() + history = torch.zeros(num_convs, num_rounds, + max_ques_len + max_ans_len).long() hist_len = torch.zeros(num_convs, num_rounds).long() if 'dialog' in self.args.input_type: @@ -319,13 +343,14 @@ def _process_history(self, dtype): hlen = alen + qlen # save the history length hist_len[th_id][round_id] = hlen - else: # -- caption only + else: # -- caption only # go over each question and append it with answer for th_id in range(num_convs): clen = cap_len[th_id] hlen = min(clen, max_ques_len + max_ans_len) for round_id in range(num_rounds): - history[th_id][round_id][:max_ques_len + max_ans_len] = captions[th_id][:max_ques_len + max_ans_len] + history[th_id][round_id][:max_ques_len + + max_ans_len] = captions[th_id][:max_ques_len + max_ans_len] hist_len[th_id][round_id] = hlen self.data[dtype + '_hist'] = history diff --git a/decoders/__pycache__/__init__.cpython-35.pyc b/decoders/__pycache__/__init__.cpython-35.pyc deleted file mode 100644 index faff7c2125d9d7a7464b97f3fa34d4863b2a9bd5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 400 zcmYjNy-ve05I#FWDOD@6fW*d{0sI08AAyii1@ z%a}{VEU8Z*v5*5~nZ>C=?1l@!4zW<+SDIaXg6 diff --git a/decoders/__pycache__/__init__.cpython-36.pyc b/decoders/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 69b5d9f7debe72c31bb2a2c34ec46174daf1fc3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 344 zcmYk2u}TCn5Qb-xm0gd!QqV_O1XElqB6y9>HFn#CunAsdlRJ|2Kv*k%84F*<2S{tB zk6`6wZxILnnUI}lpK804b=k?d$9P-1~$M0p1}YEzAz%Pbt*KmY8LWx zD)j+X^E7kB?4%0!F_p%TFEn$noRKsno!Y%YC~4{ETzJ!Ow$6x+>XZw1Dbzk{F|neM z(t6uVd9iy+c`!BO-p2lAv9>y$NJ#NkDAsC5_f>R*TZ$xA8_&M||BuKN$guKuqy4l`Ob*$2P znngu=F85`eCsM_$p#0y$%l|q9DTr=}P6bo36tpA?9a^Gn*YYkE9&Z7v)3&9Zx`Z1? zuyNNKtkt#LAP9s$6+L!8?b1yZv&)-6frqAd5*6_@9H&Yb$Hq<4`K>@9gT86$av_!J zhGCkeMHm_{QWI_bi(Dn4oS(?VI9X=AgQMeT#H5N!10tm!LVr#Y@#iHhy&< zgv7d7bNb?e*mkczdTalNt0QR|xeh-dmY(3%Qyd^#257u@jkbYvzg58*pq5>mwLEW<(=+m;xh%Pz|(j}HS5L{T)ij?B)FZz~MNDJgzF{%Q8H0VXI zaNJLExw4?fOOThvqAY?JScl+W1VBg-gQQ7R$d$SRWuPUn0wn2OKd7IDl1UmDK@{jX zS27sqDv(h;U3sothJNGnrJ9s;piwuH$E&B2)a3-921Tyo=`bpbe4?Uc_)VnJ=wv1v zORnR2wTZFH=Rq-*&E|@!%FD?#NDIw{T|_Dg7PuP2;Ni(`@W#~}G$E_q>L&R?o=uKr zrgPPpaMmZyDMgvSkScidl&UmYIS=cIy8*6pIK%Nj3O>HeXI$t_2ZVKsXNqcr-10p? 
(binary delta for deleted .pyc file omitted)
diff --git a/decoders/__pycache__/disc.cpython-36.pyc b/decoders/__pycache__/disc.cpython-36.pyc
deleted file mode 100644
index dcd51bfa28d726a94da8874cd08c26610bd349e8..0000000000000000000000000000000000000000
(binary delta omitted)
diff --git a/decoders/__pycache__/disc_realdialogs.cpython-36.pyc b/decoders/__pycache__/disc_realdialogs.cpython-36.pyc
deleted file mode 100644
index 52b741569ce9008633513f82046710f6612f2fe4..0000000000000000000000000000000000000000
(binary delta omitted)
z_RkdJ;h>?6dh!!Eo3JfWA>_aw{Yq!KYUZN}o;3Cw@oU{T-XDJ3dib49x(XWgku!Av E2PuB>{Qv*} diff --git a/encoders/__pycache__/__init__.cpython-36.pyc b/encoders/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index ef6d01c592949b4aa57b7d2d09de2e1b6785ddf0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 339 zcmYk1!Ab)$5Qb;6ty@ZqcoBVsMbC1TV75X5# zdeTSm|KjiU9P=u>Wg~ONaYr>6DyLU&PjfsOkxWfpRvOZ?(h&gctK*Z`O~gh=J#vm zb8YiV`!-`P-VCYwWo9glEx`>J?$(!+T3E%Y$E{esfg957_fb)+wrZ>@Dq)0mdRB-B zFN-b07o6)_x11lZ9z(hfJBZn$0UI%w(mKzQz|69Lzdz{d|7?eb!svwJtc^O2pY<{8 GB>4gFy-oK3 diff --git a/encoders/__pycache__/lf.cpython-36.pyc b/encoders/__pycache__/lf.cpython-36.pyc deleted file mode 100644 index 981e09e2785b47d5c2f14b6e90d7561b1fa94059..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3729 zcmai1%a7wo8TTuW+j(|6{hY@JG>ZtDo(AoS)oOv=+2J+QLd>3g!1B19j?;dGvePpi zWlkBf64Idk12}Sk1N;FTIq(N?LUlrdOCwI45EuA;Wjmejh{aKT{@!0zx$0My_gXFc z@8AEedw!rOe^u5V1N^tqvX4NR!qiA{=&eOctU9VnydG(>?ii9cqFQV^rmB2bVKrvH zQJ5L1lloG3ET%qJhV?7tQBH@d(2@ky9?cRz57WepNP2vkcyZ|e`1I5#zv|ITCi^;= zKyg&2I9i}Hm1%F3rRo@;DI=BX%y?rtwW6+Lex?|TQ>U6@`>B@)Pi7hFe3JMn3%IYA zDd}B9`xq_z1;|2~V3-N?HPcy@&lWqxovLwQcqQK(UI@bvko&rl@xUZF^qr~J~5 zg5>H03b2kDPp9cDKZAtlFW(aN$Atz{*z*{3t54yc@pLv7&2=4Cv^n*77VzIFyp8yO zzk8Ad`8bV(ld%_hF%RE6@h&oU5)`X4JBdb*rkA2iS^StqVG>Y<%tmn5r!Hu=+P(Fg ze|ckhKf0a3rwO2kwo<)(XMe&c$dx75`BK69Ru_t^<@&@}sFNC+NgM2~4sD(3kganw zd!LaN9B1&{A}=%>t-&a!$;`JkM6~iY_D^d`Ia^=o*^!Dhs2S7xcdrN}0Di&^Iy z{~b8{wI<~)$bTT2*?pzdHK6-5>Cm?KD{b>!%l;u_TCo2MT#tNK_V&5eau;Z zaxgh0{7LRhdtcgrFYQO`cB#1|HFqb6z$}d*7?`np$_du6{2Eg%1neR=WUqwDLN7|s zMBVqM1i7DL4L4qRxj%MCA%5BYC(vjh8`q zHqPaMgh9g+?PB2WED1*`kA+#d2!naKCpzVHtjw05CRy$!I2c}pUN+Q)kpWiq(io~9x(M4WVMmCwx}WNJD?V8ECC15D1hkN zQhiN*uor-8);v^TA+Cj(`l61Q0caIdmoat3P`$>&f@fRCl=YbeN**n&*JUqQ0hd3| zo0As&%@Hn5s8Lx9!2Gqe>I>_ZwE)aNN~^YEEx_?_(rRp22^uwSRI#Y6->AGR?GLKx z4f~$7KmMQb`_j(Jc zm!o42IpK|?BRl_yI9tN2glGO(Dr8Tj!f67?a1Xr*e3WrIg(OV``30KMg-t@5r~^1( zdYpX+n(QYa3V_PedH^f}t4;1m>TpxjEWHPi>uPP)000sQuD2fK_14#HK;-;z6ZXm; z3@tjY@E<9_ab>$M?!2>z_@;}A_M+0#1cK+GAJbivar_@BV4432oM@G(?FUg*AvcY6 zsPgy8Q=&K_Djo2dNsQKPHNVD5H{9Ys#H%=RHVgMh~($A=#g_&(?P5r;m&h=XW diff --git a/encoders/lf.py b/encoders/lf.py index bd6af23..6f11f36 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -10,13 +10,20 @@ class LateFusionEncoder(nn.Module): @staticmethod def add_cmdline_args(parser): parser.add_argument_group('Encoder specific arguments') - parser.add_argument('-img_feature_size', default=4096, help='Channel size of image feature') - parser.add_argument('-vid_feature_size', default=4096, help='Channel size of video feature') - parser.add_argument('-audio_feature_size', default=4096, help='Channel size of audio feature') - parser.add_argument('-embed_size', default=300, help='Size of the input word embedding') - parser.add_argument('-rnn_hidden_size', default=512, help='Size of the multimodal embedding') - parser.add_argument('-num_layers', default=2, help='Number of layers in LSTM') - parser.add_argument('-max_history_len', default=60, help='Size of the multimodal embedding') + parser.add_argument('-img_feature_size', default=1024, + help='Channel size of image feature') + parser.add_argument('-vid_feature_size', default=1024, + help='Channel size of video feature') + parser.add_argument('-audio_feature_size', default=1024, + help='Channel size of audio feature') + parser.add_argument('-embed_size', default=300, + help='Size of the input word embedding') + parser.add_argument('-rnn_hidden_size', 
default=512, + help='Size of the multimodal embedding') + parser.add_argument('-num_layers', default=2, + help='Number of layers in LSTM') + parser.add_argument('-max_history_len', default=60, + help='Size of the multimodal embedding') parser.add_argument('-dropout', default=0.5, help='Dropout') return parser @@ -24,13 +31,14 @@ def __init__(self, args): super().__init__() self.args = args - self.word_embed = nn.Embedding(args.vocab_size, args.embed_size, padding_idx=0) - + self.word_embed = nn.Embedding( + args.vocab_size, args.embed_size, padding_idx=0) + if 'dialog' in args.input_type or 'caption' in args.input_type: self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, args.num_layers, batch_first=True, dropout=args.dropout) self.hist_rnn = DynamicRNN(self.hist_rnn) - + self.ques_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, args.num_layers, batch_first=True, dropout=args.dropout) # questions and history are right padded sequences of variable length @@ -42,18 +50,20 @@ def __init__(self, args): if args.input_type == 'question_only': fusion_size = args.rnn_hidden_size if args.input_type == 'question_dialog': - fusion_size =args.rnn_hidden_size * 2 + fusion_size = args.rnn_hidden_size * 2 if args.input_type == 'question_audio': - fusion_size =args.rnn_hidden_size + args.audio_feature_size - if args.input_type == 'question_image' or args.input_type=='question_video': - fusion_size = args.img_feature_size + args.rnn_hidden_size - if args.input_type == 'question_caption_image' or args.input_type=='question_dialog_video' or args.input_type=='question_dialog_image': + fusion_size = args.rnn_hidden_size + args.audio_feature_size + if args.input_type == 'question_image' or args.input_type == 'question_video': + fusion_size = args.img_feature_size + args.rnn_hidden_size + if args.input_type == 'question_caption_image' or args.input_type == 'question_dialog_video' or args.input_type == 'question_dialog_image': fusion_size = args.img_feature_size + args.rnn_hidden_size * 2 if args.input_type == 'question_video_audio': - fusion_size = args.img_feature_size + args.rnn_hidden_size + args.audio_feature_size + fusion_size = args.img_feature_size + \ + args.rnn_hidden_size + args.audio_feature_size if args.input_type == 'question_dialog_video_audio': - fusion_size = args.img_feature_size + args.rnn_hidden_size * 2 + args.audio_feature_size - + fusion_size = args.img_feature_size + \ + args.rnn_hidden_size * 2 + args.audio_feature_size + self.fusion = nn.Linear(fusion_size, args.rnn_hidden_size) if args.weight_init == 'xavier': @@ -69,7 +79,7 @@ def forward(self, batch): img = img.view(-1, 1, self.args.img_feature_size) img = img.repeat(1, self.args.max_ques_count, 1) img = img.view(-1, self.args.img_feature_size) - + if 'audio' in self.args.input_type: audio = batch['audio_feat'] # repeat audio feature vectors to be provided for every round @@ -83,21 +93,21 @@ def forward(self, batch): vid = vid.view(-1, 1, self.args.vid_feature_size) vid = vid.repeat(1, self.args.max_ques_count, 1) vid = vid.view(-1, self.args.vid_feature_size) - + if 'dialog' in self.args.input_type or 'caption' in self.args.input_type: hist = batch['hist'] # embed history hist = hist.view(-1, hist.size(2)) hist_embed = self.word_embed(hist) hist_embed = self.hist_rnn(hist_embed, batch['hist_len']) - + ques = batch['ques'] # embed questions ques = ques.view(-1, ques.size(2)) ques_embed = self.word_embed(ques) ques_embed = self.ques_rnn(ques_embed, batch['ques_len']) - + if self.args.input_type == 'question_only': 
fused_vector = ques_embed if self.args.input_type == 'question_dialog': @@ -106,9 +116,9 @@ def forward(self, batch): fused_vector = torch.cat((audio, ques_embed), 1) if self.args.input_type == 'question_image': fused_vector = torch.cat((img, ques_embed), 1) - if self.args.input_type=='question_video': + if self.args.input_type == 'question_video': fused_vector = torch.cat((vid, ques_embed), 1) - if self.args.input_type=='question_dialog_image': + if self.args.input_type == 'question_dialog_image': fused_vector = torch.cat((img, ques_embed, hist_embed), 1) if self.args.input_type == 'question_dialog_video': fused_vector = torch.cat((vid, ques_embed, hist_embed), 1) @@ -118,7 +128,7 @@ def forward(self, batch): fused_vector = torch.cat((vid, audio, ques_embed), 1) if self.args.input_type == 'question_dialog_video_audio': fused_vector = torch.cat((vid, audio, ques_embed, hist_embed), 1) - + fused_vector = self.dropout(fused_vector) fused_embedding = F.tanh(self.fusion(fused_vector)) diff --git a/train.py b/train.py index 071ad4a..0c24494 100644 --- a/train.py +++ b/train.py @@ -21,43 +21,57 @@ parser.add_argument_group('Input modalites arguments') parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') + 'question_dialog', + 'question_audio', + 'question_image', + 'question_video', + 'question_caption_image', + 'question_dialog_video', + 'question_dialog_image', + 'question_video_audio', + 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Encoder Decoder choice arguments') -parser.add_argument('-encoder', default='lf-ques-im-hist', choices=['lf-ques-im-hist'], help='Encoder to use for training') -parser.add_argument('-concat_history', default=True, help='True for lf encoding') -parser.add_argument('-decoder', default='disc', choices=['disc'], help='Decoder to use for training') +parser.add_argument('-encoder', default='lf-ques-im-hist', + choices=['lf-ques-im-hist'], help='Encoder to use for training') +parser.add_argument('-concat_history', default=True, + help='True for lf encoding') +parser.add_argument('-decoder', default='disc', + choices=['disc'], help='Decoder to use for training') parser.add_argument_group('Optimization related arguments') parser.add_argument('-num_epochs', default=20, type=int, help='Epochs') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') parser.add_argument('-lr', default=1e-3, type=float, help='Learning rate') -parser.add_argument('-lr_decay_rate', default=0.9997592083, type=float, help='Decay for lr') -parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') -parser.add_argument('-weight_init', default='xavier', choices=['xavier', 'kaiming'], help='Weight initialization strategy') -parser.add_argument('-weight_decay', default=0.00075, help='Weight decay for l2 regularization') -parser.add_argument('-overfit', action='store_true', help='Overfit on 5 examples, meant for debugging') +parser.add_argument('-lr_decay_rate', default=0.9997592083, + type=float, help='Decay for lr') +parser.add_argument('-min_lr', default=5e-5, type=float, + help='Minimum learning rate') +parser.add_argument('-weight_init', default='xavier', + choices=['xavier', 'kaiming'], help='Weight initialization strategy') 
+parser.add_argument('-weight_decay', default=0.00075, + help='Weight decay for l2 regularization') +parser.add_argument('-overfit', action='store_true', + help='Overfit on 5 examples, meant for debugging') parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') parser.add_argument_group('Checkpointing related arguments') -parser.add_argument('-load_path', default='', help='Checkpoint to load path from') -parser.add_argument('-save_path', default='checkpoints/', help='Path to save checkpoints') -parser.add_argument('-save_step', default=2, type=int, help='Save checkpoint after every save_step epochs') +parser.add_argument('-load_path', default='', + help='Checkpoint to load path from') +parser.add_argument('-save_path', default='checkpoints/', + help='Path to save checkpoints') +parser.add_argument('-save_step', default=2, type=int, + help='Save checkpoint after every save_step epochs') +parser.add_argument( + '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_480p_scaled", help=".h5 file path for the charades s3d features.") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- args = parser.parse_args() -start_time = datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') +start_time = datetime.datetime.strftime( + datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') if args.save_path == 'checkpoints/': args.save_path += start_time @@ -114,7 +128,8 @@ setattr(model_args, key, getattr(dataset, key)) # iterations per epoch -setattr(args, 'iter_per_epoch', math.ceil(dataset.num_data_points['train'] / args.batch_size)) +setattr(args, 'iter_per_epoch', math.ceil( + dataset.num_data_points['train'] / args.batch_size)) print("{} iter per epoch.".format(args.iter_per_epoch)) # ---------------------------------------------------------------------------- @@ -123,9 +138,11 @@ encoder = Encoder(model_args) decoder = Decoder(model_args, encoder) -optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) +optimizer = optim.Adam(list(encoder.parameters( +)) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss() -scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_decay_rate) +scheduler = lr_scheduler.StepLR( + optimizer, step_size=1, gamma=args.lr_decay_rate) if args.load_path != '': encoder.load_state_dict(components['encoder']) @@ -149,7 +166,8 @@ running_loss = 0.0 train_begin = datetime.datetime.utcnow() -print("Training start time: {}".format(datetime.datetime.strftime(train_begin, '%d-%b-%Y-%H:%M:%S'))) +print("Training start time: {}".format( + datetime.datetime.strftime(train_begin, '%d-%b-%Y-%H:%M:%S'))) log_loss = [] for epoch in range(1, model_args.num_epochs + 1): @@ -185,7 +203,6 @@ if optimizer.param_groups[0]['lr'] > args.min_lr: scheduler.step() - # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- @@ -217,8 +234,8 @@ # print current time, running average, learning rate, iteration, epoch print("[{}][Epoch: {:3d}][Iter: {:6d}][Loss: {:6f}][val loss: {:6f}][lr: {:7f}]".format( datetime.datetime.utcnow() - train_begin, epoch, - iteration, running_loss, validation_loss, - optimizer.param_groups[0]['lr'])) + iteration, running_loss, validation_loss, + 
optimizer.param_groups[0]['lr'])) # ------------------------------------------------------------------------ # save checkpoints and final model @@ -239,4 +256,3 @@ }, os.path.join(args.save_path, 'model_final.pth')) np.save(os.path.join(args.save_path, 'log_loss'), log_loss) - diff --git a/utils/__pycache__/__init__.cpython-36.pyc b/utils/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 0bbfee287e61caf64bdc2e804e20414e78aacbef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 266 zcmX|5yG{c!5VX&C5jt|>Cn&fE7ikfqprN?}IyWopY$X{y@O+neH8ppQ`AENT^BOUaL)o)oLAlWCI}@$tJ=`1u6W{wjqN8d zrz2nb$5Yh7&Ck;`ie=bink^Baj<2*_(VWl{HBzpSNjTsY`Q?1ICE8hU)E6 zdpKn02}Lomt<)8IW4Nli()W#wHt$lfa;Lox=r&Rx$;jWJ1aQ%1fZP3Pxws3-9K5}+ Jjj8@v^BEJ{4@BgN66pg zt;YuWD;R156DORe#K(V@l8pM4qHLusvwX|Qc4}uGziZ@9+RGf@q2xQlyS#Tvcu&xI ze`)zHr%%Ypc?~+rWJDX5=P)}zgfa<_QvtF&E%QhxMINT`c4HAn&x1wCx!{2kFG`U| zBEr+_$65*M3lIeHDJMRANv4!j&MqC_;tscY=aTxi=(OG}so#A`Y~uI8mww|uIM2f@ zi5^cT5nE})e;lY?-GE7xC7IJDp`@Y}2^c4EmQF@i(^2I@$i@kRBu{h@G<})pK~`{) zj#y(WkxpeFG{9~8K@g>(QbF(sdGqN(E_7UE;vf#wFq6r>gYZQ0gHk7{I^eBO0q7bp z&ZPqiP>I5@5bDy4>sznKYtxg+e1;vnFgh@d|G$G!kvXZ@l0LI)`&UwTDmn**ej!U% zcQvcJmCYHqURiaovbg=cFS)jA$KVL4T{@NXfGlaBuuG48GgzY>|B|bSd+9aJ}O0k*AV;2GLq!gnq~2LV(Yqj$#AC zT?}l{T5eJCqC!fc7DdkC^c95=e;o_2C^dx3xQ4GA_q?~qdSoIMCqu=7WY4f zcZCMK;kw=|9E|Ee{Z$C)v$+!zOC`C&W6QjV4`(9K5J;;TXB&4Rs@ra9BNr% zARJMNMcN?Q^j0__tR_@se~%6*qXXtpo9@$F%%(TkKE2p~7u-$25gr_0qZaIe8Y&o6 zHwJyc;+-XQ7Y>#3l?649*35jx`pRmku-dMu?kt%+1bTF9s(TgGOC(FhDtl@HO}Y~k zvlaiK?{T3bVNyUMzaVlQQ=K3x07BzNMXr-sS(K`=PZDw3xCvw%BrSAAiFLlQd8ot2 zMpBGAz73cw`96GTvGGv4jgukWhI#TPtR28mxJ1}(x(ofKOC5S~>n(iVVG(ve2iqPr zPXTSu4Y28zDawN&cu|IF`^7Xb`2px|23TW~D8t6bjp1gv>WNr`f?(?#gmedvn2^dY zNZ10E5$M2nKQwEj9r+P%!enW7uNLzb6Q^CwbBV37q2K!Z>y@$Tv}2%>$%Ur-y>Z_t a-ulr>$fd?PY%h2p6I@{qIKV6Ov;G6N^Lowz diff --git a/utils/__pycache__/eval_utils.cpython-36.pyc b/utils/__pycache__/eval_utils.cpython-36.pyc deleted file mode 100644 index 22b81d34d1b9a4b0e6d735c775ea88e08a97b24a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1510 zcmZ`(L2u(k7@hIhj^j4jRaL=8n>3b?kP= zDc!7nGFScp`~nXAAzwK~`~xoVzHwR_5VmGK-}B7#_UWx&`v<9r^rB#YEc5`&WN(NRrr}b>FUR4nmg7|sr+OUShUv@%rjC`~j=jc5bJ%r2R3(+j9K;U%axxQ*`8TTO7jWecl(( z;PpN~usr^exiL2PAy-4peege(hKKTyAZYzhCFUpVJF7xjD6NFoEP;b@D#YDc0aqqA zMKwo}II8Ts7ln|HiJn#DBvIM}ln2v01BP;Oe!*@|P%?0mK4XL!DYi0lP%v^;G|oKDGm<$@)0v_pYm$RXp=94f Date: Sat, 14 Nov 2020 10:02:37 -0500 Subject: [PATCH 04/49] add video extention mp4 to the keys --- .gitignore | 2 +- dataloader.py | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index ba74f36..8ef9b08 100644 --- a/.gitignore +++ b/.gitignore @@ -37,7 +37,7 @@ log/ launcher.sh # IPython Notebook .ipynb_checkpoints - +data # virtualenv venv/ .vscode diff --git a/dataloader.py b/dataloader.py index 15c6c00..0473695 100644 --- a/dataloader.py +++ b/dataloader.py @@ -111,13 +111,13 @@ def __init__(self, args, subsets): if 'video' in args.input_type: print("Reading video features...") # Charades dataset features are all saved in one h5 file as a key, feat dictionary - vid_feats = hdfdict.load( - args.input_vid + "_{0}.h5".format(dtype)) + #vid_feats = hdfdict.load( + # args.input_vid + "_{0}.h5".format(dtype)) # If this throws an error because it cannot find the video filename,uncomment below - # vid_feats = hdfdict.load( - # args.input_vid + "_{0}.h5".format("train")) - # vid_feats.update(hdfdict.load( - # args.input_vid + 
"_{0}.h5".format("test"))) + vid_feats = hdfdict.load( + args.input_vid + "_{0}.h5".format("train")) + vid_feats.update(hdfdict.load( + args.input_vid + "_{0}.h5".format("test"))) img_fnames = getattr(self, 'unique_img_' + dtype) self.data[dtype + '_img_fnames'] = img_fnames @@ -200,17 +200,21 @@ def __len__(self): return self.num_data_points[self._split] def __getitem__(self, idx): + dtype = self._split item = {'index': idx} item['num_rounds'] = self.data[dtype + '_num_rounds'][idx] - - # get video features - if 'video' in self.args.input_type: - # item['img_fnames'] is as train_val/vid_id.jpg hence the splits - item['img_fnames'] = self.data[dtype + '_img_fnames'][idx] - vid_id = item['img_fnames'].split("/")[-1].split(".")[0] - item['vid_feat'] = torch.from_numpy( - self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) + try: + # get video features + if 'video' in self.args.input_type: + # item['img_fnames'] is as train_val/vid_id.jpg hence the splits + item['img_fnames'] = self.data[dtype + '_img_fnames'][idx] + vid_id = item['img_fnames'].split("/")[-1].split(".")[0] + '.mp4' + item['vid_feat'] = torch.from_numpy( + self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) + except: + import pdb + pdb.set_trace() # get image features if 'image' in self.args.input_type: From 1fef60e4fd55ac4b9ad7909d8d982d35d51b7100 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Sat, 14 Nov 2020 13:04:37 -0500 Subject: [PATCH 05/49] Adds S3D features and model into the network Signed-off-by: Apoorva Beedu --- dataloader.py | 92 +++++++++++-- encoders/__init__.py | 2 +- encoders/__init__.pyc | Bin 436 -> 0 bytes encoders/lf.py | 25 +++- encoders/lf.pyc | Bin 4709 -> 0 bytes encoders/s3dg_video.py | 291 +++++++++++++++++++++++++++++++++++++++++ train.py | 22 +++- 7 files changed, 412 insertions(+), 20 deletions(-) delete mode 100644 encoders/__init__.pyc delete mode 100644 encoders/lf.pyc create mode 100644 encoders/s3dg_video.py diff --git a/dataloader.py b/dataloader.py index 15c6c00..34d15cd 100644 --- a/dataloader.py +++ b/dataloader.py @@ -7,11 +7,16 @@ import hdfdict import numpy as np from tqdm import tqdm +import ffmpeg +import random import torch +import torch as th import torch.nn.functional as F from torch.utils.data import Dataset +video_root = './data/charades/videos' + class VisDialDataset(Dataset): @@ -111,13 +116,13 @@ def __init__(self, args, subsets): if 'video' in args.input_type: print("Reading video features...") # Charades dataset features are all saved in one h5 file as a key, feat dictionary - vid_feats = hdfdict.load( - args.input_vid + "_{0}.h5".format(dtype)) - # If this throws an error because it cannot find the video filename,uncomment below # vid_feats = hdfdict.load( - # args.input_vid + "_{0}.h5".format("train")) - # vid_feats.update(hdfdict.load( - # args.input_vid + "_{0}.h5".format("test"))) + # args.input_vid + "_{0}.h5".format(dtype)) + # If this throws an error because it cannot find the video filename,uncomment below + vid_feats = hdfdict.load( + args.input_vid + "_{0}.h5".format("train")) + vid_feats.update(hdfdict.load( + args.input_vid + "_{0}.h5".format("test"))) img_fnames = getattr(self, 'unique_img_' + dtype) self.data[dtype + '_img_fnames'] = img_fnames @@ -199,6 +204,65 @@ def split(self, split): def __len__(self): return self.num_data_points[self._split] + def _get_video(self, video_path, start=0, end=0): + ''' + :param video_path: Path of the video file + start: Start time for the video + end: End time. + :return: video: video_frames. 
+ ''' + # start_seek = random.randint(start, int(max(start, end - self.num_sec))) + start_seek = 0 + cmd = ( + ffmpeg + .input(video_path) + .filter('fps', fps=self.args.fps) + ) + if self.args.center_crop: + aw, ah = 0.5, 0.5 + else: + aw, ah = random.uniform(0, 1), random.uniform(0, 1) + if self.args.crop_only: + ''' + Changes from the original code, because we have few videos that have <224 resolution and needs to be scaled up after cropping, and cropping needs to take care of the size of the image which it did not before. + cmd = (cmd.crop('(iw - {})*{}'.format(self.args.video_size, aw), + '(ih - {})*{}'.format(self.args.video_size, ah), + str(self.args.video_size), str(self.args.video_size)) + )''' + cmd = ( + cmd.crop('max(0, (iw - {}))*{}'.format(self.args.video_size, aw), + 'max(0, (ih - {}))*{}'.format(self.args.video_size, ah), + 'min(iw, {})'.format(self.args.video_size), + 'min(ih, {})'.format(self.args.video_size)) + .filter('scale', self.args.video_size, self.args.video_size) + ) + else: + cmd = ( + cmd.crop('(iw - max(0, min(iw,ih)))*{}'.format(aw), + '(ih - max(0, min(iw,ih)))*{}'.format(ah), + 'min(iw,ih)', + 'min(iw,ih)') + .filter('scale', self.args.video_size, self.args.video_size) + ) + if self.args.random_flip and random.uniform(0, 1) > 0.5: + cmd = cmd.hflip() + out, _ = ( + cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') + .run(capture_stdout=True, quiet=True) + ) + video = np.frombuffer(out, np.uint8).reshape( + [-1, self.args.video_size, self.args.video_size, 3]) + video = th.from_numpy(video) + video = video.permute(3, 0, 1, 2) + if video.shape[1] < self.args.num_frames: + zeros = th.zeros( + (3, self.args.num_frames - video.shape[1], self.args.video_size, self.args.video_size), dtype=th.uint8) + video = th.cat((video, zeros), axis=1) + # Gets n_frames from tne entire video, linearly spaced + vid_indices = np.linspace( + 0, video.shape[1]-1, self.args.num_frames, dtype=int) + return video[:, vid_indices] + def __getitem__(self, idx): dtype = self._split item = {'index': idx} @@ -206,11 +270,21 @@ def __getitem__(self, idx): # get video features if 'video' in self.args.input_type: - # item['img_fnames'] is as train_val/vid_id.jpg hence the splits item['img_fnames'] = self.data[dtype + '_img_fnames'][idx] + # item['img_fnames'] is as train_val/vid_id.jpg hence the splits vid_id = item['img_fnames'].split("/")[-1].split(".")[0] - item['vid_feat'] = torch.from_numpy( - self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) + if ".mp4" not in vid_id: + vid_id = vid_id + ".mp4" + + if self.args.finetune: + f_dtype = "train_val" + if dtype == "test": + f_dtype + "test" + video_path = os.path.join(video_root, f_dtype, vid_id) + item['vid_feat'] = self._get_video(video_path) + else: + item['vid_feat'] = torch.from_numpy( + self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) # get image features if 'image' in self.args.input_type: diff --git a/encoders/__init__.py b/encoders/__init__.py index 6dccef4..4c92256 100644 --- a/encoders/__init__.py +++ b/encoders/__init__.py @@ -1,3 +1,4 @@ +from .s3dg_video import S3D from .lf import LateFusionEncoder @@ -6,4 +7,3 @@ def Encoder(model_args): 'lf-ques-im-hist': LateFusionEncoder } return name_enc_map[model_args.encoder](model_args) - diff --git a/encoders/__init__.pyc b/encoders/__init__.pyc deleted file mode 100644 index 1f9c75869408571886f3ce7ff1583f187b63fe2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 436 zcmbVIJx{|h5IrXdst5#(40O&CyfGmJ;$uP`8i}DovO?lgqu436gFu3b{#5t{`~lvX 
zNbFd5@9ewtdp@V%JK5K#y{l(zYBVU5j_ezK==?K=KhJ_1-A3zsn@les%rJ%!#0ktvRa`@*=m{NOcA`qPA)pD<+E|Mc0rbv zYwo)jd45}7%7N8OrHnH{siWm~l6>VTRL(cFqTt1fMa+e;J6pSV9vK%?EO0uMt>{6{*DvYcuu~b5V0EUCv z0Wby{1N}< z%bkDyGycq1Tu4C=hURB;+8sW zVcb-qrN;AWXsMv3zEJ8ysZS3VsBzGrhxX`y_&MlX=xQ5Hl9+x6{b7;#I!Y67ay9q( zEb-#VcMcBp9k4v{bohRegXz7*PlGVKTmzN8gD?Lvnh@SwDzv#6Isxw+^Z|Xi9oW)o zDreB=(tUcvuZ8bjeEHwdoGUe!P(a$^084^G=z)eNmv>>=6=A+cSf~@)HA1IOSQNsN zs9LTQR)o+M!fKsxM+j>|Sg#W{gs^#2;;s<3gs@#n4Ej4*-9dg2Z5hN~J`Me782LRf zJ1*id(Rt2jKaApIcNluQ$U--deiLeLVGs_zVxl9piange#3-Ch^9Sg>JMxkwob<@r zONYHE_Kw3|slaGFiGmr8x5|MjOfTiu>4{eaQTol2!geXSNXalh3Ip*zdL5gb_b@pq z9n_<+7bVj|_fFF+=m~2OCC53IrXFTV;*O#q2ooc&aK>(k#|%ZB2HvD5E{2mLb|>Ch znB@@ZNPbCbM|Zr(TwaSsy(mjU>+gD2>DYlPK|V2=)hF z5*($^_TT z-;q8tuBi|ZWG*L>bPb#4R+R2awyz*X#DNu+z%%&p?Uk%nN|>^u1T)Zpq@AzKNf~jM zWr&gp=WUgBxoUX$+@?{pil8l|M_^k}r z#YpFxz={a`u@dOc4p&9~FEjG=YS!o-;rjOs*Vb*`tqJ30uV@dbMvk1o z+@QaSV|$>P*;AZ*6bUy9o<$fc&2$PPZ;~EMUi#jYFebc5p7uxXFv@amil@n0O}Lrr zWwgI8!kp}GnoQ1c5^FO?r&JHUQYT26o1KzcJ3A$1RNt)&mae&mn@hShZ;ZJ{=r)Rx zVR~Q62(xRv*{u`m%1Sze^vpYn!b}43)Qe&SvA>6D-9(cYQ;=93TsKN0?YaUktSb&C zm2RVXkHwt9*Q_-ssqY;bVB+wq(hHR(-IX~RN}YH0*)$Zrthy5IYgJx&uS&P@ew-Dd z!<`ZLSfO1=IZ}7&FykRJT%)PPH{ePN<_$chz!Q{DFOx$wl&g&#oYOEm9%+e$_`s0J z_L>;FMG_6uEEX}tD{O(MJy}*6-%CFF>8DAqy+jM=QRL-v{gF`~PKM5p=sr)l@k1ip z$ME|Agu{*9!k52E4c4;NwqLUMtX*r<+O*L&*6eM&Yb{w_YZc$RwTJJawT1pYe0xH@ zkKTPtpo4z#oCtLM2S5TyKtaGA4d?+F$YQ2DGIj@*5LIJbUDisb2I@VlEJ?V*0Wpj) zu5xoVl1N>N*kxe`bj%q0yBKT1SVNto0$nqP%OQ*z(yhj7Ql(yqz*vJQK**@C>=!Jl zz>A&d@Otisv3W6ONVgiRNtJpb0%LPx3?NrCmOr&nit97kpTztk%p(ZR(mXprmL-;2 z>N4?xt|n)m17o2SiUe!{Ks#h=l^7SSKp7S}UnxVUuFR|-%HSuIEr_y3b&d`K(KX7< zEUaM40Mr8;9M9Wgq*_v7Q0W63$Pdh?ik2E;yRhl-=o+&@rqhZUyR`82%m#seUoqoo zy@1(Z&|fQNY{?6lHwCb)M(ML?rrYxPxW(-=;{4(QAhR@8Z0wzg7oCwAOwSMltfRa9o~=~0F ziFRHi3vb<(=a-K%zQOq9a$ZKq;rrKl6^$$yZwY^dD-w#orwxWNO7DwlV8O_Fgkhh> z)%htJ5t2)b2qC`C&p1sDFzmd=0RtMD0 z@O&R=k^2L{@6^iz+Ri_xB5Td+09Wv9-`0CYs9kHv?lg9RDtE!vhRp7@Zx7@8_mbVU zx2?V)J2&5T@dt?zmlFoTb@B2mCLD6X09PIPai~XW;E*mL=nT-iTC4QF+hfaN^l3b& z8!fxL+yYzHHu%iVL^k^oZYJDf=L1d?F$*0{a&udu{DwZ diff --git a/encoders/s3dg_video.py b/encoders/s3dg_video.py new file mode 100644 index 0000000..cfc9445 --- /dev/null +++ b/encoders/s3dg_video.py @@ -0,0 +1,291 @@ + +"""Contains a PyTorch definition for Gated Separable 3D network (S3D-G) +with a text module for computing joint text-video embedding from raw text +and video input. The following code will enable you to load the HowTo100M +pretrained S3D Text-Video model from: + A. Miech, J.-B. Alayrac, L. Smaira, I. Laptev, J. Sivic and A. Zisserman, + End-to-End Learning of Visual Representations from Uncurated Instructional Videos. + https://arxiv.org/abs/1912.06430. + +S3D-G was proposed by: + S. Xie, C. Sun, J. Huang, Z. Tu and K. Murphy, + Rethinking Spatiotemporal Feature Learning For Video Understanding. + https://arxiv.org/abs/1712.04851. + Tensorflow code: https://github.com/tensorflow/models/blob/master/research/slim/nets/s3dg.py + +The S3D architecture was slightly modified with a space to depth trick for TPU +optimization. 
+""" + +import torch as th +import torch.nn.functional as F +import torch.nn as nn +import os +import numpy as np +import re + + +class InceptionBlock(nn.Module): + def __init__( + self, + input_dim, + num_outputs_0_0a, + num_outputs_1_0a, + num_outputs_1_0b, + num_outputs_2_0a, + num_outputs_2_0b, + num_outputs_3_0b, + gating=True, + ): + super(InceptionBlock, self).__init__() + self.conv_b0 = STConv3D(input_dim, num_outputs_0_0a, [1, 1, 1]) + self.conv_b1_a = STConv3D(input_dim, num_outputs_1_0a, [1, 1, 1]) + self.conv_b1_b = STConv3D( + num_outputs_1_0a, num_outputs_1_0b, [3, 3, 3], padding=1, separable=True + ) + self.conv_b2_a = STConv3D(input_dim, num_outputs_2_0a, [1, 1, 1]) + self.conv_b2_b = STConv3D( + num_outputs_2_0a, num_outputs_2_0b, [3, 3, 3], padding=1, separable=True + ) + self.maxpool_b3 = th.nn.MaxPool3d((3, 3, 3), stride=1, padding=1) + self.conv_b3_b = STConv3D(input_dim, num_outputs_3_0b, [1, 1, 1]) + self.gating = gating + self.output_dim = ( + num_outputs_0_0a + num_outputs_1_0b + num_outputs_2_0b + num_outputs_3_0b + ) + if gating: + self.gating_b0 = SelfGating(num_outputs_0_0a) + self.gating_b1 = SelfGating(num_outputs_1_0b) + self.gating_b2 = SelfGating(num_outputs_2_0b) + self.gating_b3 = SelfGating(num_outputs_3_0b) + + def forward(self, input): + """Inception block + """ + b0 = self.conv_b0(input) + b1 = self.conv_b1_a(input) + b1 = self.conv_b1_b(b1) + b2 = self.conv_b2_a(input) + b2 = self.conv_b2_b(b2) + b3 = self.maxpool_b3(input) + b3 = self.conv_b3_b(b3) + if self.gating: + b0 = self.gating_b0(b0) + b1 = self.gating_b1(b1) + b2 = self.gating_b2(b2) + b3 = self.gating_b3(b3) + return th.cat((b0, b1, b2, b3), dim=1) + + +class SelfGating(nn.Module): + def __init__(self, input_dim): + super(SelfGating, self).__init__() + self.fc = nn.Linear(input_dim, input_dim) + + def forward(self, input_tensor): + """Feature gating as used in S3D-G. 
+ """ + spatiotemporal_average = th.mean(input_tensor, dim=[2, 3, 4]) + weights = self.fc(spatiotemporal_average) + weights = th.sigmoid(weights) + return weights[:, :, None, None, None] * input_tensor + + +class STConv3D(nn.Module): + def __init__( + self, input_dim, output_dim, kernel_size, stride=1, padding=0, separable=False + ): + super(STConv3D, self).__init__() + self.separable = separable + self.relu = nn.ReLU(inplace=True) + assert len(kernel_size) == 3 + if separable and kernel_size[0] != 1: + spatial_kernel_size = [1, kernel_size[1], kernel_size[2]] + temporal_kernel_size = [kernel_size[0], 1, 1] + if isinstance(stride, list) and len(stride) == 3: + spatial_stride = [1, stride[1], stride[2]] + temporal_stride = [stride[0], 1, 1] + else: + spatial_stride = [1, stride, stride] + temporal_stride = [stride, 1, 1] + if isinstance(padding, list) and len(padding) == 3: + spatial_padding = [0, padding[1], padding[2]] + temporal_padding = [padding[0], 0, 0] + else: + spatial_padding = [0, padding, padding] + temporal_padding = [padding, 0, 0] + if separable: + self.conv1 = nn.Conv3d( + input_dim, + output_dim, + kernel_size=spatial_kernel_size, + stride=spatial_stride, + padding=spatial_padding, + bias=False, + ) + self.bn1 = nn.BatchNorm3d(output_dim) + self.conv2 = nn.Conv3d( + output_dim, + output_dim, + kernel_size=temporal_kernel_size, + stride=temporal_stride, + padding=temporal_padding, + bias=False, + ) + self.bn2 = nn.BatchNorm3d(output_dim) + else: + self.conv1 = nn.Conv3d( + input_dim, + output_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False, + ) + self.bn1 = nn.BatchNorm3d(output_dim) + + def forward(self, input): + out = self.relu(self.bn1(self.conv1(input))) + if self.separable: + out = self.relu(self.bn2(self.conv2(out))) + return out + + +class MaxPool3dTFPadding(th.nn.Module): + def __init__(self, kernel_size, stride=None, padding="SAME"): + super(MaxPool3dTFPadding, self).__init__() + if padding == "SAME": + padding_shape = self._get_padding_shape(kernel_size, stride) + self.padding_shape = padding_shape + self.pad = th.nn.ConstantPad3d(padding_shape, 0) + self.pool = th.nn.MaxPool3d(kernel_size, stride, ceil_mode=True) + + def _get_padding_shape(self, filter_shape, stride): + def _pad_top_bottom(filter_dim, stride_val): + pad_along = max(filter_dim - stride_val, 0) + pad_top = pad_along // 2 + pad_bottom = pad_along - pad_top + return pad_top, pad_bottom + + padding_shape = [] + for filter_dim, stride_val in zip(filter_shape, stride): + pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val) + padding_shape.append(pad_top) + padding_shape.append(pad_bottom) + depth_top = padding_shape.pop(0) + depth_bottom = padding_shape.pop(0) + padding_shape.append(depth_top) + padding_shape.append(depth_bottom) + return tuple(padding_shape) + + def forward(self, inp): + inp = self.pad(inp) + out = self.pool(inp) + return out + + +class S3D(nn.Module): + def __init__(self, dict_path, num_classes=512, gating=True, space_to_depth=True): + super(S3D, self).__init__() + self.num_classes = num_classes + self.gating = gating + self.space_to_depth = space_to_depth + if space_to_depth: + self.conv1 = STConv3D( + 24, 64, [2, 4, 4], stride=1, padding=(1, 2, 2), separable=False + ) + else: + self.conv1 = STConv3D( + 3, 64, [3, 7, 7], stride=2, padding=(1, 3, 3), separable=False + ) + self.conv_2b = STConv3D(64, 64, [1, 1, 1], separable=False) + self.conv_2c = STConv3D(64, 192, [3, 3, 3], padding=1, separable=True) + self.gating = SelfGating(192) + 
self.maxpool_2a = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding="SAME" + ) + self.maxpool_3a = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding="SAME" + ) + self.mixed_3b = InceptionBlock(192, 64, 96, 128, 16, 32, 32) + self.mixed_3c = InceptionBlock( + self.mixed_3b.output_dim, 128, 128, 192, 32, 96, 64 + ) + self.maxpool_4a = MaxPool3dTFPadding( + kernel_size=(3, 3, 3), stride=(2, 2, 2), padding="SAME" + ) + self.mixed_4b = InceptionBlock( + self.mixed_3c.output_dim, 192, 96, 208, 16, 48, 64 + ) + self.mixed_4c = InceptionBlock( + self.mixed_4b.output_dim, 160, 112, 224, 24, 64, 64 + ) + self.mixed_4d = InceptionBlock( + self.mixed_4c.output_dim, 128, 128, 256, 24, 64, 64 + ) + self.mixed_4e = InceptionBlock( + self.mixed_4d.output_dim, 112, 144, 288, 32, 64, 64 + ) + self.mixed_4f = InceptionBlock( + self.mixed_4e.output_dim, 256, 160, 320, 32, 128, 128 + ) + self.maxpool_5a = self.maxPool3d_5a_2x2 = MaxPool3dTFPadding( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding="SAME" + ) + self.mixed_5b = InceptionBlock( + self.mixed_4f.output_dim, 256, 160, 320, 32, 128, 128 + ) + self.mixed_5c = InceptionBlock( + self.mixed_5b.output_dim, 384, 192, 384, 48, 128, 128 + ) + self.fc = nn.Linear(self.mixed_5c.output_dim, num_classes) + + ''' + if init == 'kaiming_normal': + for m in self.modules(): + if isinstance(m, nn.Conv3d): + nn.init.kaiming_normal_(m.weight, + mode='fan_in', + nonlinearity='relu') + elif isinstance(m, nn.BatchNorm3d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + ''' + + def _space_to_depth(self, input): + """3D space to depth trick for TPU optimization. + """ + B, C, T, H, W = input.shape + input = input.view(B, C, T // 2, 2, H // 2, 2, W // 2, 2) + input = input.permute(0, 3, 5, 7, 1, 2, 4, 6) + input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) + return input + + def forward(self, inputs): + """Defines the S3DG base architecture. 
+ """ + if self.space_to_depth: + inputs = self._space_to_depth(inputs) + net = self.conv1(inputs) + if self.space_to_depth: + # we need to replicate 'SAME' tensorflow padding + net = net[:, :, 1:, 1:, 1:] + net = self.maxpool_2a(net) + net = self.conv_2b(net) + net = self.conv_2c(net) + if self.gating: + net = self.gating(net) + net = self.maxpool_3a(net) + net = self.mixed_3b(net) + net = self.mixed_3c(net) + net = self.maxpool_4a(net) + net = self.mixed_4b(net) + net = self.mixed_4c(net) + net = self.mixed_4d(net) + net = self.mixed_4e(net) + net = self.mixed_4f(net) + net = self.maxpool_5a(net) + net = self.mixed_5b(net) + net = self.mixed_5c(net) + net = th.mean(net, dim=[2, 3, 4]) + return {'video_embedding': self.fc(net), 'mixed_5c': net} diff --git a/train.py b/train.py index 0c24494..661a2ca 100644 --- a/train.py +++ b/train.py @@ -4,6 +4,7 @@ import math import os import numpy as np +from tqdm import tqdm import torch from torch import nn, optim @@ -63,8 +64,21 @@ parser.add_argument('-save_step', default=2, type=int, help='Save checkpoint after every save_step epochs') parser.add_argument( - '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_480p_scaled", help=".h5 file path for the charades s3d features.") - + '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('--finetune', default=0, type=int, + help="When set true, the model finetunes the s3dg model for video") +# S3DG parameters and dataloader +parser.add_argument('--num_frames', type=int, default=16, + help='random seed') +parser.add_argument('--video_size', type=int, default=224, + help='random seed') +parser.add_argument('--fps', type=int, default=16, help='') +parser.add_argument('--crop_only', type=int, default=1, + help='random seed') +parser.add_argument('--center_crop', type=int, default=0, + help='random seed') +parser.add_argument('--random_flip', type=int, default=0, + help='random seed') # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -171,7 +185,7 @@ log_loss = [] for epoch in range(1, model_args.num_epochs + 1): - for i, batch in enumerate(dataloader): + for i, batch in tqdm(enumerate(dataloader)): optimizer.zero_grad() for key in batch: if not isinstance(batch[key], list): @@ -208,7 +222,7 @@ # -------------------------------------------------------------------- if i % 100 == 0: validation_losses = [] - for _, val_batch in enumerate(dataloader_val): + for _, val_batch in tqdm(enumerate(dataloader_val)): for key in val_batch: if not isinstance(val_batch[key], list): val_batch[key] = Variable(val_batch[key]) From 32f65949ef45253d8f1e33986423aec901e6821b Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Sat, 14 Nov 2020 13:47:28 -0500 Subject: [PATCH 06/49] Fixes some argument Signed-off-by: Apoorva Beedu --- dataloader.py | 5 ++--- train.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataloader.py b/dataloader.py index 5124287..6d695a5 100644 --- a/dataloader.py +++ b/dataloader.py @@ -15,8 +15,6 @@ import torch.nn.functional as F from torch.utils.data import Dataset -video_root = './data/charades/videos' - class VisDialDataset(Dataset): @@ -282,7 +280,8 @@ def __getitem__(self, idx): f_dtype = "train_val" if dtype == "test": f_dtype + "test" - video_path = os.path.join(video_root, f_dtype, vid_id) + 
video_path = os.path.join( + self.args.video_root, f_dtype, vid_id) item['vid_feat'] = self._get_video(video_path) else: item['vid_feat'] = torch.from_numpy( diff --git a/train.py b/train.py index 661a2ca..6f3a701 100644 --- a/train.py +++ b/train.py @@ -68,7 +68,7 @@ parser.add_argument('--finetune', default=0, type=int, help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader -parser.add_argument('--num_frames', type=int, default=16, +parser.add_argument('--num_frames', type=int, default=40, help='random seed') parser.add_argument('--video_size', type=int, default=224, help='random seed') @@ -79,6 +79,7 @@ help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') +parser.add_argument('--video_root', default='./data/charades/videos') # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- From be118d276a120453447ee73b5595fba6134500d7 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Sat, 14 Nov 2020 18:39:07 -0500 Subject: [PATCH 07/49] Adds provision for freezing all but few layers Signed-off-by: Apoorva Beedu --- dataloader.py | 13 +++++++++---- encoders/lf.py | 12 ++++++++++++ encoders/s3dg_video.py | 3 +-- train.py | 10 +++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/dataloader.py b/dataloader.py index 6d695a5..373c4ee 100644 --- a/dataloader.py +++ b/dataloader.py @@ -9,6 +9,7 @@ from tqdm import tqdm import ffmpeg import random +import pdb import torch import torch as th @@ -245,10 +246,14 @@ def _get_video(self, video_path, start=0, end=0): ) if self.args.random_flip and random.uniform(0, 1) > 0.5: cmd = cmd.hflip() - out, _ = ( - cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') - .run(capture_stdout=True, quiet=True) - ) + try: + out, _ = ( + cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') + .run(capture_stdout=True, quiet=True) + ) + except: + print(video_path, cmd) + video = np.frombuffer(out, np.uint8).reshape( [-1, self.args.video_size, self.args.video_size, 3]) video = th.from_numpy(video) diff --git a/encoders/lf.py b/encoders/lf.py index 84ae7ba..299204b 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -40,6 +40,8 @@ def __init__(self, args): self.video_embed = S3D( dict_path='data/s3d_dict.npy', space_to_depth=True) self.video_embed.train() + if self.args.unfreeze_layers: + self.__freeze_s3dg_layers() if 'dialog' in args.input_type or 'caption' in args.input_type: self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, @@ -80,6 +82,16 @@ def __init__(self, args): nn.init.kaiming_uniform(self.fusion.weight.data) nn.init.constant(self.fusion.bias.data, 0) + def __freeze_s3dg_layers(self): + # Only train _4 and _5 layers + layers = ["mixed_5c"] + if self.args.unfreeze_layers == 2: + layers = ["mixed_5b", "mixed_5c"] + for name, param in self.video_embed.named_parameters(): + param.requires_grad = False + if any(l in name for l in layers): + param.requires_grad = True + def forward(self, batch): if 'image' in self.args.input_type: img = batch['img_feat'] diff --git a/encoders/s3dg_video.py b/encoders/s3dg_video.py index cfc9445..a5c3738 100644 --- a/encoders/s3dg_video.py +++ b/encoders/s3dg_video.py @@ -238,7 +238,6 @@ def __init__(self, dict_path, num_classes=512, gating=True, space_to_depth=True) self.mixed_5c = InceptionBlock( self.mixed_5b.output_dim, 384, 192, 384, 48, 128, 128 ) - self.fc = 
nn.Linear(self.mixed_5c.output_dim, num_classes) ''' if init == 'kaiming_normal': @@ -288,4 +287,4 @@ def forward(self, inputs): net = self.mixed_5b(net) net = self.mixed_5c(net) net = th.mean(net, dim=[2, 3, 4]) - return {'video_embedding': self.fc(net), 'mixed_5c': net} + return {'mixed_5c': net} diff --git a/train.py b/train.py index 6f3a701..7a2a1bd 100644 --- a/train.py +++ b/train.py @@ -80,6 +80,8 @@ parser.add_argument('--random_flip', type=int, default=0, help='random seed') parser.add_argument('--video_root', default='./data/charades/videos') +parser.add_argument('--unfreeze_layers', default=0, type=int, + help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -153,6 +155,12 @@ encoder = Encoder(model_args) decoder = Decoder(model_args, encoder) +total_params = sum(p.numel() for p in encoder.parameters() if p.requires_grad) +print("Total number of encoder params {0}".format(total_params)) +if args.finetune: + total_params = sum(p.numel() + for p in encoder.video_embed.parameters() if p.requires_grad) + print("Total number of s3dg params {0}".format(total_params)) optimizer = optim.Adam(list(encoder.parameters( )) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss() @@ -221,7 +229,7 @@ # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- - if i % 100 == 0: + if i % 500 == 0: validation_losses = [] for _, val_batch in tqdm(enumerate(dataloader_val)): for key in val_batch: From 7ce14ca70c7b06a9545c53cbc5200dd612f731b6 Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Sat, 14 Nov 2020 19:28:24 -0500 Subject: [PATCH 08/49] update evaluate --- .gitignore | 1 + evaluate.py | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 8ef9b08..e4b3b5c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ logs/ results/ log/ launcher.sh +data # IPython Notebook .ipynb_checkpoints data diff --git a/evaluate.py b/evaluate.py index 0cebf24..c863c57 100644 --- a/evaluate.py +++ b/evaluate.py @@ -19,8 +19,10 @@ parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) - -parser.add_argument('-input_type', default='question_dialog_video_audio', choices=['question_only', +parser.add_argument('--finetune', default=0, type=int) +parser.add_argument('--fps', type=int, default=16, help='') +parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', 'question_dialog', 'question_audio', 'question_image', @@ -32,16 +34,27 @@ 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='checkpoints/13-Jun-2019-16:22:48/model_epoch_14.pth', help='Checkpoint to load path from') +parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_epoch_18.pth', help='Checkpoint to load path from') 
parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on')
 parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks')
 parser.add_argument('-batch_size', default=12, type=int, help='Batch size')
 parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use')
 parser.add_argument('-overfit', action='store_true', help='Use a batch of only 5 examples, useful for debugging')
-
+parser.add_argument('--video_root', default='data/videos/')
 parser.add_argument_group('Submission related arguments')
 parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks')
 parser.add_argument('-save_path', default='logs/ranks.json', help='Path of json file to save ranks')
+parser.add_argument('--random_flip', type=int, default=0, help='random seed')
+parser.add_argument('--crop_only', type=int, default=1,
+                    help='random seed')
+parser.add_argument('--center_crop', type=int, default=0,
+                    help='random seed')
+parser.add_argument('--num_frames', type=int, default=40,
+                    help='random seed')
+parser.add_argument('--video_size', type=int, default=224,
+                    help='random seed')
+
+
 # ----------------------------------------------------------------------------
 # input arguments and options
 # ----------------------------------------------------------------------------
From 354267bf754d4d123858989ac277ce3676bedfca Mon Sep 17 00:00:00 2001
From: Apoorva Beedu
Date: Sat, 14 Nov 2020 21:21:26 -0500
Subject: [PATCH 09/49] adds naming conventions for saving checkpoints and args.txt
Signed-off-by: Apoorva Beedu
---
 train.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/train.py b/train.py
index 7a2a1bd..0ee859a 100644
--- a/train.py
+++ b/train.py
@@ -82,6 +82,8 @@
 parser.add_argument('--video_root', default='./data/charades/videos')
 parser.add_argument('--unfreeze_layers', default=0, type=int,
                     help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers")
+parser.add_argument("--text_encoder", default="lstm",
+                    help="lstm or transformer", type=str)
 # ----------------------------------------------------------------------------
 # input arguments and options
 # ----------------------------------------------------------------------------
@@ -90,7 +92,9 @@
 start_time = datetime.datetime.strftime(
     datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S')
 if args.save_path == 'checkpoints/':
-    args.save_path += start_time
+    # args.save_path += start_time
+    args.save_path += 's3d_mixed_5c_fps_{0}_num_frames_{1}_text_encoder_{2}_lr_{3}_unfreeze_layer_{4}'.format(
+        args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers)
 # seed for reproducibility
 torch.manual_seed(1234)
@@ -186,6 +190,9 @@
 encoder.train()
 decoder.train()
 os.makedirs(args.save_path, exist_ok=True)
+with open(os.path.join(args.save_path, "args_{0}.txt".format(start_time)), "w") as f:
+    f.write(str(args))
+f.close()
 running_loss = 0.0
 train_begin = datetime.datetime.utcnow()
From a752a3c1653be4e1b7a16526ce874e958d7ff425 Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri
Date: Sun, 15 Nov 2020 06:03:26 -0500
Subject: [PATCH 10/49] update evaluation just updated my email
---
 checkpoints | 1 +
 data | 1 -
 evaluate.py | 2 +-
 train.py | 12 ++++++------
 4 files changed, 8 insertions(+), 8 deletions(-)
 create mode 120000 checkpoints
 delete mode 120000 data
diff --git a/checkpoints b/checkpoints
new file mode 120000
index 0000000..5cdef85
--- /dev/null
+++ b/checkpoints
@@ 
-0,0 +1 @@ +/srv/share/halamri3/checkpoints \ No newline at end of file diff --git a/data b/data deleted file mode 120000 index 75dea0c..0000000 --- a/data +++ /dev/null @@ -1 +0,0 @@ -/home/apoorva/hdd1/avsd \ No newline at end of file diff --git a/evaluate.py b/evaluate.py index c863c57..4fae40e 100644 --- a/evaluate.py +++ b/evaluate.py @@ -34,7 +34,7 @@ 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_epoch_18.pth', help='Checkpoint to load path from') +parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_final.pth', help='Checkpoint to load path from') parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') diff --git a/train.py b/train.py index 7a2a1bd..fb4c3d6 100644 --- a/train.py +++ b/train.py @@ -21,7 +21,7 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', +parser.add_argument('-input_type', default='question_video', choices=['question_only', 'question_dialog', 'question_audio', 'question_image', @@ -64,8 +64,8 @@ parser.add_argument('-save_step', default=2, type=int, help='Save checkpoint after every save_step epochs') parser.add_argument( - '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('--finetune', default=0, type=int, + '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('--finetune', default=1, type=int, help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader parser.add_argument('--num_frames', type=int, default=40, @@ -79,8 +79,8 @@ help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') -parser.add_argument('--video_root', default='./data/charades/videos') -parser.add_argument('--unfreeze_layers', default=0, type=int, +parser.add_argument('--video_root', default='data/videos') +parser.add_argument('--unfreeze_layers', default=1, type=int, help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") # ---------------------------------------------------------------------------- # input arguments and options @@ -89,7 +89,7 @@ args = parser.parse_args() start_time = datetime.datetime.strftime( datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') -if args.save_path == 'checkpoints/': +if args.save_path == 'checkpoints/withfinetune_notext/': args.save_path += start_time # seed for reproducibility From 67c6c9c18711ae80bdaf65242a8edeb1b266bf6a Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Sun, 15 Nov 2020 08:26:24 -0500 Subject: [PATCH 11/49] Adds finetune to the naming convention Signed-off-by: Apoorva Beedu --- train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 0ee859a..13f4b03 100644 --- a/train.py +++ b/train.py @@ -93,8 +93,8 
@@ datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') if args.save_path == 'checkpoints/': # args.save_path += start_time - args.save_path += 's3d_mixed_5c_fps_{0}_num_frames_{1}_text_encoder_{2}_lr_{3}_unfreeze_layer_{4}'.format( - args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers) + args.save_path += 's3d_mixed_5c_fps_{0}_num_frames_{1}_text_encoder_{2}_lr_{3}_unfreeze_layer_{4}_finetune_{5}'.format( + args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, finetune) # seed for reproducibility torch.manual_seed(1234) @@ -192,7 +192,7 @@ os.makedirs(args.save_path, exist_ok=True) with open(os.path.join(args.save_path, "args_{0}.txt".format(start_time)), "w") as f: f.write(str(args)) -f.close() +f.close()wf running_loss = 0.0 train_begin = datetime.datetime.utcnow() From 7ac71af95b90cd5629f8bbf0ed3f19f10f48066c Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Sun, 15 Nov 2020 12:51:14 -0500 Subject: [PATCH 12/49] Adds writing video files as npy for easy reading Signed-off-by: Apoorva Beedu --- create_npy.py | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++ dataloader.py | 13 +++-- evaluate.py | 63 ++++++++++++++++------ train.py | 14 +++-- 4 files changed, 208 insertions(+), 26 deletions(-) create mode 100644 create_npy.py diff --git a/create_npy.py b/create_npy.py new file mode 100644 index 0000000..e372679 --- /dev/null +++ b/create_npy.py @@ -0,0 +1,144 @@ +import argparse +import numpy as np +import pandas as pd +import os +from tqdm import tqdm +import ffmpeg +import h5py + +import torch +import torch as th +import torch.nn.functional as F +from torch.utils.data import Dataset +import random + + +class CustomDataset(Dataset): + + def __init__(self, args, path): + """Initialize the dataset with splits given by 'subsets', where + subsets is taken from ['train', 'val', 'test'] + """ + super(CustomDataset, self).__init__() + self.args = args + self.path = path + self.fl_list = self.get_filenames(os.path.join(args.video_root, path)) + + def __len__(self): + return len(self.fl_list) + + def get_filenames(self, path): + results = [] + results += [each for each in os.listdir(path) if each.endswith('.mp4')] + return results + + def _get_video(self, video_path, start=0, end=0): + ''' + :param video_path: Path of the video file + start: Start time for the video + end: End time. + :return: video: video_frames. + ''' + # start_seek = random.randint(start, int(max(start, end - self.num_sec))) + start_seek = 0 + cmd = ( + ffmpeg + .input(video_path) + .filter('fps', fps=self.args.fps) + ) + if self.args.center_crop: + aw, ah = 0.5, 0.5 + else: + aw, ah = random.uniform(0, 1), random.uniform(0, 1) + if self.args.crop_only: + ''' + Changes from the original code, because we have few videos that have <224 resolution and needs to be scaled up after cropping, and cropping needs to take care of the size of the image which it did not before. 
+ cmd = (cmd.crop('(iw - {})*{}'.format(self.args.video_size, aw), + '(ih - {})*{}'.format(self.args.video_size, ah), + str(self.args.video_size), str(self.args.video_size)) + )''' + cmd = ( + cmd.crop('max(0, (iw - {}))*{}'.format(self.args.video_size, aw), + 'max(0, (ih - {}))*{}'.format(self.args.video_size, ah), + 'min(iw, {})'.format(self.args.video_size), + 'min(ih, {})'.format(self.args.video_size)) + .filter('scale', self.args.video_size, self.args.video_size) + ) + else: + cmd = ( + cmd.crop('(iw - max(0, min(iw,ih)))*{}'.format(aw), + '(ih - max(0, min(iw,ih)))*{}'.format(ah), + 'min(iw,ih)', + 'min(iw,ih)') + .filter('scale', self.args.video_size, self.args.video_size) + ) + if self.args.random_flip and random.uniform(0, 1) > 0.5: + cmd = cmd.hflip() + out, _ = ( + cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') + .run(capture_stdout=True, quiet=True) + ) + + video = np.frombuffer(out, np.uint8).reshape( + [-1, self.args.video_size, self.args.video_size, 3]) + video = th.from_numpy(video) + video = video.permute(3, 0, 1, 2) + if video.shape[1] < self.args.num_frames: + zeros = th.zeros( + (3, self.args.num_frames - video.shape[1], self.args.video_size, self.args.video_size), dtype=th.uint8) + video = th.cat((video, zeros), axis=1) + # Gets n_frames from tne entire video, linearly spaced + vid_indices = np.linspace( + 0, video.shape[1]-1, self.args.num_frames, dtype=int) + return video[:, vid_indices] + + def __getitem__(self, idx): + video_file = self.fl_list[idx] + write_file = os.path.join( + self.args.write_path, video_file.replace(".mp4", ".npy")) + video_path = os.path.join( + self.args.video_root, self.path, video_file) + vid = self._get_video(video_path) + np.save(write_file, vid) + return 0 + + +def main(args): + dataloader = torch.utils.data.DataLoader( + CustomDataset(args, args.train_val_path), + batch_size=1, + shuffle=False, drop_last=True) + + dataloader_val = torch.utils.data.DataLoader( + CustomDataset(args, args.test_path), + batch_size=1, + shuffle=False, drop_last=True) + + for i, batch in tqdm(enumerate(dataloader)): + print("train ", i) + for i, batch in tqdm(enumerate(dataloader_val)): + print("val ", i) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num_frames', type=int, default=40, + help='num_frame') + parser.add_argument('--video_root', default='./data/charades/videos') + + parser.add_argument('--write_path', default="./data/charades") + parser.add_argument('--video_size', type=int, default=224, + help='random seed') + parser.add_argument('--fps', type=int, default=16, help='') + parser.add_argument('--crop_only', type=int, default=1, + help='random seed') + parser.add_argument('--center_crop', type=int, default=0, + help='random seed') + parser.add_argument('--random_flip', type=int, default=0, + help='random seed') + args = parser.parse_args() + args.train_val_path = "train_val" + args.test_path = "test" + args.write_path += "/num_frames_{}".format(args.num_frames) + os.makedirs(args.write_path, exist_ok=True) + main(args) diff --git a/dataloader.py b/dataloader.py index 373c4ee..205f153 100644 --- a/dataloader.py +++ b/dataloader.py @@ -284,10 +284,15 @@ def __getitem__(self, idx): if self.args.finetune: f_dtype = "train_val" if dtype == "test": - f_dtype + "test" - video_path = os.path.join( - self.args.video_root, f_dtype, vid_id) - item['vid_feat'] = self._get_video(video_path) + f_dtype = "test" + if self.args.use_npy: + video_path = os.path.join(numpy_path, vid_id) + item['vid_feat'] = 
np.load( + video_path.replace(".mp4", ".npy")) + else: + video_path = os.path.join( + self.args.video_root, f_dtype, vid_id) + item['vid_feat'] = self._get_video(video_path) else: item['vid_feat'] = torch.from_numpy( self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) diff --git a/evaluate.py b/evaluate.py index 0cebf24..e97fc45 100644 --- a/evaluate.py +++ b/evaluate.py @@ -21,27 +21,56 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument('-input_type', default='question_dialog_video_audio', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') + 'question_dialog', + 'question_audio', + 'question_image', + 'question_video', + 'question_caption_image', + 'question_dialog_video', + 'question_dialog_image', + 'question_video_audio', + 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='checkpoints/13-Jun-2019-16:22:48/model_epoch_14.pth', help='Checkpoint to load path from') -parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') -parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks') +parser.add_argument('-load_path', default='checkpoints/13-Jun-2019-16:22:48/model_epoch_14.pth', + help='Checkpoint to load path from') +parser.add_argument('-split', default='test', + choices=['val', 'test', 'train'], help='Split to evaluate on') +parser.add_argument('-use_gt', action='store_true', + help='Whether to use ground truth for retrieving ranks') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') -parser.add_argument('-overfit', action='store_true', help='Use a batch of only 5 examples, useful for debugging') +parser.add_argument('-overfit', action='store_true', + help='Use a batch of only 5 examples, useful for debugging') parser.add_argument_group('Submission related arguments') -parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks') -parser.add_argument('-save_path', default='logs/ranks.json', help='Path of json file to save ranks') +parser.add_argument('-save_ranks', action='store_true', + help='Whether to save retrieved ranks') +parser.add_argument('-save_path', default='logs/ranks.json', + help='Path of json file to save ranks') +parser.add_argument( + '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('--finetune', default=0, type=int, + help="When set true, the model finetunes the s3dg model for video") +# S3DG parameters and dataloader +parser.add_argument('--num_frames', type=int, default=40, + help='num_frame') +parser.add_argument('--video_size', type=int, default=224, + help='random seed') +parser.add_argument('--fps', type=int, default=16, help='') +parser.add_argument('--crop_only', type=int, default=1, + help='random seed') +parser.add_argument('--center_crop', type=int, default=0, + help='random seed') +parser.add_argument('--random_flip', type=int, default=0, + help='random seed') +parser.add_argument('--video_root', default='./data/charades/videos') 
+parser.add_argument('--unfreeze_layers', default=0, type=int, + help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") +parser.add_argument("--text_encoder", default="lstm", + help="lstm or transformer", type=str) +parser.add_argument("--use_npy", default=0, + help="Uses npy instead of reading from videos") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -82,7 +111,8 @@ collate_fn=dataset.collate_fn) # iterations per epoch -setattr(args, 'iter_per_epoch', math.ceil(dataset.num_data_points[args.split] / args.batch_size)) +setattr(args, 'iter_per_epoch', math.ceil( + dataset.num_data_points[args.split] / args.batch_size)) print("{} iter per epoch.".format(args.iter_per_epoch)) # ---------------------------------------------------------------------------- @@ -110,7 +140,6 @@ decoder.eval() - if args.use_gt: # ------------------------------------------------------------------------ # calculate automatic metrics and finish diff --git a/train.py b/train.py index 13f4b03..9462a71 100644 --- a/train.py +++ b/train.py @@ -69,7 +69,7 @@ help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader parser.add_argument('--num_frames', type=int, default=40, - help='random seed') + help='num_frame') parser.add_argument('--video_size', type=int, default=224, help='random seed') parser.add_argument('--fps', type=int, default=16, help='') @@ -84,17 +84,21 @@ help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") parser.add_argument("--text_encoder", default="lstm", help="lstm or transformer", type=str) +parser.add_argument("--use_npy", default=0, + help="Uses npy instead of reading from videos") +parser.add_argument("--numpy_path", default="./data/charades") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- args = parser.parse_args() +args.numpy_path += "num_frames_{}".format(args.num_frames) start_time = datetime.datetime.strftime( datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') if args.save_path == 'checkpoints/': # args.save_path += start_time args.save_path += 's3d_mixed_5c_fps_{0}_num_frames_{1}_text_encoder_{2}_lr_{3}_unfreeze_layer_{4}_finetune_{5}'.format( - args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, finetune) + args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune) # seed for reproducibility torch.manual_seed(1234) @@ -138,7 +142,6 @@ batch_size=args.batch_size, shuffle=False, collate_fn=dataset.collate_fn) - # ---------------------------------------------------------------------------- # setting model args # ---------------------------------------------------------------------------- @@ -192,7 +195,7 @@ os.makedirs(args.save_path, exist_ok=True) with open(os.path.join(args.save_path, "args_{0}.txt".format(start_time)), "w") as f: f.write(str(args)) -f.close()wf +f.close() running_loss = 0.0 train_begin = datetime.datetime.utcnow() @@ -236,7 +239,8 @@ # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- - if i % 500 == 0: + if i % 200 == 0: + print("Running validation") validation_losses = [] for _, val_batch in 
tqdm(enumerate(dataloader_val)): for key in val_batch: From 29c1737f693718b5fad273ee72f0089c9a96d0ab Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Mon, 16 Nov 2020 09:51:32 -0500 Subject: [PATCH 13/49] minor changes Signed-off-by: Apoorva Beedu --- create_npy.py | 19 ++++++++++++------- dataloader.py | 11 ++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/create_npy.py b/create_npy.py index e372679..2541a33 100644 --- a/create_npy.py +++ b/create_npy.py @@ -22,7 +22,8 @@ def __init__(self, args, path): super(CustomDataset, self).__init__() self.args = args self.path = path - self.fl_list = self.get_filenames(os.path.join(args.video_root, path)) + self.fl_list = self.get_filenames( + os.path.join(args.video_root, path)) def __len__(self): return len(self.fl_list) @@ -52,7 +53,7 @@ def _get_video(self, video_path, start=0, end=0): aw, ah = random.uniform(0, 1), random.uniform(0, 1) if self.args.crop_only: ''' - Changes from the original code, because we have few videos that have <224 resolution and needs to be scaled up after cropping, and cropping needs to take care of the size of the image which it did not before. + Changes from the original code, because we have few videos that have <224 resolution and needs to be scaled up after cropping, and cropping needs to take care of the size of the image which it did not before. cmd = (cmd.crop('(iw - {})*{}'.format(self.args.video_size, aw), '(ih - {})*{}'.format(self.args.video_size, ah), str(self.args.video_size), str(self.args.video_size)) @@ -100,7 +101,7 @@ def __getitem__(self, idx): self.args.video_root, self.path, video_file) vid = self._get_video(video_path) np.save(write_file, vid) - return 0 + return video_file def main(args): @@ -114,10 +115,12 @@ def main(args): batch_size=1, shuffle=False, drop_last=True) - for i, batch in tqdm(enumerate(dataloader)): - print("train ", i) - for i, batch in tqdm(enumerate(dataloader_val)): - print("val ", i) + if args.train: + for i, batch in tqdm(enumerate(dataloader)): + print("train ", batch) + if args.test: + for i, batch in tqdm(enumerate(dataloader_val)): + print("val ", batch) if __name__ == "__main__": @@ -136,6 +139,8 @@ def main(args): help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') + parser.add_argument('--train', default=1) + parser.add_argument('--test', default=1) args = parser.parse_args() args.train_val_path = "train_val" args.test_path = "test" diff --git a/dataloader.py b/dataloader.py index 205f153..09dff69 100644 --- a/dataloader.py +++ b/dataloader.py @@ -246,13 +246,10 @@ def _get_video(self, video_path, start=0, end=0): ) if self.args.random_flip and random.uniform(0, 1) > 0.5: cmd = cmd.hflip() - try: - out, _ = ( - cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') - .run(capture_stdout=True, quiet=True) - ) - except: - print(video_path, cmd) + out, _ = ( + cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') + .run(capture_stdout=True, quiet=True) + ) video = np.frombuffer(out, np.uint8).reshape( [-1, self.args.video_size, self.args.video_size, 3]) From 99c5887532d5221fdeb5c3fc1b1130db1bbb7993 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Mon, 16 Nov 2020 10:38:54 -0500 Subject: [PATCH 14/49] deletes data folder Signed-off-by: Apoorva Beedu --- data | 1 - 1 file changed, 1 deletion(-) delete mode 120000 data diff --git a/data b/data deleted file mode 120000 index 75dea0c..0000000 --- a/data +++ /dev/null @@ -1 +0,0 @@ -/home/apoorva/hdd1/avsd \ No newline at end of file From 
e6b2863c9a29b9d250278e892466ed07642b84ec Mon Sep 17 00:00:00 2001
From: Apoorva Beedu
Date: Mon, 16 Nov 2020 12:37:35 -0500
Subject: [PATCH 15/49] minor bug fix
Signed-off-by: Apoorva Beedu
---
 dataloader.py | 6 +++---
 train.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/dataloader.py b/dataloader.py
index 09dff69..0bbd75f 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -283,9 +283,9 @@ def __getitem__(self, idx):
             if dtype == "test":
                 f_dtype = "test"
             if self.args.use_npy:
-                video_path = os.path.join(numpy_path, vid_id)
-                item['vid_feat'] = np.load(
-                    video_path.replace(".mp4", ".npy"))
+                video_path = os.path.join(self.args.numpy_path, vid_id)
+                item['vid_feat'] = torch.from_numpy(np.load(
+                    video_path.replace(".mp4", ".npy")))
             else:
                 video_path = os.path.join(
                     self.args.video_root, f_dtype, vid_id)
diff --git a/train.py b/train.py
index 9462a71..dc2fca9 100644
--- a/train.py
+++ b/train.py
@@ -92,7 +92,7 @@
 # ----------------------------------------------------------------------------
 args = parser.parse_args()
-args.numpy_path += "num_frames_{}".format(args.num_frames)
+args.numpy_path += "/num_frames_{}".format(args.num_frames)
 start_time = datetime.datetime.strftime(
     datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S')
 if args.save_path == 'checkpoints/':
From c6d3c600c397a81309b0ad5df322896fdc5a7937 Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri
Date: Thu, 19 Nov 2020 10:20:46 -0500
Subject: [PATCH 16/49] add input type to the dir name
---
 dataloader.py | 7 ++++---
 encoders/lf.py | 44 ++++++++++++++++++++++----------------------
 evaluate.py | 10 +++++-----
 train.py | 47 ++++++++++++++++++++++++-----------------------
 4 files changed, 55 insertions(+), 53 deletions(-)
diff --git a/dataloader.py b/dataloader.py
index 0bbd75f..349ca59 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -159,9 +159,6 @@ def __init__(self, args, subsets):
         # reduce amount of data for preprocessing in fast mode
         # TODO
-        if args.overfit:
-            print('\n \n \n ---------->> NOT IMPLEMENTED OVERFIT CASE <-----\n \n \n ')
-
         self.num_data_points = {}
         for dtype in subsets:
             self.num_data_points[dtype] = len(self.data[dtype + '_ques'])
@@ -188,6 +185,10 @@ def __init__(self, args, subsets):
         else:
             self._split = subsets[0]
+        if args.overfit:
+            self.num_data_points['train'] = 5
+            self.num_data_points['val'] = 5
+
     @property
     def split(self):
         return self._split
diff --git a/encoders/lf.py b/encoders/lf.py
index 299204b..da53d7a 100644
--- a/encoders/lf.py
+++ b/encoders/lf.py
@@ -43,7 +43,7 @@ def __init__(self, args):
             if self.args.unfreeze_layers:
                 self.__freeze_s3dg_layers()
-        if 'dialog' in args.input_type or 'caption' in args.input_type:
+        if 'DH' in args.input_type or 'C' in args.input_type:
             self.hist_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size,
                                     args.num_layers, batch_first=True, dropout=args.dropout)
             self.hist_rnn = DynamicRNN(self.hist_rnn)
@@ -57,20 +57,20 @@ def __init__(self, args):
         self.dropout = nn.Dropout(p=args.dropout)
         # fusion layer
-        if args.input_type == 'question_only':
+        if args.input_type == 'Q_only':
             fusion_size = args.rnn_hidden_size
-        if args.input_type == 'question_dialog':
+        if args.input_type == 'Q_DH':
             fusion_size = args.rnn_hidden_size * 2
-        if args.input_type == 'question_audio':
+        if args.input_type == 'Q_A':
             fusion_size = args.rnn_hidden_size + args.audio_feature_size
-        if args.input_type == 'question_image' or args.input_type == 'question_video':
+        if args.input_type == 'Q_I' or args.input_type == 'Q_V':
             fusion_size = args.img_feature_size + 
args.rnn_hidden_size - if args.input_type == 'question_caption_image' or args.input_type == 'question_dialog_video' or args.input_type == 'question_dialog_image': + if args.input_type == 'Q_C_I' or args.input_type == 'Q_DH_V' or args.input_type == 'Q_DH_I': fusion_size = args.img_feature_size + args.rnn_hidden_size * 2 - if args.input_type == 'question_video_audio': + if args.input_type == 'Q_V_A': fusion_size = args.img_feature_size + \ args.rnn_hidden_size + args.audio_feature_size - if args.input_type == 'question_dialog_video_audio': + if args.input_type == 'Q_DH_V_A': fusion_size = args.img_feature_size + \ args.rnn_hidden_size * 2 + args.audio_feature_size @@ -93,21 +93,21 @@ def __freeze_s3dg_layers(self): param.requires_grad = True def forward(self, batch): - if 'image' in self.args.input_type: + if 'I' in self.args.input_type: img = batch['img_feat'] # repeat image feature vectors to be provided for every round img = img.view(-1, 1, self.args.img_feature_size) img = img.repeat(1, self.args.max_ques_count, 1) img = img.view(-1, self.args.img_feature_size) - if 'audio' in self.args.input_type: + if 'A' in self.args.input_type: audio = batch['audio_feat'] # repeat audio feature vectors to be provided for every round audio = audio.view(-1, 1, self.args.audio_feature_size) audio = audio.repeat(1, self.args.max_ques_count, 1) audio = audio.view(-1, self.args.audio_feature_size) - if 'video' in self.args.input_type: + if 'V' in self.args.input_type: if self.args.finetune: # In this case, vid_feat has video frames.Multiplication by 255 because s3d video frames are normalised vid = self.video_embed(batch['vid_feat'].float())[ @@ -119,7 +119,7 @@ def forward(self, batch): vid = vid.repeat(1, self.args.max_ques_count, 1) vid = vid.view(-1, self.args.vid_feature_size) - if 'dialog' in self.args.input_type or 'caption' in self.args.input_type: + if 'DH' in self.args.input_type or 'C' in self.args.input_type: hist = batch['hist'] # embed history hist = hist.view(-1, hist.size(2)) @@ -133,25 +133,25 @@ def forward(self, batch): ques_embed = self.word_embed(ques) ques_embed = self.ques_rnn(ques_embed, batch['ques_len']) - if self.args.input_type == 'question_only': + if self.args.input_type == 'Q_only': fused_vector = ques_embed - if self.args.input_type == 'question_dialog': + if self.args.input_type == 'Q_DH': fused_vector = torch.cat((ques_embed, hist_embed), 1) - if self.args.input_type == 'question_audio': + if self.args.input_type == 'Q_A': fused_vector = torch.cat((audio, ques_embed), 1) - if self.args.input_type == 'question_image': + if self.args.input_type == 'Q_I': fused_vector = torch.cat((img, ques_embed), 1) - if self.args.input_type == 'question_video': + if self.args.input_type == 'Q_V': fused_vector = torch.cat((vid, ques_embed), 1) - if self.args.input_type == 'question_dialog_image': + if self.args.input_type == 'Q_DH_I': fused_vector = torch.cat((img, ques_embed, hist_embed), 1) - if self.args.input_type == 'question_dialog_video': + if self.args.input_type == 'Q_DH_V': fused_vector = torch.cat((vid, ques_embed, hist_embed), 1) - if self.args.input_type == 'question_caption_image': + if self.args.input_type == 'Q_C_I': fused_vector = torch.cat((img, ques_embed, hist_embed), 1) - if self.args.input_type == 'question_video_audio': + if self.args.input_type == 'Q_V_A': fused_vector = torch.cat((vid, audio, ques_embed), 1) - if self.args.input_type == 'question_dialog_video_audio': + if self.args.input_type == 'Q_DH_V_A': fused_vector = torch.cat((vid, audio, ques_embed, 
hist_embed), 1) fused_vector = self.dropout(fused_vector) diff --git a/evaluate.py b/evaluate.py index e97fc45..59e1ba5 100644 --- a/evaluate.py +++ b/evaluate.py @@ -20,7 +20,7 @@ VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) -parser.add_argument('-input_type', default='question_dialog_video_audio', choices=['question_only', +parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', 'question_dialog', 'question_audio', 'question_image', @@ -32,7 +32,7 @@ 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='checkpoints/13-Jun-2019-16:22:48/model_epoch_14.pth', +parser.add_argument('-load_path', default='checkpoints/s3d_mixed_5c_fps_16_num_frames_40_text_encoder_lstm_lr_0.001_unfreeze_layer_1_finetune_1_use_npy_1/model_final.pth', help='Checkpoint to load path from') parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') @@ -46,10 +46,10 @@ parser.add_argument_group('Submission related arguments') parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks') -parser.add_argument('-save_path', default='logs/ranks.json', +parser.add_argument('-save_path', default='logs/qes_dialog_videos_ranks.json', help='Path of json file to save ranks') parser.add_argument( - '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") + '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") parser.add_argument('--finetune', default=0, type=int, help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader @@ -64,7 +64,7 @@ help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') -parser.add_argument('--video_root', default='./data/charades/videos') +parser.add_argument('--video_root', default='data/charades/videos') parser.add_argument('--unfreeze_layers', default=0, type=int, help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") parser.add_argument("--text_encoder", default="lstm", diff --git a/train.py b/train.py index dc2fca9..4b92999 100644 --- a/train.py +++ b/train.py @@ -21,16 +21,15 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') +parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only','Q_DH', + 'Q_A', + 'Q_I', + 'Q_V', + 'Q_C_I', + 'Q_DH_V', + 'Q_DH_I', + 'Q_V_A', + 'Q_DH_V_A'], help='Specify the inputs') parser.add_argument_group('Encoder Decoder choice arguments') parser.add_argument('-encoder', default='lf-ques-im-hist', @@ -41,11 +40,11 @@ choices=['disc'], help='Decoder to use for training') parser.add_argument_group('Optimization related arguments') -parser.add_argument('-num_epochs', default=20, type=int, help='Epochs') +parser.add_argument('-num_epochs', default=40, type=int, help='Epochs') parser.add_argument('-batch_size', 
default=12, type=int, help='Batch size') parser.add_argument('-lr', default=1e-3, type=float, help='Learning rate') parser.add_argument('-lr_decay_rate', default=0.9997592083, - type=float, help='Decay for lr') + type=float, help='Decay for lr') parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') parser.add_argument('-weight_init', default='xavier', @@ -64,7 +63,7 @@ parser.add_argument('-save_step', default=2, type=int, help='Save checkpoint after every save_step epochs') parser.add_argument( - '--input_vid', default="./data/charades/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") + '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") parser.add_argument('--finetune', default=0, type=int, help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader @@ -79,14 +78,14 @@ help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') -parser.add_argument('--video_root', default='./data/charades/videos') -parser.add_argument('--unfreeze_layers', default=0, type=int, +parser.add_argument('--video_root', default='data/videos') +parser.add_argument('--unfreeze_layers', default=1, type=int, help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") parser.add_argument("--text_encoder", default="lstm", help="lstm or transformer", type=str) -parser.add_argument("--use_npy", default=0, +parser.add_argument("--use_npy", default=1, help="Uses npy instead of reading from videos") -parser.add_argument("--numpy_path", default="./data/charades") +parser.add_argument("--numpy_path", default="data/charades") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -97,8 +96,8 @@ datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') if args.save_path == 'checkpoints/': # args.save_path += start_time - args.save_path += 's3d_mixed_5c_fps_{0}_num_frames_{1}_text_encoder_{2}_lr_{3}_unfreeze_layer_{4}_finetune_{5}'.format( - args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune) + args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}'.format( + args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy) # seed for reproducibility torch.manual_seed(1234) @@ -239,9 +238,12 @@ # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- + if i % 200 == 0: - print("Running validation") + + #print("Running validation") validation_losses = [] + for _, val_batch in tqdm(enumerate(dataloader_val)): for key in val_batch: if not isinstance(val_batch[key], list): @@ -255,9 +257,8 @@ validation_losses.append(cur_loss.item()) validation_loss = np.mean(validation_losses) - iteration = (epoch - 1) * args.iter_per_epoch + i - + log_loss.append((epoch, iteration, running_loss, @@ -270,7 +271,7 @@ datetime.datetime.utcnow() - train_begin, epoch, iteration, running_loss, validation_loss, optimizer.param_groups[0]['lr'])) - + # ------------------------------------------------------------------------ # save 
checkpoints and final model
 # ------------------------------------------------------------------------
From 53ff2fc13c39288aa4e3a4db5ce6659f9b69abd9 Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri
Date: Thu, 19 Nov 2020 12:42:18 -0500
Subject: [PATCH 17/49] update input_type to add it to the saved dir
---
 dataloader.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/dataloader.py b/dataloader.py
index 349ca59..4346599 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -112,7 +112,7 @@ def __init__(self, args, subsets):
                 self.data[save_label.format(dtype)] = torch.from_numpy(
                     np.array(ques_file[load_label.format(dtype)], dtype='int64'))
-            if 'video' in args.input_type:
+            if 'V' in args.input_type:
                 print("Reading video features...")
                 # Charades dataset features are all saved in one h5 file as a key, feat dictionary
@@ -128,7 +128,7 @@ def __init__(self, args, subsets):
                 self.data[dtype + '_img_fnames'] = img_fnames
                 self.data[dtype + '_vid_fv'] = vid_feats
-            if 'image' in args.input_type:
+            if 'I' in args.input_type:
                 print("Reading image features...")
                 img_feats = torch.from_numpy(
                     np.array(img_file['images_' + dtype]))
@@ -141,7 +141,7 @@ def __init__(self, args, subsets):
                 self.data[dtype + '_img_fnames'] = img_fnames
                 self.data[dtype + '_img_fv'] = img_feats
-            if 'audio' in args.input_type:
+            if 'A' in args.input_type:
                 print("Reading audio features...")
                 audio_feats = torch.from_numpy(
                     np.array(audio_file['images_' + dtype]))
@@ -169,7 +169,7 @@ def __init__(self, args, subsets):
         print("\tMax ans len: {}".format(self.max_ans_len))
         # prepare history
-        if 'dialog' in args.input_type or 'caption' in args.input_type:
+        if 'DH' in args.input_type or 'C' in args.input_type:
             for dtype in subsets:
                 self._process_history(dtype)
@@ -272,7 +272,7 @@ def __getitem__(self, idx):
         item['num_rounds'] = self.data[dtype + '_num_rounds'][idx]
         # get video features
-        if 'video' in self.args.input_type:
+        if 'V' in self.args.input_type:
             item['img_fnames'] = self.data[dtype + '_img_fnames'][idx]
             # item['img_fnames'] is as train_val/vid_id.jpg hence the splits
             vid_id = item['img_fnames'].split("/")[-1].split(".")[0]
@@ -296,16 +296,16 @@ def __getitem__(self, idx):
                 self.data[dtype + '_vid_fv'][vid_id]).reshape(-1)
         # get image features
-        if 'image' in self.args.input_type:
+        if 'I' in self.args.input_type:
             item['img_feat'] = self.data[dtype + '_img_fv'][idx]
             item['img_fnames'] = self.data[dtype + '_img_fnames'][idx]
         # get audio features
-        if 'audio' in self.args.input_type:
+        if 'A' in self.args.input_type:
             item['audio_feat'] = self.data[dtype + '_audio_fv'][idx]
         # get history tokens
-        if 'dialog' in self.args.input_type or 'caption' in self.args.input_type:
+        if 'DH' in self.args.input_type or 'caption' in self.args.input_type:
             item['hist_len'] = self.data[dtype + '_hist_len'][idx]
             item['hist_len'][item['hist_len'] == 0] += 1
             item['hist'] = self.data[dtype + '_hist'][idx]
@@ -391,7 +391,7 @@ def _process_history(self, dtype):
                                max_ques_len + max_ans_len).long()
         hist_len = torch.zeros(num_convs, num_rounds).long()
-        if 'dialog' in self.args.input_type:
+        if 'DH' in self.args.input_type:
             # go over each question and append it with answer
             for th_id in range(num_convs):
                 clen = cap_len[th_id]
From 3077e3b98f1e2b67934a4ea1f85d97ffb8517eac Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri
Date: Mon, 23 Nov 2020 07:21:04 -0500
Subject: [PATCH 18/49] update training parameters
---
 train.py | 32 +++++++++----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)
diff --git 
a/train.py b/train.py index 060cd01..902fc2d 100644 --- a/train.py +++ b/train.py @@ -21,16 +21,6 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='question_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only','Q_DH', 'Q_A', 'Q_I', @@ -50,16 +40,16 @@ choices=['disc'], help='Decoder to use for training') parser.add_argument_group('Optimization related arguments') -parser.add_argument('-num_epochs', default=40, type=int, help='Epochs') +parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-lr', default=1e-3, type=float, help='Learning rate') +parser.add_argument('-lr', default=0.001, type=float, help='Learning rate') parser.add_argument('-lr_decay_rate', default=0.9997592083, type=float, help='Decay for lr') parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') parser.add_argument('-weight_init', default='xavier', choices=['xavier', 'kaiming'], help='Weight initialization strategy') -parser.add_argument('-weight_decay', default=0.00075, +parser.add_argument('-weight_decay', default=5e-4, help='Weight decay for l2 regularization') parser.add_argument('-overfit', action='store_true', help='Overfit on 5 examples, meant for debugging') @@ -70,14 +60,11 @@ help='Checkpoint to load path from') parser.add_argument('-save_path', default='checkpoints/', help='Path to save checkpoints') -parser.add_argument('-save_step', default=2, type=int, +parser.add_argument('-save_step', default=6, type=int, help='Save checkpoint after every save_step epochs') -parser.add_argument( - '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('--finetune', default=1, type=int, -======= -parser.add_argument('--finetune', default=0, type=int, - help="When set true, the model finetunes the s3dg model for video") +parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('-finetune', default=1, type=int, + help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader parser.add_argument('--num_frames', type=int, default=40, help='num_frame') @@ -174,14 +161,13 @@ encoder = Encoder(model_args) decoder = Decoder(model_args, encoder) -total_params = sum(p.numel() for p in encoder.parameters() if p.requires_grad) +total_params = sum(p.numel() for p in encoder.parameters() if p.requires_grad) + sum(p.numel() for p in decoder.parameters() if p.requires_grad) print("Total number of encoder params {0}".format(total_params)) if args.finetune: total_params = sum(p.numel() for p in encoder.video_embed.parameters() if p.requires_grad) print("Total number of s3dg params {0}".format(total_params)) -optimizer = optim.Adam(list(encoder.parameters( -)) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) +optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, 
weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss() scheduler = lr_scheduler.StepLR( optimizer, step_size=1, gamma=args.lr_decay_rate) From 772f90da146a3c1adac8e10f0f7870a10495f66a Mon Sep 17 00:00:00 2001 From: Huda Date: Mon, 23 Nov 2020 08:10:41 -0500 Subject: [PATCH 19/49] add viz --- checkpoints | 1 - train.py | 20 +++++++++++++++++++- utils/visualize.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) delete mode 120000 checkpoints create mode 100644 utils/visualize.py diff --git a/checkpoints b/checkpoints deleted file mode 120000 index 5cdef85..0000000 --- a/checkpoints +++ /dev/null @@ -1 +0,0 @@ -/srv/share/halamri3/checkpoints \ No newline at end of file diff --git a/train.py b/train.py index 902fc2d..3ab01a8 100644 --- a/train.py +++ b/train.py @@ -15,6 +15,7 @@ from dataloader import VisDialDataset from encoders import Encoder, LateFusionEncoder from decoders import Decoder +from utils import visualize parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) @@ -85,6 +86,12 @@ parser.add_argument("--use_npy", default=1, help="Uses npy instead of reading from videos") parser.add_argument("--numpy_path", default="data/charades") + +parser.add_argument_group('Visualzing related arguments') +parser.add_argument('-enableVis', type=int, default=1) +parser.add_argument('-visEnvName', type=str, default='s3d_finetune') +parser.add_argument('-server', type=str, default='127.0.0.1') +parser.add_argument('-serverPort', type=int, default=8855) # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -99,6 +106,14 @@ args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}'.format( args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy) +# ------------------------------------------------------------------------------------- +# setting visdom args +# ------------------------------------------------------------------------------------- +viz = visualize.VisdomLinePlot( + env_name=args.visEnvName, + server=args.server, + port=args.serverPort) + # seed for reproducibility torch.manual_seed(1234) @@ -126,6 +141,7 @@ for arg in vars(args): print('{:<20}: {}'.format(arg, getattr(args, arg))) +viz.writeText(args) # ---------------------------------------------------------------------------- # loading dataset wrapping with a dataloader # ---------------------------------------------------------------------------- @@ -270,7 +286,9 @@ datetime.datetime.utcnow() - train_begin, epoch, iteration, running_loss, validation_loss, optimizer.param_groups[0]['lr'])) - + + viz.plotLine('Loss','Train', 'LOSS', iteration, train_loss) + viz.plotLine('Loss', 'Val', 'LOSS', iteration, validation_loss) # ------------------------------------------------------------------------ # save checkpoints and final model # ------------------------------------------------------------------------ diff --git a/utils/visualize.py b/utils/visualize.py new file mode 100644 index 0000000..d69622d --- /dev/null +++ b/utils/visualize.py @@ -0,0 +1,36 @@ +import os.path as pth +import json +from visdom import Visdom +import numpy as np + +class VisdomLinePlot(): + + def __init__(self, env_name='main', server="0.0.0.0", port=8899): + self.viz = Visdom( + port=port, + env=env_name, + 
server=server + ) + self.plot_list = {} + self.env = env_name + def plotLine(self, scalar_name, split, title_name, x ,y): + + if scalar_name not in self.plot_list: + + self.plot_list[scalar_name] = self.viz.line( X=np.array([x,x]), Y=np.array([y,y]), env=self.env, + opts=dict(legend=[split], + title=title_name, + xlabel='Epochs', + ylabel= scalar_name)) + else: + + self.viz.line(X=np.array([x]), Y=np.array([y]), + env=self.env, + win=self.plot_list[scalar_name], + name=split, update='append') + + def writeText(self, dict): + output = '' + for arg in vars(dict): + output=output+('{:<20}: {}{}'.format(arg, getattr(dict, arg),"\n")) + self.viz.text(output) \ No newline at end of file From 4a299e0b3026dd74b1c5af796cfeb8e72617937d Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Mon, 23 Nov 2020 20:24:40 -0500 Subject: [PATCH 20/49] Fixes code for dataparallel Signed-off-by: Apoorva Beedu --- dataloader.py | 2 +- train.py | 132 +++++++++++++++++++++++++++++--------------------- 2 files changed, 78 insertions(+), 56 deletions(-) diff --git a/dataloader.py b/dataloader.py index 4346599..ab8ba47 100644 --- a/dataloader.py +++ b/dataloader.py @@ -298,7 +298,7 @@ def __getitem__(self, idx): # get image features if 'I' in self.args.input_type: item['img_feat'] = self.data[dtype + '_img_fv'][idx] - item['img_fnames'] = self.data[dtype + '_img_fnames'][idx] + item['img_fnames'] = [self.data[dtype + '_img_fnames'][idx]] # get audio features if 'A' in self.args.input_type: diff --git a/train.py b/train.py index 3ab01a8..f7f4c0f 100644 --- a/train.py +++ b/train.py @@ -16,21 +16,22 @@ from encoders import Encoder, LateFusionEncoder from decoders import Decoder from utils import visualize +from models import AVSD parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only','Q_DH', - 'Q_A', - 'Q_I', - 'Q_V', - 'Q_C_I', - 'Q_DH_V', - 'Q_DH_I', - 'Q_V_A', - 'Q_DH_V_A'], help='Specify the inputs') +parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only', 'Q_DH', + 'Q_A', + 'Q_I', + 'Q_V', + 'Q_C_I', + 'Q_DH_V', + 'Q_DH_I', + 'Q_V_A', + 'Q_DH_V_A'], help='Specify the inputs') parser.add_argument_group('Encoder Decoder choice arguments') parser.add_argument('-encoder', default='lf-ques-im-hist', @@ -63,29 +64,30 @@ help='Path to save checkpoints') parser.add_argument('-save_step', default=6, type=int, help='Save checkpoint after every save_step epochs') -parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('-finetune', default=1, type=int, - help="When set true, the model finetunes the s3dg model for video") +parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", + help=".h5 file path for the charades s3d features.") +parser.add_argument('-finetune', default=1, type=int, + help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader -parser.add_argument('--num_frames', type=int, default=40, +parser.add_argument('-num_frames', type=int, default=40, help='num_frame') -parser.add_argument('--video_size', type=int, default=224, +parser.add_argument('-video_size', type=int, default=224, help='random seed') -parser.add_argument('--fps', type=int, default=16, help='') 
-parser.add_argument('--crop_only', type=int, default=1, +parser.add_argument('-fps', type=int, default=16, help='') +parser.add_argument('-crop_only', type=int, default=1, help='random seed') -parser.add_argument('--center_crop', type=int, default=0, +parser.add_argument('-center_crop', type=int, default=0, help='random seed') -parser.add_argument('--random_flip', type=int, default=0, +parser.add_argument('-random_flip', type=int, default=0, help='random seed') -parser.add_argument('--video_root', default='data/videos') -parser.add_argument('--unfreeze_layers', default=1, type=int, +parser.add_argument('-video_root', default='data/videos') +parser.add_argument('-unfreeze_layers', default=1, type=int, help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") -parser.add_argument("--text_encoder", default="lstm", +parser.add_argument("-text_encoder", default="lstm", help="lstm or transformer", type=str) -parser.add_argument("--use_npy", default=1, +parser.add_argument("-use_npy", default=1, help="Uses npy instead of reading from videos") -parser.add_argument("--numpy_path", default="data/charades") +parser.add_argument("-numpy_path", default="data/charades") parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) @@ -110,17 +112,20 @@ # setting visdom args # ------------------------------------------------------------------------------------- viz = visualize.VisdomLinePlot( - env_name=args.visEnvName, - server=args.server, - port=args.serverPort) + env_name=args.visEnvName, + server=args.server, + port=args.serverPort) # seed for reproducibility torch.manual_seed(1234) +torch.backends.cudnn.deterministic = True +torch.autograd.set_detect_anomaly(True) # set device and default tensor type +device = "cpu" if args.gpuid >= 0: torch.cuda.manual_seed_all(1234) - torch.cuda.set_device(args.gpuid) + args.num_gpu = torch.cuda.device_count() # transfer all options to model model_args = args @@ -175,37 +180,36 @@ # setup the model # ---------------------------------------------------------------------------- -encoder = Encoder(model_args) -decoder = Decoder(model_args, encoder) -total_params = sum(p.numel() for p in encoder.parameters() if p.requires_grad) + sum(p.numel() for p in decoder.parameters() if p.requires_grad) -print("Total number of encoder params {0}".format(total_params)) +model = AVSD(model_args) +total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) +print("Total number of model params {0}".format(total_params)) if args.finetune: total_params = sum(p.numel() - for p in encoder.video_embed.parameters() if p.requires_grad) + for p in model.encoder.video_embed.parameters() if p.requires_grad) print("Total number of s3dg params {0}".format(total_params)) -optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) +optimizer = optim.Adam(list(model.parameters()), + lr=args.lr, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss() scheduler = lr_scheduler.StepLR( optimizer, step_size=1, gamma=args.lr_decay_rate) if args.load_path != '': - encoder.load_state_dict(components['encoder']) - decoder.load_state_dict(components['decoder']) + model._load_state_dict_(components) print("Loaded model from {}".format(args.load_path)) print("Encoder: {}".format(args.encoder)) print("Decoder: {}".format(args.decoder)) +device = "cuda" if args.gpuid >= 0: - encoder = encoder.cuda() - decoder = decoder.cuda() - 
criterion = criterion.cuda() + model = torch.nn.DataParallel(model, output_device=0, dim=0) + model = model.to(device) + criterion = criterion.to(device) # ---------------------------------------------------------------------------- # training # ---------------------------------------------------------------------------- -encoder.train() -decoder.train() +model.train() os.makedirs(args.save_path, exist_ok=True) with open(os.path.join(args.save_path, "args_{0}.txt".format(start_time)), "w") as f: f.write(str(args)) @@ -229,8 +233,18 @@ # -------------------------------------------------------------------- # forward-backward pass and optimizer step # -------------------------------------------------------------------- - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) + img = batch['img_feat'] if 'I' in args.input_type else None + audio = batch['audio_feat'] if 'A' in args.input_type else None + vid = batch['vid_feat'] if 'V' in args.input_type else None + hist = batch['hist'] if 'DH' in args.input_type else None + hist_len = batch['hist_len'] if 'DH' in args.input_type else None + ques = batch['ques'] + ques_len = batch["ques_len"] + opt = batch['opt'] + opt_len = batch['opt_len'] + + dec_out = model(img, audio, vid, hist, hist_len, + ques, ques_len, opt, opt_len) cur_loss = criterion(dec_out, batch['ans_ind'].view(-1)) cur_loss.backward() @@ -253,7 +267,7 @@ # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- - + if i % 200 == 0: #print("Running validation") @@ -265,15 +279,23 @@ val_batch[key] = Variable(val_batch[key]) if args.gpuid >= 0: val_batch[key] = val_batch[key].cuda() - enc_out = encoder(val_batch) - dec_out = decoder(enc_out, val_batch) - + img_v = val_batch['img_feat'] if 'I' in args.input_type else None + audio_v = val_batch['audio_feat'] if 'A' in args.input_type else None + vid_v = val_batch['vid_feat'] if 'V' in args.input_type else None + hist_v = val_batch['hist'] if 'DH' in args.input_type else None + hist_len_v = val_batch['hist_len'] if 'DH' in args.input_type else None + ques_v = val_batch['ques'] + ques_len_v = val_batch["ques_len"] + opt_v = val_batch['opt'] + opt_len_v = val_batch['opt_len'] + dec_out = model(img_v, audio_v, vid_v, hist_v, + hist_len_v, ques_v, ques_len_v, opt_v, opt_len_v) cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) validation_losses.append(cur_loss.item()) validation_loss = np.mean(validation_losses) iteration = (epoch - 1) * args.iter_per_epoch + i - + log_loss.append((epoch, iteration, running_loss, @@ -286,25 +308,25 @@ datetime.datetime.utcnow() - train_begin, epoch, iteration, running_loss, validation_loss, optimizer.param_groups[0]['lr'])) - - viz.plotLine('Loss','Train', 'LOSS', iteration, train_loss) + + viz.plotLine('Loss', 'Train', 'LOSS', iteration, train_loss) viz.plotLine('Loss', 'Val', 'LOSS', iteration, validation_loss) # ------------------------------------------------------------------------ # save checkpoints and final model # ------------------------------------------------------------------------ if epoch % args.save_step == 0: torch.save({ - 'encoder': encoder.state_dict(), - 'decoder': decoder.state_dict(), + 'encoder': model.encoder.state_dict(), + 'decoder': model.decoder.state_dict(), 'optimizer': optimizer.state_dict(), - 'model_args': encoder.args + 'model_args': model.args }, os.path.join(args.save_path, 'model_epoch_{}.pth'.format(epoch))) torch.save({ - 'encoder': 
encoder.state_dict(), - 'decoder': decoder.state_dict(), + 'encoder': model.encoder.state_dict(), + 'decoder': model.decoder.state_dict(), 'optimizer': optimizer.state_dict(), - 'model_args': encoder.args + 'model_args': model.args }, os.path.join(args.save_path, 'model_final.pth')) np.save(os.path.join(args.save_path, 'log_loss'), log_loss) From 7d88d15177d680438f1f0fd85ba246c7d4c51e19 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Mon, 23 Nov 2020 20:41:50 -0500 Subject: [PATCH 21/49] Cleans the code, as dataprallel can create dict chunks Signed-off-by: Apoorva Beedu --- evaluate.py | 133 ++++++++++++++++++++++++++-------------------------- models.py | 23 +++++++++ train.py | 53 ++++++++------------- 3 files changed, 109 insertions(+), 100 deletions(-) create mode 100644 models.py diff --git a/evaluate.py b/evaluate.py index c333422..e3ecadf 100644 --- a/evaluate.py +++ b/evaluate.py @@ -14,58 +14,46 @@ from encoders import Encoder, LateFusionEncoder from decoders import Decoder from utils import process_ranks, scores_to_ranks, get_gt_ranks +from models import AVSD parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) -parser.add_argument('--finetune', default=0, type=int) -parser.add_argument('--fps', type=int, default=16, help='') -parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') + +parser.add_argument('-input_type', default='question_dialog_video', + choices=['question_only', 'question_dialog', 'question_audio', 'question_image', 'question_video', 'question_caption_image', 'question_dialog_video', 'question_dialog_image', 'question_video_audio', 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_final.pth', help='Checkpoint to load path from') -parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') -parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks') +parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_final.pth', + help='Checkpoint to load path from') +parser.add_argument('-split', default='test', + choices=['val', 'test', 'train'], help='Split to evaluate on') +parser.add_argument('-use_gt', action='store_true', + help='Whether to use ground truth for retrieving ranks') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') -parser.add_argument('-overfit', action='store_true', help='Use a batch of only 5 examples, useful for debugging') -parser.add_argument('--video_root', default='data/videos/') +parser.add_argument('-overfit', action='store_true', + help='Use a batch of only 5 examples, useful for debugging') +parser.add_argument('-video_root', default='data/videos/') 
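# A short sketch of how the checkpoint dicts written by train.py (keys 'encoder',
# 'decoder', 'optimizer', 'model_args') can be restored through the AVSD wrapper that
# this patch introduces in models.py. The checkpoint path is a placeholder; only the
# dict keys follow the torch.save calls shown above.
import torch
from models import AVSD

components = torch.load('checkpoints/model_final.pth', map_location='cpu')
model = AVSD(components['model_args'])
model._load_state_dict_(components)  # restores the encoder and decoder weights
model.eval()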
parser.add_argument_group('Submission related arguments') -parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks') -parser.add_argument('-save_path', default='logs/ranks.json', help='Path of json file to save ranks') -parser.add_argument('--random_flip', type=int, default=0, help='random seed') -parser.add_argument('--crop_only', type=int, default=1, - help='random seed') -parser.add_argument('--center_crop', type=int, default=0, - help='random seed') -parser.add_argument('--num_frames', type=int, default=40, - help='random seed') -parser.add_argument('--video_size', type=int, default=224, - help='random seed') - - - -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') +parser.add_argument('-save_ranks', action='store_true', + help='Whether to save retrieved ranks') +parser.add_argument('-save_path', default='logs/ranks.json', + help='Path of json file to save ranks') +parser.add_argument('-random_flip', type=int, default=0, help='random seed') +parser.add_argument('-crop_only', type=int, default=1, + help='random seed') +parser.add_argument('-center_crop', type=int, default=0, + help='random seed') +parser.add_argument('-num_frames', type=int, default=40, + help='random seed') +parser.add_argument('-video_size', type=int, default=224, + help='random seed') + + +parser.add_argument('-input_type', default='question_dialog_video', + choices=['question_only', 'question_dialog', 'question_audio', 'question_image', 'question_video', 'question_caption_image', 'question_dialog_video', 'question_dialog_image', 'question_video_audio', 'question_dialog_video_audio'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') parser.add_argument('-load_path', default='checkpoints/s3d_mixed_5c_fps_16_num_frames_40_text_encoder_lstm_lr_0.001_unfreeze_layer_1_finetune_1_use_npy_1/model_final.pth', @@ -85,27 +73,27 @@ parser.add_argument('-save_path', default='logs/qes_dialog_videos_ranks.json', help='Path of json file to save ranks') parser.add_argument( - '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('--finetune', default=0, type=int, + '-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") +parser.add_argument('-finetune', default=0, type=int, help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader -parser.add_argument('--num_frames', type=int, default=40, +parser.add_argument('-num_frames', type=int, default=40, help='num_frame') -parser.add_argument('--video_size', type=int, default=224, +parser.add_argument('-video_size', type=int, default=224, help='random seed') -parser.add_argument('--fps', type=int, default=16, help='') -parser.add_argument('--crop_only', type=int, default=1, +parser.add_argument('-fps', type=int, default=16, help='') +parser.add_argument('-crop_only', type=int, default=1, help='random seed') -parser.add_argument('--center_crop', type=int, default=0, +parser.add_argument('-center_crop', type=int, default=0, help='random seed') -parser.add_argument('--random_flip', type=int, 
default=0, +parser.add_argument('-random_flip', type=int, default=0, help='random seed') -parser.add_argument('--video_root', default='data/charades/videos') -parser.add_argument('--unfreeze_layers', default=0, type=int, +parser.add_argument('-video_root', default='data/charades/videos') +parser.add_argument('-unfreeze_layers', default=0, type=int, help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") -parser.add_argument("--text_encoder", default="lstm", +parser.add_argument("-text_encoder", default="lstm", help="lstm or transformer", type=str) -parser.add_argument("--use_npy", default=0, +parser.add_argument("-use_npy", default=0, help="Uses npy instead of reading from videos") # ---------------------------------------------------------------------------- # input arguments and options @@ -115,11 +103,15 @@ # seed for reproducibility torch.manual_seed(1234) +torch.backends.cudnn.deterministic = True +torch.autograd.set_detect_anomaly(True) # set device and default tensor type +device = "cpu" if args.gpuid >= 0: torch.cuda.manual_seed_all(1234) - torch.cuda.set_device(args.gpuid) + args.num_gpu = torch.cuda.device_count() + device = "cuda" # ---------------------------------------------------------------------------- # read saved model and args @@ -155,16 +147,14 @@ # setup the model # ---------------------------------------------------------------------------- -encoder = Encoder(model_args) -encoder.load_state_dict(components['encoder']) -decoder = Decoder(model_args, encoder) -decoder.load_state_dict(components['decoder']) +model = AVSD(model_args) +model._load_state_dict_(components) print("Loaded model from {}".format(args.load_path)) if args.gpuid >= 0: - encoder = encoder.cuda() - decoder = decoder.cuda() + model = torch.nn.DataParallel(model, output_device=0, dim=0) + model = model.to(device) # ---------------------------------------------------------------------------- # evaluation @@ -172,9 +162,18 @@ print("Evaluation start time: {}".format( datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) -encoder.eval() -decoder.eval() +model.eval() + +def convert_list_to_tensor(batch): + new_batch = {} + for k, v in batch.items(): + # tensor of list of strings isn't possible, hence removing the image fnames from the batch sent into the training module. 
+ if isinstance(v, list) and not (k == "img_fnames"): + new_batch[k] = torch.Tensor(v) + elif isinstance(v, torch.Tensor): + new_batch[k] = v + return new_batch if args.use_gt: # ------------------------------------------------------------------------ @@ -188,8 +187,8 @@ if args.gpuid >= 0: batch[key] = batch[key].cuda() - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) + new_batch = convert_list_to_tensor(batch) + dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) all_ranks.append(gt_ranks) @@ -208,8 +207,8 @@ if args.gpuid >= 0: batch[key] = batch[key].cuda() - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) + new_batch = convert_list_to_tensor(batch) + dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) ranks = ranks.view(-1, 10, 100) diff --git a/models.py b/models.py new file mode 100644 index 0000000..4dd09b3 --- /dev/null +++ b/models.py @@ -0,0 +1,23 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from encoders import Encoder, LateFusionEncoder +from decoders import Decoder + + +class AVSD(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.encoder = Encoder(args) + self.decoder = Decoder(args, self.encoder) + + def _load_state_dict_(self, components): + self.encoder.load_state_dict(components['encoder']) + self.decoder.load_state_dict(components['decoder']) + + def forward(self, batch): + enc_out = self.encoder(batch) + dec_out = self.decoder(enc_out, batch) + return dec_out diff --git a/train.py b/train.py index f7f4c0f..5ac7643 100644 --- a/train.py +++ b/train.py @@ -23,15 +23,8 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only', 'Q_DH', - 'Q_A', - 'Q_I', - 'Q_V', - 'Q_C_I', - 'Q_DH_V', - 'Q_DH_I', - 'Q_V_A', - 'Q_DH_V_A'], help='Specify the inputs') +parser.add_argument('-input_type', default='Q_DH_V', + choices=['Q_only', 'Q_DH', 'Q_A', 'Q_I', 'Q_V', 'Q_C_I', 'Q_DH_V', 'Q_DH_I', 'Q_V_A', 'Q_DH_V_A'], help='Specify the inputs') parser.add_argument_group('Encoder Decoder choice arguments') parser.add_argument('-encoder', default='lf-ques-im-hist', @@ -126,6 +119,7 @@ if args.gpuid >= 0: torch.cuda.manual_seed_all(1234) args.num_gpu = torch.cuda.device_count() + device = "cuda" # transfer all options to model model_args = args @@ -199,7 +193,6 @@ print("Encoder: {}".format(args.encoder)) print("Decoder: {}".format(args.decoder)) -device = "cuda" if args.gpuid >= 0: model = torch.nn.DataParallel(model, output_device=0, dim=0) model = model.to(device) @@ -220,6 +213,18 @@ print("Training start time: {}".format( datetime.datetime.strftime(train_begin, '%d-%b-%Y-%H:%M:%S'))) + +def convert_list_to_tensor(batch): + new_batch = {} + for k, v in batch.items(): + # tensor of list of strings isn't possible, hence removing the image fnames from the batch sent into the training module. 
+ if isinstance(v, list) and not (k == "img_fnames"): + new_batch[k] = torch.Tensor(v) + elif isinstance(v, torch.Tensor): + new_batch[k] = v + return new_batch + + log_loss = [] for epoch in range(1, model_args.num_epochs + 1): for i, batch in tqdm(enumerate(dataloader)): @@ -233,18 +238,8 @@ # -------------------------------------------------------------------- # forward-backward pass and optimizer step # -------------------------------------------------------------------- - img = batch['img_feat'] if 'I' in args.input_type else None - audio = batch['audio_feat'] if 'A' in args.input_type else None - vid = batch['vid_feat'] if 'V' in args.input_type else None - hist = batch['hist'] if 'DH' in args.input_type else None - hist_len = batch['hist_len'] if 'DH' in args.input_type else None - ques = batch['ques'] - ques_len = batch["ques_len"] - opt = batch['opt'] - opt_len = batch['opt_len'] - - dec_out = model(img, audio, vid, hist, hist_len, - ques, ques_len, opt, opt_len) + new_batch = convert_list_to_tensor(batch) + dec_out = model(new_batch) cur_loss = criterion(dec_out, batch['ans_ind'].view(-1)) cur_loss.backward() @@ -279,17 +274,9 @@ val_batch[key] = Variable(val_batch[key]) if args.gpuid >= 0: val_batch[key] = val_batch[key].cuda() - img_v = val_batch['img_feat'] if 'I' in args.input_type else None - audio_v = val_batch['audio_feat'] if 'A' in args.input_type else None - vid_v = val_batch['vid_feat'] if 'V' in args.input_type else None - hist_v = val_batch['hist'] if 'DH' in args.input_type else None - hist_len_v = val_batch['hist_len'] if 'DH' in args.input_type else None - ques_v = val_batch['ques'] - ques_len_v = val_batch["ques_len"] - opt_v = val_batch['opt'] - opt_len_v = val_batch['opt_len'] - dec_out = model(img_v, audio_v, vid_v, hist_v, - hist_len_v, ques_v, ques_len_v, opt_v, opt_len_v) + + new_batch_v = convert_list_to_tensor(val_batch) + dec_out = model(new_batch) cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) validation_losses.append(cur_loss.item()) From bd555d4833a268161b4154554fdd4b9d8f14f0ae Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Mon, 23 Nov 2020 20:58:19 -0500 Subject: [PATCH 22/49] Tried to fix flatten_paramter warning Signed-off-by: Apoorva Beedu --- decoders/disc.py | 11 +++++++---- evaluate.py | 21 +++++++++++++++++++++ train.py | 23 +++++++++++++++++++++-- utils/dynamic_rnn.py | 4 +++- 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/decoders/disc.py b/decoders/disc.py index 3fadae3..6911301 100644 --- a/decoders/disc.py +++ b/decoders/disc.py @@ -10,7 +10,8 @@ def __init__(self, args, encoder): self.args = args # share word embedding self.word_embed = encoder.word_embed - self.option_rnn = nn.LSTM(args.embed_size, args.rnn_hidden_size, batch_first=True) + self.option_rnn = nn.LSTM( + args.embed_size, args.rnn_hidden_size, batch_first=True) self.log_softmax = nn.LogSoftmax(dim=1) # options are variable length padded sequences, use DynamicRNN @@ -30,8 +31,10 @@ def forward(self, enc_out, batch): options = batch['opt'] options_len = batch['opt_len'] # word embed options - options = options.view(options.size(0) * options.size(1), options.size(2), -1) - options_len = options_len.view(options_len.size(0) * options_len.size(1), -1) + options = options.view(options.size( + 0) * options.size(1), options.size(2), -1) + options_len = options_len.view( + options_len.size(0) * options_len.size(1), -1) batch_size, num_options, max_opt_len = options.size() options = options.contiguous().view(-1, num_options * max_opt_len) options = 
self.word_embed(options) @@ -48,4 +51,4 @@ def forward(self, enc_out, batch): scores = torch.stack(scores, 1) return scores #log_probs = self.log_softmax(scores) - #return log_probs + # return log_probs diff --git a/evaluate.py b/evaluate.py index e3ecadf..c9af3dd 100644 --- a/evaluate.py +++ b/evaluate.py @@ -175,6 +175,19 @@ def convert_list_to_tensor(batch): new_batch[k] = v return new_batch + +def repeat_tensors(batch, num_repeat): + """In the last iterations, when the number of samples are not multiple of the num_gpu, this function will repeat the last few samples""" + new_batch = batch.copy() + for i in range(num_repeat): + for k, v in batch.items(): + if isinstance(v, list): + new_batch[k].append(v[-1]) + elif isinstance(v, torch.Tensor): + new_batch[k] = torch.cat((new_batch[k], v[-1].unsqueeze(0)), 0) + return new_batch + + if args.use_gt: # ------------------------------------------------------------------------ # calculate automatic metrics and finish @@ -187,6 +200,10 @@ def convert_list_to_tensor(batch): if args.gpuid >= 0: batch[key] = batch[key].cuda() + # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - \ + # batch["vid_feat"].shape[0] % args.num_gpu + # batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) @@ -207,6 +224,10 @@ def convert_list_to_tensor(batch): if args.gpuid >= 0: batch[key] = batch[key].cuda() + # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - \ + # batch["vid_feat"].shape[0] % args.num_gpu + # batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) diff --git a/train.py b/train.py index 5ac7643..56818b7 100644 --- a/train.py +++ b/train.py @@ -149,12 +149,14 @@ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, + drop_last=True, collate_fn=dataset.collate_fn) dataset_val = VisDialDataset(args, ['val']) dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, shuffle=False, + drop_last=True, collate_fn=dataset.collate_fn) # ---------------------------------------------------------------------------- # setting model args @@ -225,6 +227,18 @@ def convert_list_to_tensor(batch): return new_batch +def repeat_tensors(batch, num_repeat): + """In the last iterations, when the number of samples are not multiple of the num_gpu, this function will repeat the last few samples""" + new_batch = batch.copy() + for i in range(num_repeat): + for k, v in batch.items(): + if isinstance(v, list): + new_batch[k].append(v[-1]) + elif isinstance(v, torch.Tensor): + new_batch[k] = torch.cat((new_batch[k], v[-1].unsqueeze(0)), 0) + return new_batch + + log_loss = [] for epoch in range(1, model_args.num_epochs + 1): for i, batch in tqdm(enumerate(dataloader)): @@ -238,9 +252,11 @@ def convert_list_to_tensor(batch): # -------------------------------------------------------------------- # forward-backward pass and optimizer step # -------------------------------------------------------------------- + # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu + # batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out = model(new_batch) - cur_loss = criterion(dec_out, batch['ans_ind'].view(-1)) cur_loss.backward() @@ -263,7 +279,7 @@ def convert_list_to_tensor(batch): # print after 
every few iterations # -------------------------------------------------------------------- - if i % 200 == 0: + if (i + 1) % 200 == 0: #print("Running validation") validation_losses = [] @@ -275,6 +291,9 @@ def convert_list_to_tensor(batch): if args.gpuid >= 0: val_batch[key] = val_batch[key].cuda() + # if not val_batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - val_batch["vid_feat"].shape[0] % args.num_gpu + # val_batch = repeat_tensors(val_batch, num_repeat) new_batch_v = convert_list_to_tensor(val_batch) dec_out = model(new_batch) cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) diff --git a/utils/dynamic_rnn.py b/utils/dynamic_rnn.py index 696c83a..d925e10 100644 --- a/utils/dynamic_rnn.py +++ b/utils/dynamic_rnn.py @@ -39,6 +39,7 @@ def forward(self, seq_input, seq_lens, initial_state=None): assert hx[0].size(0) == self.rnn_model.num_layers else: hx = None + self.rnn_model.flatten_parameters() _, (h_n, c_n) = self.rnn_model(packed_seq_input, hx) rnn_output = h_n[-1].index_select(dim=0, index=bwd_order) @@ -46,7 +47,8 @@ def forward(self, seq_input, seq_lens, initial_state=None): @staticmethod def _get_sorted_order(lens): - sorted_len, fwd_order = torch.sort(lens.contiguous().view(-1), 0, descending=True) + sorted_len, fwd_order = torch.sort( + lens.contiguous().view(-1), 0, descending=True) _, bwd_order = torch.sort(fwd_order) if isinstance(sorted_len, Variable): sorted_len = sorted_len.data From d92bef92771ef6d28183ee6dfea03c9762700938 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Tue, 24 Nov 2020 12:23:42 -0500 Subject: [PATCH 23/49] Adds loss calculation into the model class Signed-off-by: Apoorva Beedu --- evaluate.py | 16 ++++++++-------- models.py | 6 +++++- train.py | 9 +++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/evaluate.py b/evaluate.py index c9af3dd..493b90f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -200,10 +200,10 @@ def repeat_tensors(batch, num_repeat): if args.gpuid >= 0: batch[key] = batch[key].cuda() - # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - # num_repeat = args.num_gpu - \ - # batch["vid_feat"].shape[0] % args.num_gpu - # batch = repeat_tensors(batch, num_repeat) + if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + num_repeat = args.num_gpu - \ + batch["vid_feat"].shape[0] % args.num_gpu + batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) @@ -224,10 +224,10 @@ def repeat_tensors(batch, num_repeat): if args.gpuid >= 0: batch[key] = batch[key].cuda() - # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - # num_repeat = args.num_gpu - \ - # batch["vid_feat"].shape[0] % args.num_gpu - # batch = repeat_tensors(batch, num_repeat) + if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + num_repeat = args.num_gpu - \ + batch["vid_feat"].shape[0] % args.num_gpu + batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) diff --git a/models.py b/models.py index 4dd09b3..fb877ec 100644 --- a/models.py +++ b/models.py @@ -12,6 +12,7 @@ def __init__(self, args): self.args = args self.encoder = Encoder(args) self.decoder = Decoder(args, self.encoder) + self.criterion = nn.CrossEntropyLoss() def _load_state_dict_(self, components): self.encoder.load_state_dict(components['encoder']) @@ -20,4 +21,7 @@ def _load_state_dict_(self, components): def forward(self, batch): enc_out = 
self.encoder(batch) dec_out = self.decoder(enc_out, batch) - return dec_out + + cur_loss = self.criterion(dec_out, batch['ans_ind'].view(-1)) + + return cur_loss diff --git a/train.py b/train.py index 56818b7..902180e 100644 --- a/train.py +++ b/train.py @@ -185,7 +185,7 @@ print("Total number of s3dg params {0}".format(total_params)) optimizer = optim.Adam(list(model.parameters()), lr=args.lr, weight_decay=args.weight_decay) -criterion = nn.CrossEntropyLoss() + scheduler = lr_scheduler.StepLR( optimizer, step_size=1, gamma=args.lr_decay_rate) @@ -198,7 +198,6 @@ if args.gpuid >= 0: model = torch.nn.DataParallel(model, output_device=0, dim=0) model = model.to(device) - criterion = criterion.to(device) # ---------------------------------------------------------------------------- # training @@ -256,8 +255,7 @@ def repeat_tensors(batch, num_repeat): # num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu # batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) - dec_out = model(new_batch) - cur_loss = criterion(dec_out, batch['ans_ind'].view(-1)) + cur_loss = model(new_batch).mean() cur_loss.backward() optimizer.step() @@ -295,8 +293,7 @@ def repeat_tensors(batch, num_repeat): # num_repeat = args.num_gpu - val_batch["vid_feat"].shape[0] % args.num_gpu # val_batch = repeat_tensors(val_batch, num_repeat) new_batch_v = convert_list_to_tensor(val_batch) - dec_out = model(new_batch) - cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) + cur_loss = model(new_batch_v).mean() validation_losses.append(cur_loss.item()) validation_loss = np.mean(validation_losses) From 57fc90a6cccc03fefce2f2c7185d372978657bf8 Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Tue, 24 Nov 2020 13:42:36 -0500 Subject: [PATCH 24/49] ad num_workers to the dataloader --- train.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 902180e..b7806b5 100644 --- a/train.py +++ b/train.py @@ -36,9 +36,9 @@ parser.add_argument_group('Optimization related arguments') parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') -parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-lr', default=0.001, type=float, help='Learning rate') -parser.add_argument('-lr_decay_rate', default=0.9997592083, +parser.add_argument('-batch_size', default=72, type=int, help='Batch size') +parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') +parser.add_argument('-lr_decay_rate', default=0.9, type=float, help='Decay for lr') parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') @@ -85,8 +85,8 @@ parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) parser.add_argument('-visEnvName', type=str, default='s3d_finetune') -parser.add_argument('-server', type=str, default='127.0.0.1') -parser.add_argument('-serverPort', type=int, default=8855) +parser.add_argument('-server', type=str, default='sky1.cc.gatech.edu') +parser.add_argument('-serverPort', type=int, default=7771) # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -149,12 +149,14 @@ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, + num_workers=8, drop_last=True, collate_fn=dataset.collate_fn) dataset_val = VisDialDataset(args, ['val']) dataloader_val = 
DataLoader(dataset_val, batch_size=args.batch_size, + num_workers=8, shuffle=False, drop_last=True, collate_fn=dataset.collate_fn) From e0f4e937d7d4e1278c150e70cd128922563e6e86 Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Fri, 27 Nov 2020 14:10:11 -0500 Subject: [PATCH 25/49] update --- .evaluate.py.swo | Bin 0 -> 16384 bytes checkpoints | 1 + decoders/disc.py | 11 ++-- evaluate.py | 153 +++++++++++++++++++------------------------ models.py | 27 ++++++++ train.py | 113 +++++++++++++++++--------------- utils/dynamic_rnn.py | 4 +- 7 files changed, 165 insertions(+), 144 deletions(-) create mode 100644 .evaluate.py.swo create mode 120000 checkpoints create mode 100644 models.py diff --git a/.evaluate.py.swo b/.evaluate.py.swo new file mode 100644 index 0000000000000000000000000000000000000000..672ed65c7de046faacd13bf1ffcee5538f429721 GIT binary patch literal 16384 zcmeHNTZklA89tkMPhz4zh>CJDW9*)!r>1*$c9nrKj5A4$uCVK5XOVF@)Ks1BuARE2 zPSwozE-_C&hx11cIqh>G}-MAz>>r%rdzOm}B= zC%Q-#d{ei-&gJ|6^IuNSR&)K{FYr$~3k=6CjQ#$Gr^U0!u47-low2SPx=|J!r&p5> zwcTq?dT-x-#WpzS?WEa!bG|v}?kImw?zmy$=F&-fR?X!qKoj`;jf{N)c;yDho(4kTQ^3pbVeB%n4SX2*?YkNKE)W4N;BW6@ z>`7o7SO9JWe)Ueq9tSeubHFXYpWeaPAA#=!K5#qm*S9nFBcKD^0etOkjI9B$U(eWY zfX9GKz%uY5;00`sd=GdKxEuHw@CWeq6hOT7h`WFLNZm}y%yE5RxLK!&WSrM)vpaz= z6OkoJUTbh)w%sDkmukM7yK`RG&0JrqIW41VGmMWox-v|cYMC4RNyL?uKIBGsWn@G< z3{v!!_fokO#QC7tX5%V5@GvEr2`@|Dst(YyNaC}h^}VG_A<5s7x@Mslp~`VymOvH;yp^R?rv_i*SQ4M7iz!zMUcQkcMuvhm8D zI@gskY$xY=R!EpeSMn(FWysq>Eb}6kNQ+WL(dAYMiy%Pr-*Z>*kQvjZsw) zr$sK%^3Z2fdsc(MC`1%okiIzK38+@IPKp6NVxcLLEa(KW8w%yQq4cLpEl)Y!6PQsb z!BO7jZA^h?YL^*D!@MosyvU?-Fs~7dtu&X@^nKFi=T#D$7W0HFcSmxpv29wZqgNn_ z2Yz-x^_nbb< z1E1zns4;874&g5IKV;*EFhP074Ni8ISGSxG)5E}GVG9R+F)irv_IH>Lu2pAj#y%+u zmBz4llH?fA-=Yk!Hk`DE7mkVJpV4E~a+Cb`kkiE2 zIe9gNO8C7pEM&%sVj6GaMe()hjGfCUoAtU$;7PT#QG){m3AJ1*YM_>9ZV=ZtZAP+| zh5_Pn!tFoHas-00afO7@h-Id1bb<$2=}K=qO_2XlACw@JK`~3BvE7i^f$-(^TT*i- zdXqO3XT<7f#rh$FbuKUDLdG8QjZB0pk3^V>rqgUTTcU{DnUoi$2;CmuwuDV8a7on1 zX%ANe;}9pCb2@PmGEKa$Xf>Ubg6*VO_MZe2tJr}u&)BrIMiVD}o)OFDbgwzBo70LY z?};As2v=6dextct9<~PQ1?`^BS2W4v!|Xr@XuWR`Mjc>+okF<2yP(*8mmytOBwpl) z$n~f)QFRmfXV`k6R*{peB85b9I9|@k^Ce0G^tH2G7C5JM%~PL2vDAiTndDLNAWD-g z=gaGBtLAx;2Vwtlnk85vWd_vX%0p73M2;$n{BnbLWR68=?#$=D)aKJ~Vb$E5m!)m> zUzXEu@B>VgjrB8TJI=j4lv6>fyJA6dB^*H$J`_9{XOX|E|mwAUTaUSUKs52AkiRx^z1|L>s2eGc_6)&DDH z9eV}!{HwraKmi{I-Uqyh`u$npTL9JdP2eA>fC5+n z-ay^|Yv6}K477lkQ1?F%JOexg&|bhtfR|DKKM6brJOq3l*ahwcu7DHb<)_3=byN&g z3|zwuP!J~P=lDgG)^%(Ubl>1fD&smuGN)f~BOveAW=u|)C?|5ar5|Lfj%%vN9o)p& zHre;pOaobNpmLFHFWrZ#*nFR6NSo?Y=dvL0XNZ(VU3y^3K@JS(yuXYprlEm;95;o2{Q4zEjSnAG zK%B>rDCzFVdf5+E$~r+e{md+~;csN9T)0lz6n3l&MJ#-|B`t0lka<@K z(Gy9L>x63{T|Py-I{B~1a@Q0E1CjnC2zg128Z;NQVkoW5G1nwBV;_7$L)O!w;mQ>I zf~BA`k`1oat>5hZs{iH26sl`MTZ&Vgj_SIp+-Pp{+l(imudyY@NZKBQ7SXBp7o`+z z@ypCgLBDa5o53BX3l0-#4k^L5LuaF~ZQ0vIkAoY>D|~l^GqR?o_wx9sb-TmO9ca?# zxus=LM-9G{V9yZyutPr8Hd4+lP(iG6H}Y*D8*Jjx;Pw15d zhcZJxT&HBZPPw|-5GTu|eh2Fb9BgdrXVZN4h>o(k6gK2yK~aR#D@XPBP>;I9flE9P zSoV}wbvF!66=m$Ve+wE+)#6bovqB9%v_Sh$tv9y?*`1t3WbJdWS#vcNMx)M0BYP~D^U1=eAk JjY= 0: torch.cuda.manual_seed_all(1234) - torch.cuda.set_device(args.gpuid) + args.num_gpu = torch.cuda.device_count() + device = "cuda" # ---------------------------------------------------------------------------- # read saved model and args @@ -155,16 +108,14 @@ # setup the model # ---------------------------------------------------------------------------- -encoder = Encoder(model_args) -encoder.load_state_dict(components['encoder']) -decoder = Decoder(model_args, encoder) 
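# Why the loss computation moves inside the model's forward (see models.py in this
# patch): nn.DataParallel scatters the batch dict across GPUs, each replica computes
# its own scalar loss, and the gathered output is a small per-GPU tensor that the
# training loop simply averages. A self-contained sketch of that pattern with a toy
# module; the module, feature sizes and batch below are stand-ins, not the real AVSD.
import torch
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(8, 5)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, batch):
        scores = self.net(batch['feat'])
        # returning the loss keeps the per-replica output a single scalar
        return self.criterion(scores, batch['label'].view(-1))

model = ToyModel()
if torch.cuda.is_available():
    model = nn.DataParallel(model).cuda()
batch = {'feat': torch.randn(12, 8), 'label': torch.randint(0, 5, (12,))}
loss = model(batch).mean()  # mean over the per-GPU losses; a no-op on CPU or one GPU
loss.backward()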
-decoder.load_state_dict(components['decoder']) +model = AVSD(model_args) +model._load_state_dict_(components) print("Loaded model from {}".format(args.load_path)) if args.gpuid >= 0: - encoder = encoder.cuda() - decoder = decoder.cuda() + model = torch.nn.DataParallel(model, output_device=0, dim=0) + model = model.to(device) # ---------------------------------------------------------------------------- # evaluation @@ -172,8 +123,30 @@ print("Evaluation start time: {}".format( datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) -encoder.eval() -decoder.eval() +model.eval() + + +def convert_list_to_tensor(batch): + new_batch = {} + for k, v in batch.items(): + # tensor of list of strings isn't possible, hence removing the image fnames from the batch sent into the training module. + if isinstance(v, list) and not (k == "img_fnames"): + new_batch[k] = torch.Tensor(v) + elif isinstance(v, torch.Tensor): + new_batch[k] = v + return new_batch + + +def repeat_tensors(batch, num_repeat): + """In the last iterations, when the number of samples are not multiple of the num_gpu, this function will repeat the last few samples""" + new_batch = batch.copy() + for i in range(num_repeat): + for k, v in batch.items(): + if isinstance(v, list): + new_batch[k].append(v[-1]) + elif isinstance(v, torch.Tensor): + new_batch[k] = torch.cat((new_batch[k], v[-1].unsqueeze(0)), 0) + return new_batch if args.use_gt: @@ -188,8 +161,12 @@ if args.gpuid >= 0: batch[key] = batch[key].cuda() - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) + if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + num_repeat = args.num_gpu - \ + batch["vid_feat"].shape[0] % args.num_gpu + batch = repeat_tensors(batch, num_repeat) + new_batch = convert_list_to_tensor(batch) + dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) all_ranks.append(gt_ranks) @@ -208,8 +185,12 @@ if args.gpuid >= 0: batch[key] = batch[key].cuda() - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) + if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + num_repeat = args.num_gpu - \ + batch["vid_feat"].shape[0] % args.num_gpu + batch = repeat_tensors(batch, num_repeat) + new_batch = convert_list_to_tensor(batch) + dec_out = model(new_batch) ranks = scores_to_ranks(dec_out.data) ranks = ranks.view(-1, 10, 100) diff --git a/models.py b/models.py new file mode 100644 index 0000000..fb877ec --- /dev/null +++ b/models.py @@ -0,0 +1,27 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from encoders import Encoder, LateFusionEncoder +from decoders import Decoder + + +class AVSD(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.encoder = Encoder(args) + self.decoder = Decoder(args, self.encoder) + self.criterion = nn.CrossEntropyLoss() + + def _load_state_dict_(self, components): + self.encoder.load_state_dict(components['encoder']) + self.decoder.load_state_dict(components['decoder']) + + def forward(self, batch): + enc_out = self.encoder(batch) + dec_out = self.decoder(enc_out, batch) + + cur_loss = self.criterion(dec_out, batch['ans_ind'].view(-1)) + + return cur_loss diff --git a/train.py b/train.py index f7f4c0f..51cc699 100644 --- a/train.py +++ b/train.py @@ -23,15 +23,8 @@ LateFusionEncoder.add_cmdline_args(parser) parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='Q_DH_V', choices=['Q_only', 'Q_DH', - 'Q_A', - 'Q_I', - 'Q_V', - 
'Q_C_I', - 'Q_DH_V', - 'Q_DH_I', - 'Q_V_A', - 'Q_DH_V_A'], help='Specify the inputs') +parser.add_argument('-input_type', default='Q_DH_V', + choices=['Q_only', 'Q_DH', 'Q_A', 'Q_I', 'Q_V', 'Q_C_I', 'Q_DH_V', 'Q_DH_I', 'Q_V_A', 'Q_DH_V_A'], help='Specify the inputs') parser.add_argument_group('Encoder Decoder choice arguments') parser.add_argument('-encoder', default='lf-ques-im-hist', @@ -42,10 +35,10 @@ choices=['disc'], help='Decoder to use for training') parser.add_argument_group('Optimization related arguments') -parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') +parser.add_argument('-num_epochs', default=21, type=int, help='Epochs') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-lr', default=0.001, type=float, help='Learning rate') -parser.add_argument('-lr_decay_rate', default=0.9997592083, +parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') +parser.add_argument('-lr_decay_rate', default=0.9, type=float, help='Decay for lr') parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') @@ -62,12 +55,13 @@ help='Checkpoint to load path from') parser.add_argument('-save_path', default='checkpoints/', help='Path to save checkpoints') -parser.add_argument('-save_step', default=6, type=int, +parser.add_argument('-save_step', default=4, type=int, help='Save checkpoint after every save_step epochs') parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('-finetune', default=1, type=int, +parser.add_argument('-finetune', default=0, type=int, help="When set true, the model finetunes the s3dg model for video") + # S3DG parameters and dataloader parser.add_argument('-num_frames', type=int, default=40, help='num_frame') @@ -91,9 +85,9 @@ parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) -parser.add_argument('-visEnvName', type=str, default='s3d_finetune') -parser.add_argument('-server', type=str, default='127.0.0.1') -parser.add_argument('-serverPort', type=int, default=8855) +parser.add_argument('-visEnvName', type=str, default='s3d_Nofinetune') +parser.add_argument('-server', type=str, default='sky1.cc.gatech.edu') +parser.add_argument('-serverPort', type=int, default=7771) # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -105,8 +99,8 @@ if args.save_path == 'checkpoints/': # args.save_path += start_time - args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}'.format( - args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy) + args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}_batch_size_{8}'.format( + args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy, args.batch_size) # ------------------------------------------------------------------------------------- # setting visdom args @@ -126,6 +120,7 @@ if args.gpuid >= 0: torch.cuda.manual_seed_all(1234) args.num_gpu = torch.cuda.device_count() + device = "cuda" # transfer all options to model 
model_args = args @@ -155,12 +150,16 @@ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, + num_workers=3, + drop_last=True, collate_fn=dataset.collate_fn) dataset_val = VisDialDataset(args, ['val']) dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, + num_workers=3, shuffle=False, + drop_last=True, collate_fn=dataset.collate_fn) # ---------------------------------------------------------------------------- # setting model args @@ -189,7 +188,7 @@ print("Total number of s3dg params {0}".format(total_params)) optimizer = optim.Adam(list(model.parameters()), lr=args.lr, weight_decay=args.weight_decay) -criterion = nn.CrossEntropyLoss() + scheduler = lr_scheduler.StepLR( optimizer, step_size=1, gamma=args.lr_decay_rate) @@ -199,11 +198,9 @@ print("Encoder: {}".format(args.encoder)) print("Decoder: {}".format(args.decoder)) -device = "cuda" if args.gpuid >= 0: model = torch.nn.DataParallel(model, output_device=0, dim=0) model = model.to(device) - criterion = criterion.to(device) # ---------------------------------------------------------------------------- # training @@ -220,6 +217,30 @@ print("Training start time: {}".format( datetime.datetime.strftime(train_begin, '%d-%b-%Y-%H:%M:%S'))) + +def convert_list_to_tensor(batch): + new_batch = {} + for k, v in batch.items(): + # tensor of list of strings isn't possible, hence removing the image fnames from the batch sent into the training module. + if isinstance(v, list) and not (k == "img_fnames"): + new_batch[k] = torch.Tensor(v) + elif isinstance(v, torch.Tensor): + new_batch[k] = v + return new_batch + + +def repeat_tensors(batch, num_repeat): + """In the last iterations, when the number of samples are not multiple of the num_gpu, this function will repeat the last few samples""" + new_batch = batch.copy() + for i in range(num_repeat): + for k, v in batch.items(): + if isinstance(v, list): + new_batch[k].append(v[-1]) + elif isinstance(v, torch.Tensor): + new_batch[k] = torch.cat((new_batch[k], v[-1].unsqueeze(0)), 0) + return new_batch + + log_loss = [] for epoch in range(1, model_args.num_epochs + 1): for i, batch in tqdm(enumerate(dataloader)): @@ -233,20 +254,11 @@ # -------------------------------------------------------------------- # forward-backward pass and optimizer step # -------------------------------------------------------------------- - img = batch['img_feat'] if 'I' in args.input_type else None - audio = batch['audio_feat'] if 'A' in args.input_type else None - vid = batch['vid_feat'] if 'V' in args.input_type else None - hist = batch['hist'] if 'DH' in args.input_type else None - hist_len = batch['hist_len'] if 'DH' in args.input_type else None - ques = batch['ques'] - ques_len = batch["ques_len"] - opt = batch['opt'] - opt_len = batch['opt_len'] - - dec_out = model(img, audio, vid, hist, hist_len, - ques, ques_len, opt, opt_len) - - cur_loss = criterion(dec_out, batch['ans_ind'].view(-1)) + # if not batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu + # batch = repeat_tensors(batch, num_repeat) + new_batch = convert_list_to_tensor(batch) + cur_loss = model(new_batch).mean() cur_loss.backward() optimizer.step() @@ -256,6 +268,8 @@ # update running loss and decay learning rates # -------------------------------------------------------------------- train_loss = cur_loss.item() + #import pdb + #pdb.set_trace() if running_loss > 0.0: running_loss = 0.95 * running_loss + 0.05 * cur_loss.item() else: @@ -267,8 +281,7 @@ 
# -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- - - if i % 200 == 0: + if (i + 1) % 200 == 0: #print("Running validation") validation_losses = [] @@ -279,18 +292,12 @@ val_batch[key] = Variable(val_batch[key]) if args.gpuid >= 0: val_batch[key] = val_batch[key].cuda() - img_v = val_batch['img_feat'] if 'I' in args.input_type else None - audio_v = val_batch['audio_feat'] if 'A' in args.input_type else None - vid_v = val_batch['vid_feat'] if 'V' in args.input_type else None - hist_v = val_batch['hist'] if 'DH' in args.input_type else None - hist_len_v = val_batch['hist_len'] if 'DH' in args.input_type else None - ques_v = val_batch['ques'] - ques_len_v = val_batch["ques_len"] - opt_v = val_batch['opt'] - opt_len_v = val_batch['opt_len'] - dec_out = model(img_v, audio_v, vid_v, hist_v, - hist_len_v, ques_v, ques_len_v, opt_v, opt_len_v) - cur_loss = criterion(dec_out, val_batch['ans_ind'].view(-1)) + + # if not val_batch["vid_feat"].shape[0] % args.num_gpu == 0: + # num_repeat = args.num_gpu - val_batch["vid_feat"].shape[0] % args.num_gpu + # val_batch = repeat_tensors(val_batch, num_repeat) + new_batch_v = convert_list_to_tensor(val_batch) + cur_loss = model(new_batch_v).mean() validation_losses.append(cur_loss.item()) validation_loss = np.mean(validation_losses) @@ -316,10 +323,10 @@ # ------------------------------------------------------------------------ if epoch % args.save_step == 0: torch.save({ - 'encoder': model.encoder.state_dict(), - 'decoder': model.decoder.state_dict(), + 'encoder': model.module.encoder.state_dict(), + 'decoder': model.module.decoder.state_dict(), 'optimizer': optimizer.state_dict(), - 'model_args': model.args + 'model_args': model.module.args }, os.path.join(args.save_path, 'model_epoch_{}.pth'.format(epoch))) torch.save({ diff --git a/utils/dynamic_rnn.py b/utils/dynamic_rnn.py index 696c83a..d925e10 100644 --- a/utils/dynamic_rnn.py +++ b/utils/dynamic_rnn.py @@ -39,6 +39,7 @@ def forward(self, seq_input, seq_lens, initial_state=None): assert hx[0].size(0) == self.rnn_model.num_layers else: hx = None + self.rnn_model.flatten_parameters() _, (h_n, c_n) = self.rnn_model(packed_seq_input, hx) rnn_output = h_n[-1].index_select(dim=0, index=bwd_order) @@ -46,7 +47,8 @@ def forward(self, seq_input, seq_lens, initial_state=None): @staticmethod def _get_sorted_order(lens): - sorted_len, fwd_order = torch.sort(lens.contiguous().view(-1), 0, descending=True) + sorted_len, fwd_order = torch.sort( + lens.contiguous().view(-1), 0, descending=True) _, bwd_order = torch.sort(fwd_order) if isinstance(sorted_len, Variable): sorted_len = sorted_len.data From e5277b2e2116d3d1b22a28b6e29e2b33ee910223 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Fri, 27 Nov 2020 21:36:51 -0500 Subject: [PATCH 26/49] mid, unchecked commit for multiGPU Signed-off-by: Apoorva Beedu --- create_npy.py | 75 +++++++++++++++++++++++++++++++++++++----- dataloader.py | 53 ++++++++++++++++++++++++----- encoders/lf.py | 9 ++--- encoders/s3dg_video.py | 15 ++++++--- models.py | 3 +- train.py | 62 +++++++++++++++------------------- 6 files changed, 155 insertions(+), 62 deletions(-) diff --git a/create_npy.py b/create_npy.py index 2541a33..3249198 100644 --- a/create_npy.py +++ b/create_npy.py @@ -1,16 +1,43 @@ import argparse -import numpy as np -import pandas as pd import os -from tqdm import tqdm +import random + +import cv2 import ffmpeg import h5py - +import numpy as np 
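# The torchvision-based loader this patch adds (_get_video_torch in create_npy.py and
# dataloader.py) reads a clip with io.read_video and keeps num_frames linearly spaced
# frames. A minimal sketch of that sampling step; the video path is a placeholder and
# the 40-frame value mirrors the -num_frames default used elsewhere in the series.
import numpy as np
from torchvision import io

num_frames = 40
vframes, _, _ = io.read_video('data/videos/example.mp4')   # (T, H, W, C) uint8 frames
vframes = vframes.permute(0, 3, 1, 2)                      # (T, C, H, W)
vid_indices = np.linspace(0, vframes.shape[0] - 1, num_frames, dtype=int)
clip = vframes[vid_indices].permute(1, 0, 2, 3)            # (C, num_frames, H, W)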
+import pandas as pd import torch import torch as th import torch.nn.functional as F from torch.utils.data import Dataset -import random +from torchvision import io, transforms +from tqdm import tqdm + +random.seed(42) +np.random.seed(42) + + +class Transform(object): + + def __init__(self): + self.mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32) + self.std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32) + + def __call__(self, add_jitter=False, crop_size=224): + transform = transforms.Compose([ + self.random_crop(crop_size), + ]) + return transform + + def to_tensor(self): + return transforms.ToTensor() + + def random_crop(self, size): + return transforms.RandomCrop(size, pad_if_needed=True) + + def colorJitter(self): + return transforms.ColorJitter(0.4, 0.2, 0.2, 0.1) class CustomDataset(Dataset): @@ -24,15 +51,47 @@ def __init__(self, args, path): self.path = path self.fl_list = self.get_filenames( os.path.join(args.video_root, path)) + self.transform = Transform() def __len__(self): return len(self.fl_list) + def _get_opencv_video(self, video_path): + cap = cv2.VideoCapture(video_path) + cap.set(cv2.CAP_PROP_FPS, 30) + ret, frame = cap.read() + frames = [frame] + while ret: + ret, frame = cap.read() + if frame is not None: + frames.append(frame) + cap.release() + frames_array = np.concatenate(np.expand_dims(frames, 0)) + return frames_array + def get_filenames(self, path): results = [] results += [each for each in os.listdir(path) if each.endswith('.mp4')] return results + def _get_video_torch(self, video_path): + vframes, _, vmeta = io.read_video(video_path) + vframes = vframes.permute(0, 3, 1, 2) + vframes = self.transform(self.args.video_size)(vframes) + if vframes.shape[0] < self.args.num_frames: + zeros = th.zeros( + (3, self.args.num_frames - video.shape[0], self.args.video_size, self.args.video_size), dtype=th.uint8) + vframes = th.cat((vframes, zeros), axis=0) + # Gets n_frames from tne entire video, linearly spaced + vid_indices = np.linspace( + 0, vframes.shape[0] - 1, self.args.num_frames, dtype=int) + vid = vframes[vid_indices, :].permute(1, 0, 2, 3) + for i in range(3): + for j in range(vid.shape[1]): + if vid[i, j, :, :].sum() == 0: + print(i, j) + return vid + def _get_video(self, video_path, start=0, end=0): ''' :param video_path: Path of the video file @@ -59,8 +118,8 @@ def _get_video(self, video_path, start=0, end=0): str(self.args.video_size), str(self.args.video_size)) )''' cmd = ( - cmd.crop('max(0, (iw - {}))*{}'.format(self.args.video_size, aw), - 'max(0, (ih - {}))*{}'.format(self.args.video_size, ah), + cmd.crop('max(0, (iw-{}))*{}'.format(self.args.video_size, aw), + 'max(0, (ih-{}))*{}'.format(self.args.video_size, ah), 'min(iw, {})'.format(self.args.video_size), 'min(ih, {})'.format(self.args.video_size)) .filter('scale', self.args.video_size, self.args.video_size) @@ -99,7 +158,7 @@ def __getitem__(self, idx): self.args.write_path, video_file.replace(".mp4", ".npy")) video_path = os.path.join( self.args.video_root, self.path, video_file) - vid = self._get_video(video_path) + vid = self._get_video_torch(video_path) np.save(write_file, vid) return video_file diff --git a/dataloader.py b/dataloader.py index ab8ba47..136cc5e 100644 --- a/dataloader.py +++ b/dataloader.py @@ -1,20 +1,42 @@ -import os import json -from six import iteritems +import os +import pdb +import random from random import shuffle +import ffmpeg import h5py import hdfdict import numpy as np -from tqdm import tqdm -import ffmpeg -import random -import pdb - import 
torch import torch as th import torch.nn.functional as F +from six import iteritems from torch.utils.data import Dataset +from torchvision import io, transforms +from tqdm import tqdm + + +class Transform(object): + + def __init__(self): + self.mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32) + self.std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32) + + def __call__(self, add_jitter=False, crop_size=224): + transform = transforms.Compose([ + self.random_crop(crop_size) + ]) + return transform + + def to_tensor(self): + return transforms.ToTensor() + + def random_crop(self, size): + return transforms.RandomCrop(size, pad_if_needed=True) + + def colorJitter(self): + return transforms.ColorJitter(0.4, 0.2, 0.2, 0.1) class VisDialDataset(Dataset): @@ -43,6 +65,7 @@ def __init__(self, args, subsets): super(VisDialDataset, self).__init__() self.args = args self.subsets = tuple(subsets) + self.transform = Transform() print("Dataloader loading json file: {}".format(args.input_json)) with open(args.input_json, 'r') as info_file: @@ -205,6 +228,20 @@ def split(self, split): def __len__(self): return self.num_data_points[self._split] + def _get_video_torch(self, video_path): + vframes, _, vmeta = io.read_video(video_path) + vframes = vframes.permute(0, 3, 1, 2) + vframes = self.transform(self.args.video_size)(vframes) + if vframes.shape[0] < self.args.num_frames: + zeros = th.zeros( + (3, self.args.num_frames - video.shape[0], self.args.video_size, self.args.video_size), dtype=th.uint8) + vframes = th.cat((vframes, zeros), axis=0) + # Gets n_frames from tne entire video, linearly spaced + vid_indices = np.linspace( + 0, vframes.shape[0] - 1, self.args.num_frames, dtype=int) + vid = vframes[vid_indices, :].permute(1, 0, 2, 3) + return vid + def _get_video(self, video_path, start=0, end=0): ''' :param video_path: Path of the video file @@ -290,7 +327,7 @@ def __getitem__(self, idx): else: video_path = os.path.join( self.args.video_root, f_dtype, vid_id) - item['vid_feat'] = self._get_video(video_path) + item['vid_feat'] = self._get_video_torch(video_path) else: item['vid_feat'] = torch.from_numpy( self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) diff --git a/encoders/lf.py b/encoders/lf.py index da53d7a..4147eb0 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -1,7 +1,6 @@ import torch from torch import nn from torch.nn import functional as F - from utils import DynamicRNN from encoders.s3dg_video import S3D @@ -39,6 +38,8 @@ def __init__(self, args): if self.args.finetune: self.video_embed = S3D( dict_path='data/s3d_dict.npy', space_to_depth=True) + self.video_embed.load_state_dict( + torch.load('data/s3d_howto100m.pth'), strict=False) self.video_embed.train() if self.args.unfreeze_layers: self.__freeze_s3dg_layers() @@ -77,10 +78,10 @@ def __init__(self, args): self.fusion = nn.Linear(fusion_size, args.rnn_hidden_size) if args.weight_init == 'xavier': - nn.init.xavier_uniform(self.fusion.weight.data) + nn.init.xavier_uniform_(self.fusion.weight.data) elif args.weight_init == 'kaiming': - nn.init.kaiming_uniform(self.fusion.weight.data) - nn.init.constant(self.fusion.bias.data, 0) + nn.init.kaiming_uniform_(self.fusion.weight.data) + nn.init.constant_(self.fusion.bias.data, 0) def __freeze_s3dg_layers(self): # Only train _4 and _5 layers diff --git a/encoders/s3dg_video.py b/encoders/s3dg_video.py index a5c3738..324a56f 100644 --- a/encoders/s3dg_video.py +++ b/encoders/s3dg_video.py @@ -17,13 +17,14 @@ optimization. 
""" -import torch as th -import torch.nn.functional as F -import torch.nn as nn import os -import numpy as np import re +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + class InceptionBlock(nn.Module): def __init__( @@ -146,8 +147,12 @@ def __init__( def forward(self, input): out = self.relu(self.bn1(self.conv1(input))) + if th.isnan(self.conv1(input)).any(): + print("conv1 is the issue") if self.separable: out = self.relu(self.bn2(self.conv2(out))) + if th.isnan(out).any(): + print("isnan") return out @@ -260,7 +265,7 @@ def _space_to_depth(self, input): input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) return input - def forward(self, inputs): + def forward(self, inputs1): """Defines the S3DG base architecture. """ if self.space_to_depth: diff --git a/models.py b/models.py index fb877ec..40ea12b 100644 --- a/models.py +++ b/models.py @@ -2,8 +2,8 @@ import torch.nn as nn import torch.nn.functional as F -from encoders import Encoder, LateFusionEncoder from decoders import Decoder +from encoders import Encoder, LateFusionEncoder class AVSD(nn.Module): @@ -21,7 +21,6 @@ def _load_state_dict_(self, components): def forward(self, batch): enc_out = self.encoder(batch) dec_out = self.decoder(enc_out, batch) - cur_loss = self.criterion(dec_out, batch['ans_ind'].view(-1)) return cur_loss diff --git a/train.py b/train.py index b3acc24..cd21006 100644 --- a/train.py +++ b/train.py @@ -3,20 +3,21 @@ import gc import math import os -import numpy as np -from tqdm import tqdm +import random +import numpy as np import torch from torch import nn, optim from torch.autograd import Variable from torch.optim import lr_scheduler from torch.utils.data import DataLoader +from tqdm import tqdm from dataloader import VisDialDataset -from encoders import Encoder, LateFusionEncoder from decoders import Decoder -from utils import visualize +from encoders import Encoder, LateFusionEncoder from models import AVSD +from utils import visualize parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) @@ -35,13 +36,8 @@ choices=['disc'], help='Decoder to use for training') parser.add_argument_group('Optimization related arguments') -<<<<<<< HEAD -parser.add_argument('-num_epochs', default=21, type=int, help='Epochs') -parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -======= parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') -parser.add_argument('-batch_size', default=72, type=int, help='Batch size') ->>>>>>> 57fc90a6cccc03fefce2f2c7185d372978657bf8 +parser.add_argument('-batch_size', default=12, type=int, help='Batch size') parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') parser.add_argument('-lr_decay_rate', default=0.9, type=float, help='Decay for lr') @@ -62,6 +58,8 @@ help='Path to save checkpoints') parser.add_argument('-save_step', default=4, type=int, help='Save checkpoint after every save_step epochs') +parser.add_argument('-eval_step', default=100, type=int, + help='Run validation after every eval_step iterations') parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") parser.add_argument('-finetune', default=0, type=int, @@ -84,19 +82,18 @@ help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") parser.add_argument("-text_encoder", default="lstm", help="lstm or transformer", type=str) 
-parser.add_argument("-use_npy", default=1, +parser.add_argument("-use_npy", default=1, type=int, help="Uses npy instead of reading from videos") parser.add_argument("-numpy_path", default="data/charades") parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) -<<<<<<< HEAD parser.add_argument('-visEnvName', type=str, default='s3d_Nofinetune') -======= -parser.add_argument('-visEnvName', type=str, default='s3d_finetune') ->>>>>>> 57fc90a6cccc03fefce2f2c7185d372978657bf8 -parser.add_argument('-server', type=str, default='sky1.cc.gatech.edu') -parser.add_argument('-serverPort', type=int, default=7771) +parser.add_argument('-server', type=str, default='127.0.0.1') +parser.add_argument('-serverPort', type=int, default=8855) +parser.add_argument('-set_cuda_device', type=str, default='') +parser.add_argument("-seed", type=int, default=1, + help="random seed for initialization") # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -120,10 +117,15 @@ port=args.serverPort) # seed for reproducibility +random.seed(args.seed) +np.random.seed(args.seed) torch.manual_seed(1234) torch.backends.cudnn.deterministic = True torch.autograd.set_detect_anomaly(True) +if args.set_cuda_device is not '': + os.environ["CUDA_VISIBLE_DEVICES"] = args.set_cuda_device + # set device and default tensor type device = "cpu" if args.gpuid >= 0: @@ -159,22 +161,14 @@ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, -<<<<<<< HEAD - num_workers=3, -======= num_workers=8, ->>>>>>> 57fc90a6cccc03fefce2f2c7185d372978657bf8 drop_last=True, collate_fn=dataset.collate_fn) dataset_val = VisDialDataset(args, ['val']) dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, -<<<<<<< HEAD - num_workers=3, -======= num_workers=8, ->>>>>>> 57fc90a6cccc03fefce2f2c7185d372978657bf8 shuffle=False, drop_last=True, collate_fn=dataset.collate_fn) @@ -286,7 +280,8 @@ def repeat_tensors(batch, num_repeat): # -------------------------------------------------------------------- train_loss = cur_loss.item() #import pdb - #pdb.set_trace() + # pdb.set_trace() + if running_loss > 0.0: running_loss = 0.95 * running_loss + 0.05 * cur_loss.item() else: @@ -298,16 +293,12 @@ def repeat_tensors(batch, num_repeat): # -------------------------------------------------------------------- # print after every few iterations # -------------------------------------------------------------------- -<<<<<<< HEAD -======= - ->>>>>>> 57fc90a6cccc03fefce2f2c7185d372978657bf8 - if (i + 1) % 200 == 0: - #print("Running validation") + if (i + 1) % args.eval_step == 0: + print("Running validation") validation_losses = [] - for _, val_batch in tqdm(enumerate(dataloader_val)): + for v_i, val_batch in tqdm(enumerate(dataloader_val)): for key in val_batch: if not isinstance(val_batch[key], list): val_batch[key] = Variable(val_batch[key]) @@ -317,6 +308,7 @@ def repeat_tensors(batch, num_repeat): # if not val_batch["vid_feat"].shape[0] % args.num_gpu == 0: # num_repeat = args.num_gpu - val_batch["vid_feat"].shape[0] % args.num_gpu # val_batch = repeat_tensors(val_batch, num_repeat) + # print(val_batch["img_fnames"]) new_batch_v = convert_list_to_tensor(val_batch) cur_loss = model(new_batch_v).mean() validation_losses.append(cur_loss.item()) @@ -351,8 +343,8 @@ def repeat_tensors(batch, num_repeat): }, os.path.join(args.save_path, 
'model_epoch_{}.pth'.format(epoch))) torch.save({ - 'encoder': model.encoder.state_dict(), - 'decoder': model.decoder.state_dict(), + 'encoder': model.module.encoder.state_dict(), + 'decoder': model.module.decoder.state_dict(), 'optimizer': optimizer.state_dict(), 'model_args': model.args }, os.path.join(args.save_path, 'model_final.pth')) From 8867b6bbca5abb0768990960cf6c7bfcdc450a64 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Fri, 27 Nov 2020 21:50:34 -0500 Subject: [PATCH 27/49] removes checkpoint from git Signed-off-by: Apoorva Beedu --- .gitignore | 3 ++- checkpoints | 1 - train.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) delete mode 120000 checkpoints diff --git a/.gitignore b/.gitignore index e4b3b5c..eb3a542 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,8 @@ MANIFEST # Datasets, pretrained models, checkpoints and preprocessed files data/ !visdialch/data/ -checkpoints/ +checkpoints +checkpoints/* logs/ results/ log/ diff --git a/checkpoints b/checkpoints deleted file mode 120000 index 7f921bb..0000000 --- a/checkpoints +++ /dev/null @@ -1 +0,0 @@ -/srv/share/halamri3/checkpoints/ \ No newline at end of file diff --git a/train.py b/train.py index cd21006..6333ac8 100644 --- a/train.py +++ b/train.py @@ -85,6 +85,7 @@ parser.add_argument("-use_npy", default=1, type=int, help="Uses npy instead of reading from videos") parser.add_argument("-numpy_path", default="data/charades") +parser.add_argument("-num_workers", default=8, type=int) parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) @@ -161,14 +162,14 @@ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, - num_workers=8, + num_workers=args.num_workers, drop_last=True, collate_fn=dataset.collate_fn) dataset_val = VisDialDataset(args, ['val']) dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, - num_workers=8, + num_workers=args.num_workers, shuffle=False, drop_last=True, collate_fn=dataset.collate_fn) From 1f292e83ae2746bd9fc48131f2d83ee2484869e4 Mon Sep 17 00:00:00 2001 From: Huda Date: Sat, 28 Nov 2020 08:09:40 -0500 Subject: [PATCH 28/49] update the dataloader for test set --- dataloader.py | 9 ++++-- encoders/lf.py | 1 + evaluate.py | 87 +++++++++++++------------------------------------- train.py | 17 +++++----- 4 files changed, 38 insertions(+), 76 deletions(-) diff --git a/dataloader.py b/dataloader.py index 4346599..74fc7fa 100644 --- a/dataloader.py +++ b/dataloader.py @@ -292,8 +292,13 @@ def __getitem__(self, idx): self.args.video_root, f_dtype, vid_id) item['vid_feat'] = self._get_video(video_path) else: - item['vid_feat'] = torch.from_numpy( - self.data[dtype + '_vid_fv'][vid_id]).reshape(-1) + f_dtype = "train_val" + if dtype == "test": + f_dtype = "test" + if self.args.use_npy: + video_path = os.path.join(self.args.numpy_path, vid_id) + item['vid_feat'] = torch.from_numpy(np.load( + video_path.replace(".mp4", ".npy"))) # get image features if 'I' in self.args.input_type: diff --git a/encoders/lf.py b/encoders/lf.py index da53d7a..cf10cab 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -39,6 +39,7 @@ def __init__(self, args): if self.args.finetune: self.video_embed = S3D( dict_path='data/s3d_dict.npy', space_to_depth=True) + self.video_embed.load_state_dict(torch.load('data/s3d_howto100m.pth'), strict=False) self.video_embed.train() if self.args.unfreeze_layers: self.__freeze_s3dg_layers() diff --git a/evaluate.py b/evaluate.py index c333422..b5ca6d8 100644 --- a/evaluate.py +++ 
b/evaluate.py @@ -20,31 +20,28 @@ VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) parser.add_argument('--finetune', default=0, type=int) -parser.add_argument('--fps', type=int, default=16, help='') parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') - +parser.add_argument('--input_type', default='Q_DH_V', choices=['Q_only','Q_DH', + 'Q_A', + 'Q_I', + 'Q_V', + 'Q_C_I', + 'Q_DH_V', + 'Q_DH_I', + 'Q_V_A', + 'Q_DH_V_A'], help='Specify the inputs') parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:13/model_final.pth', help='Checkpoint to load path from') -parser.add_argument('-split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') -parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks') -parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') -parser.add_argument('-overfit', action='store_true', help='Use a batch of only 5 examples, useful for debugging') -parser.add_argument('--video_root', default='data/videos/') +parser.add_argument('--load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:1/model_final.pth', help='Checkpoint to load path from') +parser.add_argument('--split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') +parser.add_argument('--use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks') +parser.add_argument('--batch_size', default=12, type=int, help='Batch size') +parser.add_argument('--gpuid', default=0, type=int, help='GPU id to use') +parser.add_argument('--overfit', action='store_true', help='Use a batch of only 5 examples, useful for debugging') + + parser.add_argument_group('Submission related arguments') -parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks') -parser.add_argument('-save_path', default='logs/ranks.json', help='Path of json file to save ranks') -parser.add_argument('--random_flip', type=int, default=0, help='random seed') +parser.add_argument('--save_ranks', action='store_true', help='Whether to save retrieved ranks') +parser.add_argument('--save_path', default='logs/ranks.json', help='Path of json file to save ranks') parser.add_argument('--crop_only', type=int, default=1, help='random seed') parser.add_argument('--center_crop', type=int, default=0, @@ -53,51 +50,12 @@ help='random seed') parser.add_argument('--video_size', type=int, default=224, help='random seed') - - - -parser.add_argument('-input_type', default='question_dialog_video', choices=['question_only', - 'question_dialog', - 'question_audio', - 'question_image', - 'question_video', - 'question_caption_image', - 'question_dialog_video', - 'question_dialog_image', - 'question_video_audio', - 'question_dialog_video_audio'], help='Specify the inputs') - 
parser.add_argument_group('Evaluation related arguments') -parser.add_argument('-load_path', default='checkpoints/s3d_mixed_5c_fps_16_num_frames_40_text_encoder_lstm_lr_0.001_unfreeze_layer_1_finetune_1_use_npy_1/model_final.pth', - help='Checkpoint to load path from') -parser.add_argument('-split', default='test', - choices=['val', 'test', 'train'], help='Split to evaluate on') -parser.add_argument('-use_gt', action='store_true', - help='Whether to use ground truth for retrieving ranks') -parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') -parser.add_argument('-overfit', action='store_true', - help='Use a batch of only 5 examples, useful for debugging') -parser.add_argument_group('Submission related arguments') -parser.add_argument('-save_ranks', action='store_true', - help='Whether to save retrieved ranks') -parser.add_argument('-save_path', default='logs/qes_dialog_videos_ranks.json', - help='Path of json file to save ranks') -parser.add_argument( - '--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") -parser.add_argument('--finetune', default=0, type=int, - help="When set true, the model finetunes the s3dg model for video") # S3DG parameters and dataloader -parser.add_argument('--num_frames', type=int, default=40, - help='num_frame') -parser.add_argument('--video_size', type=int, default=224, - help='random seed') +parser.add_argument("--use_npy", default=1, help="Uses npy instead of reading from videos") +parser.add_argument("--numpy_path", default="data/charades/num_frames_40/num_frames_40/") parser.add_argument('--fps', type=int, default=16, help='') -parser.add_argument('--crop_only', type=int, default=1, - help='random seed') -parser.add_argument('--center_crop', type=int, default=0, - help='random seed') parser.add_argument('--random_flip', type=int, default=0, help='random seed') parser.add_argument('--video_root', default='data/charades/videos') @@ -105,8 +63,7 @@ help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") parser.add_argument("--text_encoder", default="lstm", help="lstm or transformer", type=str) -parser.add_argument("--use_npy", default=0, - help="Uses npy instead of reading from videos") + # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- diff --git a/train.py b/train.py index 3ab01a8..3665dd2 100644 --- a/train.py +++ b/train.py @@ -43,8 +43,8 @@ parser.add_argument_group('Optimization related arguments') parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-lr', default=0.001, type=float, help='Learning rate') -parser.add_argument('-lr_decay_rate', default=0.9997592083, +parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') +parser.add_argument('-lr_decay_rate', default=0.99, type=float, help='Decay for lr') parser.add_argument('-min_lr', default=5e-5, type=float, help='Minimum learning rate') @@ -61,7 +61,7 @@ help='Checkpoint to load path from') parser.add_argument('-save_path', default='checkpoints/', help='Path to save checkpoints') -parser.add_argument('-save_step', default=6, type=int, +parser.add_argument('-save_step', default=2, type=int, help='Save 
checkpoint after every save_step epochs') parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") parser.add_argument('-finetune', default=1, type=int, @@ -85,13 +85,13 @@ help="lstm or transformer", type=str) parser.add_argument("--use_npy", default=1, help="Uses npy instead of reading from videos") -parser.add_argument("--numpy_path", default="data/charades") +parser.add_argument("--numpy_path", default="data/charades/num_frames_40") parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) parser.add_argument('-visEnvName', type=str, default='s3d_finetune') parser.add_argument('-server', type=str, default='127.0.0.1') -parser.add_argument('-serverPort', type=int, default=8855) +parser.add_argument('-serverPort', type=int, default=7771) # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- @@ -218,7 +218,7 @@ log_loss = [] for epoch in range(1, model_args.num_epochs + 1): - for i, batch in tqdm(enumerate(dataloader)): + for i, batch in enumerate(dataloader): optimizer.zero_grad() for key in batch: if not isinstance(batch[key], list): @@ -252,14 +252,13 @@ # -------------------------------------------------------------------- # print after every few iterations - # -------------------------------------------------------------------- - + # ------------------------------------------------------------------- if i % 200 == 0: #print("Running validation") validation_losses = [] - for _, val_batch in tqdm(enumerate(dataloader_val)): + for _, val_batch in enumerate(dataloader_val): for key in val_batch: if not isinstance(val_batch[key], list): val_batch[key] = Variable(val_batch[key]) From 3f8125a2c37a15768a22900952eed9c90335786b Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Sun, 29 Nov 2020 12:29:32 -0500 Subject: [PATCH 29/49] minor bug --- encoders/s3dg_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encoders/s3dg_video.py b/encoders/s3dg_video.py index 324a56f..dbc2665 100644 --- a/encoders/s3dg_video.py +++ b/encoders/s3dg_video.py @@ -265,7 +265,7 @@ def _space_to_depth(self, input): input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) return input - def forward(self, inputs1): + def forward(self, inputs): """Defines the S3DG base architecture. 
""" if self.space_to_depth: From e3a76069b8f6b35f8dcdf2e06a8ae4c177ca74c5 Mon Sep 17 00:00:00 2001 From: Huda Date: Sun, 29 Nov 2020 16:48:49 -0500 Subject: [PATCH 30/49] add eval to train --- checkpoints | 1 + 1 file changed, 1 insertion(+) create mode 120000 checkpoints diff --git a/checkpoints b/checkpoints new file mode 120000 index 0000000..1628e5d --- /dev/null +++ b/checkpoints @@ -0,0 +1 @@ +/media/halamri/31db5016-276c-4234-84da-1624299843eb/checkpoints \ No newline at end of file From b61f10c6e3b7e2fa35afeaa291ceca94373bc975 Mon Sep 17 00:00:00 2001 From: Huda Date: Sun, 29 Nov 2020 19:29:12 -0500 Subject: [PATCH 31/49] add eval --- train.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 3665dd2..ff10565 100644 --- a/train.py +++ b/train.py @@ -5,7 +5,7 @@ import os import numpy as np from tqdm import tqdm - +import gc import torch from torch import nn, optim from torch.autograd import Variable @@ -16,6 +16,7 @@ from encoders import Encoder, LateFusionEncoder from decoders import Decoder from utils import visualize +from utils import process_ranks, scores_to_ranks, get_gt_ranks parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) @@ -85,7 +86,7 @@ help="lstm or transformer", type=str) parser.add_argument("--use_npy", default=1, help="Uses npy instead of reading from videos") -parser.add_argument("--numpy_path", default="data/charades/num_frames_40") +parser.add_argument("--numpy_path", default="data/charades/") parser.add_argument_group('Visualzing related arguments') parser.add_argument('-enableVis', type=int, default=1) @@ -157,6 +158,12 @@ batch_size=args.batch_size, shuffle=False, collate_fn=dataset.collate_fn) + +dataset_test = VisDialDataset(args, ['test']) +dataloader_test = DataLoader(dataset_test, + batch_size=args.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn) # ---------------------------------------------------------------------------- # setting model args # ---------------------------------------------------------------------------- @@ -298,6 +305,27 @@ 'optimizer': optimizer.state_dict(), 'model_args': encoder.args }, os.path.join(args.save_path, 'model_epoch_{}.pth'.format(epoch))) + print('Running evaluation for checkpoint:',epoch) + encoder.eval() + decoder.eval() + all_ranks = [] + for i, batch in enumerate(tqdm(dataloader)): + for key in batch: + if not isinstance(batch[key], list): + batch[key] = Variable(batch[key], volatile=True) + if args.gpuid >= 0: + batch[key] = batch[key].cuda() + + enc_out = encoder(batch) + dec_out = decoder(enc_out, batch) + ranks = scores_to_ranks(dec_out.data) + gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) + all_ranks.append(gt_ranks) + all_ranks = torch.cat(all_ranks, 0) + process_ranks(all_ranks) + gc.collect() + encoder.train() + decoder.train() torch.save({ 'encoder': encoder.state_dict(), @@ -307,3 +335,5 @@ }, os.path.join(args.save_path, 'model_final.pth')) np.save(os.path.join(args.save_path, 'log_loss'), log_loss) + + From 4c07a2fa34ea4bdb1c7d9cac349c4e558f8e3581 Mon Sep 17 00:00:00 2001 From: Huda Date: Sun, 29 Nov 2020 20:57:25 -0500 Subject: [PATCH 32/49] print rank results for each save point --- train.py | 7 +++++-- utils/eval_utils.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index ff10565..6d29490 100644 --- a/train.py +++ b/train.py @@ -62,7 +62,7 @@ help='Checkpoint to load path from') parser.add_argument('-save_path', 
default='checkpoints/', help='Path to save checkpoints') -parser.add_argument('-save_step', default=2, type=int, +parser.add_argument('-save_step', default=1, type=int, help='Save checkpoint after every save_step epochs') parser.add_argument('--input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", help=".h5 file path for the charades s3d features.") parser.add_argument('-finetune', default=1, type=int, @@ -321,8 +321,11 @@ ranks = scores_to_ranks(dec_out.data) gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) all_ranks.append(gt_ranks) + all_ranks = torch.cat(all_ranks, 0) - process_ranks(all_ranks) + process_ranks(all_ranks, args.sav) + + f.close() gc.collect() encoder.train() decoder.train() diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 568773d..69c6d03 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -1,5 +1,5 @@ import torch - +import os def get_gt_ranks(ranks, ans_ind): ans_ind = ans_ind.view(-1) @@ -8,8 +8,7 @@ def get_gt_ranks(ranks, ans_ind): gt_ranks[i] = int(ranks[i, ans_ind[i]]) return gt_ranks - -def process_ranks(ranks): +def process_ranks(ranks, save_path, epoch): num_ques = ranks.size(0) num_opts = 100 @@ -24,11 +23,22 @@ def process_ranks(ranks): num_ge = torch.sum(ranks.ge(num_opts + 1)) print("Warning: some of ranks > 100: {}".format(num_ge)) ranks = ranks[ranks.le(num_opts + 1)] - + ranks = ranks.float() num_r1 = float(torch.sum(torch.le(ranks, 1))) num_r5 = float(torch.sum(torch.le(ranks, 5))) num_r10 = float(torch.sum(torch.le(ranks, 10))) + + with open(os.path.join(save_path, "ranks_{0}.txt".format(epoch)), "w") as f: + f.write("\tNo. questions: {}".format(num_ques)) + f.write("\tr@1: {}".format(num_r1 / num_ques)) + f.write("\tr@5: {}".format(num_r5 / num_ques)) + f.write("\tr@10: {}".format(num_r10 / num_ques)) + f.write("\tr@10: {}".format(num_r10 / num_ques)) + f.write("\tmeanR: {}".format(torch.mean(ranks))) + f.write("\tmeanRR: {}".format(torch.mean(ranks.reciprocal()))) + f.close() + print("\tNo. 
questions: {}".format(num_ques))
     print("\tr@1: {}".format(num_r1 / num_ques))
     print("\tr@5: {}".format(num_r5 / num_ques))

From 7b09d73d5792812f05d36736c1ef5f815b4ecdd5 Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri
Date: Mon, 30 Nov 2020 06:32:48 -0500
Subject: [PATCH 33/49] update train

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 6d29490..067de2b 100644
--- a/train.py
+++ b/train.py
@@ -323,7 +323,7 @@
         all_ranks.append(gt_ranks)
 
         all_ranks = torch.cat(all_ranks, 0)
-        process_ranks(all_ranks, args.sav)
+        process_ranks(all_ranks, args.save_path, epoch)
 
         f.close()
         gc.collect()

From d24eda2f655a1cc0b25818357d5f44e1a2de37cb Mon Sep 17 00:00:00 2001
From: Huda
Date: Mon, 30 Nov 2020 09:02:40 -0500
Subject: [PATCH 34/49] update eval for all epochs

---
 evaluate.py | 194 +++++++++++++++++++++++---------------------
 utils/eval_utils.py | 7 +-
 2 files changed, 106 insertions(+), 95 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index b5ca6d8..96b4606 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -14,6 +14,8 @@
 from encoders import Encoder, LateFusionEncoder
 from decoders import Decoder
 from utils import process_ranks, scores_to_ranks, get_gt_ranks
+import logging
+
 parser = argparse.ArgumentParser()
@@ -31,7 +33,7 @@
                                                     'Q_V_A',
                                                     'Q_DH_V_A'], help='Specify the inputs')
 parser.add_argument_group('Evaluation related arguments')
-parser.add_argument('--load_path', default='/nethome/halamri3/cvpr2020/avsd/checkpoints/nofinetune/14-Nov-2020-18:38:1/model_final.pth', help='Checkpoint to load path from')
+parser.add_argument('--load_path', default='checkpoints/input_type_Q_DH_V_s3d_mixed_5c_fps_16_num_frames_40_text_encoder_lstm_lr_0.0001_unfreeze_layer_1_finetune_0_use_npy_1_batch_size_12', help='Checkpoint to load path from')
 parser.add_argument('--split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on')
 parser.add_argument('--use_gt', action='store_true', help='Whether to use ground truth for retrieving ranks')
 parser.add_argument('--batch_size', default=12, type=int, help='Batch size')
@@ -69,34 +71,28 @@
 # ----------------------------------------------------------------------------
 
 args = parser.parse_args()
-
+'''
+log_path = os.path.join(args.load_path, 'eval_results.log')
+logging.basicConfig(filename='eval_results.log')
+'''
 # seed for reproducibility
 torch.manual_seed(1234)
+
+checkpoints = [file for file in os.listdir(args.load_path) if file.endswith(".pth")]
+logging.info("Evaluate the following checkpoints: %s", args.load_path)
+
 # set device and default tensor type
 if args.gpuid >= 0:
     torch.cuda.manual_seed_all(1234)
     torch.cuda.set_device(args.gpuid)
 
-# ----------------------------------------------------------------------------
-# read saved model and args
-# ----------------------------------------------------------------------------
-
-components = torch.load(args.load_path)
-model_args = components['model_args']
-model_args.gpuid = args.gpuid
-model_args.batch_size = args.batch_size
-
 # set this because only late fusion encoder is supported yet
 args.concat_history = True
 
-for arg in vars(args):
-    print('{:<20}: {}'.format(arg, getattr(args, arg)))
-
 # ----------------------------------------------------------------------------
 # loading dataset wrapping with a dataloader
 # ----------------------------------------------------------------------------
-
 dataset = VisDialDataset(args, [args.split])
 dataloader = DataLoader(dataset,
                         batch_size=args.batch_size,
@@ -109,85 +105,97 @@
 print("{} iter 
per epoch.".format(args.iter_per_epoch)) # ---------------------------------------------------------------------------- -# setup the model -# ---------------------------------------------------------------------------- - -encoder = Encoder(model_args) -encoder.load_state_dict(components['encoder']) - -decoder = Decoder(model_args, encoder) -decoder.load_state_dict(components['decoder']) -print("Loaded model from {}".format(args.load_path)) - -if args.gpuid >= 0: - encoder = encoder.cuda() - decoder = decoder.cuda() - -# ---------------------------------------------------------------------------- -# evaluation +# read saved model and args # ---------------------------------------------------------------------------- - -print("Evaluation start time: {}".format( - datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) -encoder.eval() -decoder.eval() - - -if args.use_gt: - # ------------------------------------------------------------------------ - # calculate automatic metrics and finish - # ------------------------------------------------------------------------ - all_ranks = [] - for i, batch in enumerate(tqdm(dataloader)): - for key in batch: - if not isinstance(batch[key], list): - batch[key] = Variable(batch[key], volatile=True) - if args.gpuid >= 0: - batch[key] = batch[key].cuda() - - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) - ranks = scores_to_ranks(dec_out.data) - gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) - all_ranks.append(gt_ranks) - all_ranks = torch.cat(all_ranks, 0) - process_ranks(all_ranks) - gc.collect() -else: - # ------------------------------------------------------------------------ - # prepare json for submission - # ------------------------------------------------------------------------ - ranks_json = [] - for i, batch in enumerate(tqdm(dataloader)): - for key in batch: - if not isinstance(batch[key], list): - batch[key] = Variable(batch[key], volatile=True) - if args.gpuid >= 0: - batch[key] = batch[key].cuda() - - enc_out = encoder(batch) - dec_out = decoder(enc_out, batch) - ranks = scores_to_ranks(dec_out.data) - ranks = ranks.view(-1, 10, 100) - - for i in range(len(batch['img_fnames'])): - # cast into types explicitly to ensure no errors in schema - if args.split == 'test': - ranks_json.append({ - 'image_id': int(batch['img_fnames'][i][-16:-4]), - 'round_id': int(batch['num_rounds'][i]), - 'ranks': list(ranks[i][batch['num_rounds'][i] - 1]) - }) - else: - for j in range(batch['num_rounds'][i]): +for checkpoint in checkpoints: + model_path = os.path.join(args.load_path, checkpoint) + components = torch.load(model_path) + model_args = components['model_args'] + model_args.gpuid = args.gpuid + model_args.batch_size = args.batch_size + + for arg in vars(args): + print('{:<20}: {}'.format(arg, getattr(args, arg))) + + # ---------------------------------------------------------------------------- + # setup the model + # ---------------------------------------------------------------------------- + encoder = Encoder(model_args) + encoder.load_state_dict(components['encoder']) + + decoder = Decoder(model_args, encoder) + decoder.load_state_dict(components['decoder']) + print("Loaded model from {}".format(args.load_path)) + + if args.gpuid >= 0: + encoder = encoder.cuda() + decoder = decoder.cuda() + + # ---------------------------------------------------------------------------- + # evaluation + # ---------------------------------------------------------------------------- + + print("Evaluation start time: 
{}".format( + datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) + encoder.eval() + decoder.eval() + + + if args.use_gt: + # ------------------------------------------------------------------------ + # calculate automatic metrics and finish + # ------------------------------------------------------------------------ + all_ranks = [] + for i, batch in enumerate(tqdm(dataloader)): + for key in batch: + if not isinstance(batch[key], list): + batch[key] = Variable(batch[key], volatile=True) + if args.gpuid >= 0: + batch[key] = batch[key].cuda() + + enc_out = encoder(batch) + dec_out = decoder(enc_out, batch) + ranks = scores_to_ranks(dec_out.data) + gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) + all_ranks.append(gt_ranks) + all_ranks = torch.cat(all_ranks, 0) + process_ranks(all_ranks,args.load_path,checkpoint[6:-4]) + gc.collect() + else: + # ------------------------------------------------------------------------ + # prepare json for submission + # ------------------------------------------------------------------------ + ranks_json = [] + for i, batch in enumerate(tqdm(dataloader)): + for key in batch: + if not isinstance(batch[key], list): + batch[key] = Variable(batch[key], volatile=True) + if args.gpuid >= 0: + batch[key] = batch[key].cuda() + + enc_out = encoder(batch) + dec_out = decoder(enc_out, batch) + ranks = scores_to_ranks(dec_out.data) + ranks = ranks.view(-1, 10, 100) + + for i in range(len(batch['img_fnames'])): + # cast into types explicitly to ensure no errors in schema + if args.split == 'test': ranks_json.append({ 'image_id': int(batch['img_fnames'][i][-16:-4]), - 'round_id': int(j + 1), - 'ranks': list(ranks[i][j]) + 'round_id': int(batch['num_rounds'][i]), + 'ranks': list(ranks[i][batch['num_rounds'][i] - 1]) }) - gc.collect() - -if args.save_ranks: - print("Writing ranks to {}".format(args.save_path)) - os.makedirs(os.path.dirname(args.save_path), exist_ok=True) - json.dump(ranks_json, open(args.save_path, 'w')) + else: + for j in range(batch['num_rounds'][i]): + ranks_json.append({ + 'image_id': int(batch['img_fnames'][i][-16:-4]), + 'round_id': int(j + 1), + 'ranks': list(ranks[i][j]) + }) + gc.collect() + + if args.save_ranks: + print("Writing ranks to {}".format(args.save_path)) + os.makedirs(os.path.dirname(args.save_path), exist_ok=True) + json.dump(ranks_json, open(args.save_path, 'w')) diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 69c6d03..760bdfd 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -29,7 +29,8 @@ def process_ranks(ranks, save_path, epoch): num_r5 = float(torch.sum(torch.le(ranks, 5))) num_r10 = float(torch.sum(torch.le(ranks, 10))) - with open(os.path.join(save_path, "ranks_{0}.txt".format(epoch)), "w") as f: + with open(os.path.join(save_path, "ranks_resutls.txt"), "a+") as f: + f.write("Epoch: {}".format(epoch)) f.write("\tNo. questions: {}".format(num_ques)) f.write("\tr@1: {}".format(num_r1 / num_ques)) f.write("\tr@5: {}".format(num_r5 / num_ques)) @@ -37,8 +38,10 @@ def process_ranks(ranks, save_path, epoch): f.write("\tr@10: {}".format(num_r10 / num_ques)) f.write("\tmeanR: {}".format(torch.mean(ranks))) f.write("\tmeanRR: {}".format(torch.mean(ranks.reciprocal()))) + f.write('\n') f.close() - + + print("\tNo. 
questions: {}".format(num_ques)) print("\tr@1: {}".format(num_r1 / num_ques)) print("\tr@5: {}".format(num_r5 / num_ques)) From f9078bd9a6aa8514c39c42d152b02ce2bcf33e8f Mon Sep 17 00:00:00 2001 From: Huda Date: Mon, 30 Nov 2020 11:07:24 -0500 Subject: [PATCH 35/49] fix log --- evaluate.py | 8 +++++--- utils/eval_utils.py | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/evaluate.py b/evaluate.py index 96b4606..e07523b 100644 --- a/evaluate.py +++ b/evaluate.py @@ -77,10 +77,12 @@ ''' # seed for reproducibility torch.manual_seed(1234) - - -checkpoints = [file for file in os.listdir(args.load_path) if file.endswith(".pth")] +cur = os.getcwd() +os.chdir(args.load_path) +checkpoints = sorted(filter(os.path.isfile, os.listdir('.')), key=os.path.getmtime) +checkpoints = [file for file in checkpoints if file.endswith(".pth")] logging.info("Evaluate the following checkpoints: %s", args.load_path) +os.chdir(cur) # set device and default tensor type if args.gpuid >= 0: diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 760bdfd..4cd5a17 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -31,13 +31,13 @@ def process_ranks(ranks, save_path, epoch): with open(os.path.join(save_path, "ranks_resutls.txt"), "a+") as f: f.write("Epoch: {}".format(epoch)) - f.write("\tNo. questions: {}".format(num_ques)) - f.write("\tr@1: {}".format(num_r1 / num_ques)) - f.write("\tr@5: {}".format(num_r5 / num_ques)) - f.write("\tr@10: {}".format(num_r10 / num_ques)) - f.write("\tr@10: {}".format(num_r10 / num_ques)) - f.write("\tmeanR: {}".format(torch.mean(ranks))) - f.write("\tmeanRR: {}".format(torch.mean(ranks.reciprocal()))) + f.write("\tNo. questions: {}\n".format(num_ques)) + f.write("\tr@1: {}\n".format(num_r1 / num_ques)) + f.write("\tr@5: {}\n".format(num_r5 / num_ques)) + f.write("\tr@10: {}\n".format(num_r10 / num_ques)) + f.write("\tr@10: {}\n".format(num_r10 / num_ques)) + f.write("\tmeanR: {}\n".format(torch.mean(ranks))) + f.write("\tmeanRR: {}\n".format(torch.mean(ranks.reciprocal()))) f.write('\n') f.close() From 727ee2de05712a8229f066d09253693144f2709a Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Tue, 1 Dec 2020 08:44:20 -0500 Subject: [PATCH 36/49] minor bug --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 6333ac8..d73a734 100644 --- a/train.py +++ b/train.py @@ -347,7 +347,7 @@ def repeat_tensors(batch, num_repeat): 'encoder': model.module.encoder.state_dict(), 'decoder': model.module.decoder.state_dict(), 'optimizer': optimizer.state_dict(), - 'model_args': model.args + 'model_args': model.module.args }, os.path.join(args.save_path, 'model_final.pth')) np.save(os.path.join(args.save_path, 'log_loss'), log_loss) From 025735111b9e3c95ca548017bbc41816001e17f1 Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Thu, 3 Dec 2020 08:54:52 -0500 Subject: [PATCH 37/49] fix eval --- checkpoints | 1 - evaluate.py | 1 + utils/eval_utils.py | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 checkpoints diff --git a/checkpoints b/checkpoints deleted file mode 120000 index 1628e5d..0000000 --- a/checkpoints +++ /dev/null @@ -1 +0,0 @@ -/media/halamri/31db5016-276c-4234-84da-1624299843eb/checkpoints \ No newline at end of file diff --git a/evaluate.py b/evaluate.py index e07523b..73787b0 100644 --- a/evaluate.py +++ b/evaluate.py @@ -110,6 +110,7 @@ # read saved model and args # ---------------------------------------------------------------------------- for 
checkpoint in checkpoints: + print('checkpoint:',checkpoint) model_path = os.path.join(args.load_path, checkpoint) components = torch.load(model_path) model_args = components['model_args'] diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 4cd5a17..4205e9c 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -35,7 +35,6 @@ def process_ranks(ranks, save_path, epoch): f.write("\tr@1: {}\n".format(num_r1 / num_ques)) f.write("\tr@5: {}\n".format(num_r5 / num_ques)) f.write("\tr@10: {}\n".format(num_r10 / num_ques)) - f.write("\tr@10: {}\n".format(num_r10 / num_ques)) f.write("\tmeanR: {}\n".format(torch.mean(ranks))) f.write("\tmeanRR: {}\n".format(torch.mean(ranks.reciprocal()))) f.write('\n') From 49d373190545b288297282ce96557ac003f2cb9c Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Thu, 3 Dec 2020 19:45:22 -0500 Subject: [PATCH 38/49] Moves arguments into args.py Signed-off-by: Apoorva Beedu --- args.py | 86 ++++++++++++++++++++++++++++++++++++++++++ encoders/s3dg_video.py | 2 +- train.py | 78 +------------------------------------- 3 files changed, 89 insertions(+), 77 deletions(-) create mode 100644 args.py diff --git a/args.py b/args.py new file mode 100644 index 0000000..45c9726 --- /dev/null +++ b/args.py @@ -0,0 +1,86 @@ +import argparse + + +def get_args(parser, description='MILNCE'): + if parser is None: + parser = argparse.ArgumentParser(description=description) + + parser.add_argument_group('Input modalites arguments') + + parser.add_argument('-input_type', default='Q_DH_V', + choices=['Q_only', 'Q_DH', 'Q_A', 'Q_I', 'Q_V', 'Q_C_I', 'Q_DH_V', 'Q_DH_I', 'Q_V_A', 'Q_DH_V_A'], help='Specify the inputs') + + parser.add_argument_group('Encoder Decoder choice arguments') + parser.add_argument('-encoder', default='lf-ques-im-hist', + choices=['lf-ques-im-hist'], help='Encoder to use for training') + parser.add_argument('-concat_history', default=True, + help='True for lf encoding') + parser.add_argument('-decoder', default='disc', + choices=['disc'], help='Decoder to use for training') + + parser.add_argument_group('Optimization related arguments') + parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') + parser.add_argument('-batch_size', default=12, type=int, help='Batch size') + parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') + parser.add_argument('-lr_decay_rate', default=0.9, + type=float, help='Decay for lr') + parser.add_argument('-min_lr', default=5e-5, type=float, + help='Minimum learning rate') + parser.add_argument('-weight_init', default='xavier', + choices=['xavier', 'kaiming'], help='Weight initialization strategy') + parser.add_argument('-weight_decay', default=5e-4, + help='Weight decay for l2 regularization') + parser.add_argument('-overfit', action='store_true', + help='Overfit on 5 examples, meant for debugging') + parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') + + parser.add_argument_group('Checkpointing related arguments') + parser.add_argument('-load_path', default='', + help='Checkpoint to load path from') + parser.add_argument('-save_path', default='checkpoints/', + help='Path to save checkpoints') + parser.add_argument('-save_step', default=4, type=int, + help='Save checkpoint after every save_step epochs') + parser.add_argument('-eval_step', default=100, type=int, + help='Run validation after every eval_step iterations') + parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", + help=".h5 file path for the charades 
s3d features.") + parser.add_argument('-finetune', default=0, type=int, + help="When set true, the model finetunes the s3dg model for video") + + # S3DG parameters and dataloader + parser.add_argument('-num_frames', type=int, default=40, + help='num_frame') + parser.add_argument('-video_size', type=int, default=224, + help='random seed') + parser.add_argument('-fps', type=int, default=16, help='') + parser.add_argument('-crop_only', type=int, default=1, + help='random seed') + parser.add_argument('-center_crop', type=int, default=0, + help='random seed') + parser.add_argument('-random_flip', type=int, default=0, + help='random seed') + parser.add_argument('-video_root', default='data/videos') + parser.add_argument('-unfreeze_layers', default=1, type=int, + help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") + parser.add_argument("-text_encoder", default="lstm", + help="lstm or transformer", type=str) + parser.add_argument("-use_npy", default=1, type=int, + help="Uses npy instead of reading from videos") + parser.add_argument("-numpy_path", default="data/charades") + parser.add_argument("-num_workers", default=8, type=int) + + parser.add_argument_group('Visualzing related arguments') + parser.add_argument('-enableVis', type=int, default=1) + parser.add_argument('-visEnvName', type=str, default='s3d_Nofinetune') + parser.add_argument('-server', type=str, default='127.0.0.1') + parser.add_argument('-serverPort', type=int, default=8855) + parser.add_argument('-set_cuda_device', type=str, default='') + parser.add_argument("-seed", type=int, default=1, + help="random seed for initialization") + # ---------------------------------------------------------------------------- + # input arguments and options + # ---------------------------------------------------------------------------- + + args = parser.parse_args() + return args diff --git a/encoders/s3dg_video.py b/encoders/s3dg_video.py index 324a56f..dbc2665 100644 --- a/encoders/s3dg_video.py +++ b/encoders/s3dg_video.py @@ -265,7 +265,7 @@ def _space_to_depth(self, input): input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) return input - def forward(self, inputs1): + def forward(self, inputs): """Defines the S3DG base architecture. 
""" if self.space_to_depth: diff --git a/train.py b/train.py index 6333ac8..dadf8cf 100644 --- a/train.py +++ b/train.py @@ -13,6 +13,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm +from args import get_args from dataloader import VisDialDataset from decoders import Decoder from encoders import Encoder, LateFusionEncoder @@ -23,83 +24,8 @@ VisDialDataset.add_cmdline_args(parser) LateFusionEncoder.add_cmdline_args(parser) -parser.add_argument_group('Input modalites arguments') -parser.add_argument('-input_type', default='Q_DH_V', - choices=['Q_only', 'Q_DH', 'Q_A', 'Q_I', 'Q_V', 'Q_C_I', 'Q_DH_V', 'Q_DH_I', 'Q_V_A', 'Q_DH_V_A'], help='Specify the inputs') - -parser.add_argument_group('Encoder Decoder choice arguments') -parser.add_argument('-encoder', default='lf-ques-im-hist', - choices=['lf-ques-im-hist'], help='Encoder to use for training') -parser.add_argument('-concat_history', default=True, - help='True for lf encoding') -parser.add_argument('-decoder', default='disc', - choices=['disc'], help='Decoder to use for training') - -parser.add_argument_group('Optimization related arguments') -parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') -parser.add_argument('-batch_size', default=12, type=int, help='Batch size') -parser.add_argument('-lr', default=1e-4, type=float, help='Learning rate') -parser.add_argument('-lr_decay_rate', default=0.9, - type=float, help='Decay for lr') -parser.add_argument('-min_lr', default=5e-5, type=float, - help='Minimum learning rate') -parser.add_argument('-weight_init', default='xavier', - choices=['xavier', 'kaiming'], help='Weight initialization strategy') -parser.add_argument('-weight_decay', default=5e-4, - help='Weight decay for l2 regularization') -parser.add_argument('-overfit', action='store_true', - help='Overfit on 5 examples, meant for debugging') -parser.add_argument('-gpuid', default=0, type=int, help='GPU id to use') - -parser.add_argument_group('Checkpointing related arguments') -parser.add_argument('-load_path', default='', - help='Checkpoint to load path from') -parser.add_argument('-save_path', default='checkpoints/', - help='Path to save checkpoints') -parser.add_argument('-save_step', default=4, type=int, - help='Save checkpoint after every save_step epochs') -parser.add_argument('-eval_step', default=100, type=int, - help='Run validation after every eval_step iterations') -parser.add_argument('-input_vid', default="data/charades_s3d_mixed_5c_fps_16_num_frames_40_original_scaled", - help=".h5 file path for the charades s3d features.") -parser.add_argument('-finetune', default=0, type=int, - help="When set true, the model finetunes the s3dg model for video") - -# S3DG parameters and dataloader -parser.add_argument('-num_frames', type=int, default=40, - help='num_frame') -parser.add_argument('-video_size', type=int, default=224, - help='random seed') -parser.add_argument('-fps', type=int, default=16, help='') -parser.add_argument('-crop_only', type=int, default=1, - help='random seed') -parser.add_argument('-center_crop', type=int, default=0, - help='random seed') -parser.add_argument('-random_flip', type=int, default=0, - help='random seed') -parser.add_argument('-video_root', default='data/videos') -parser.add_argument('-unfreeze_layers', default=1, type=int, - help="if 1, unfreezes _5 layers, if 2 unfreezes _4 and _5 layers, if 0, unfreezes all layers") -parser.add_argument("-text_encoder", default="lstm", - help="lstm or transformer", type=str) -parser.add_argument("-use_npy", default=1, 
type=int, - help="Uses npy instead of reading from videos") -parser.add_argument("-numpy_path", default="data/charades") -parser.add_argument("-num_workers", default=8, type=int) - -parser.add_argument_group('Visualzing related arguments') -parser.add_argument('-enableVis', type=int, default=1) -parser.add_argument('-visEnvName', type=str, default='s3d_Nofinetune') -parser.add_argument('-server', type=str, default='127.0.0.1') -parser.add_argument('-serverPort', type=int, default=8855) -parser.add_argument('-set_cuda_device', type=str, default='') -parser.add_argument("-seed", type=int, default=1, - help="random seed for initialization") -# ---------------------------------------------------------------------------- -# input arguments and options -# ---------------------------------------------------------------------------- +args = get_args(parser) -args = parser.parse_args() args.numpy_path += "/num_frames_{}".format(args.num_frames) start_time = datetime.datetime.strftime( datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') From e2c7bc7036d75df1229b12aa4a131703aabfa442 Mon Sep 17 00:00:00 2001 From: Apoorva Beedu Date: Thu, 3 Dec 2020 20:29:40 -0500 Subject: [PATCH 39/49] Adds minor changes to evaluate.py, not tested Signed-off-by: Apoorva Beedu --- .evaluate.py.swo | Bin 16384 -> 0 bytes evaluate.py | 108 ++++------------------------------------------- 2 files changed, 9 insertions(+), 99 deletions(-) delete mode 100644 .evaluate.py.swo diff --git a/.evaluate.py.swo b/.evaluate.py.swo deleted file mode 100644 index 672ed65c7de046faacd13bf1ffcee5538f429721..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHNTZklA89tkMPhz4zh>CJDW9*)!r>1*$c9nrKj5A4$uCVK5XOVF@)Ks1BuARE2 zPSwozE-_C&hx11cIqh>G}-MAz>>r%rdzOm}B= zC%Q-#d{ei-&gJ|6^IuNSR&)K{FYr$~3k=6CjQ#$Gr^U0!u47-low2SPx=|J!r&p5> zwcTq?dT-x-#WpzS?WEa!bG|v}?kImw?zmy$=F&-fR?X!qKoj`;jf{N)c;yDho(4kTQ^3pbVeB%n4SX2*?YkNKE)W4N;BW6@ z>`7o7SO9JWe)Ueq9tSeubHFXYpWeaPAA#=!K5#qm*S9nFBcKD^0etOkjI9B$U(eWY zfX9GKz%uY5;00`sd=GdKxEuHw@CWeq6hOT7h`WFLNZm}y%yE5RxLK!&WSrM)vpaz= z6OkoJUTbh)w%sDkmukM7yK`RG&0JrqIW41VGmMWox-v|cYMC4RNyL?uKIBGsWn@G< z3{v!!_fokO#QC7tX5%V5@GvEr2`@|Dst(YyNaC}h^}VG_A<5s7x@Mslp~`VymOvH;yp^R?rv_i*SQ4M7iz!zMUcQkcMuvhm8D zI@gskY$xY=R!EpeSMn(FWysq>Eb}6kNQ+WL(dAYMiy%Pr-*Z>*kQvjZsw) zr$sK%^3Z2fdsc(MC`1%okiIzK38+@IPKp6NVxcLLEa(KW8w%yQq4cLpEl)Y!6PQsb z!BO7jZA^h?YL^*D!@MosyvU?-Fs~7dtu&X@^nKFi=T#D$7W0HFcSmxpv29wZqgNn_ z2Yz-x^_nbb< z1E1zns4;874&g5IKV;*EFhP074Ni8ISGSxG)5E}GVG9R+F)irv_IH>Lu2pAj#y%+u zmBz4llH?fA-=Yk!Hk`DE7mkVJpV4E~a+Cb`kkiE2 zIe9gNO8C7pEM&%sVj6GaMe()hjGfCUoAtU$;7PT#QG){m3AJ1*YM_>9ZV=ZtZAP+| zh5_Pn!tFoHas-00afO7@h-Id1bb<$2=}K=qO_2XlACw@JK`~3BvE7i^f$-(^TT*i- zdXqO3XT<7f#rh$FbuKUDLdG8QjZB0pk3^V>rqgUTTcU{DnUoi$2;CmuwuDV8a7on1 zX%ANe;}9pCb2@PmGEKa$Xf>Ubg6*VO_MZe2tJr}u&)BrIMiVD}o)OFDbgwzBo70LY z?};As2v=6dextct9<~PQ1?`^BS2W4v!|Xr@XuWR`Mjc>+okF<2yP(*8mmytOBwpl) z$n~f)QFRmfXV`k6R*{peB85b9I9|@k^Ce0G^tH2G7C5JM%~PL2vDAiTndDLNAWD-g z=gaGBtLAx;2Vwtlnk85vWd_vX%0p73M2;$n{BnbLWR68=?#$=D)aKJ~Vb$E5m!)m> zUzXEu@B>VgjrB8TJI=j4lv6>fyJA6dB^*H$J`_9{XOX|E|mwAUTaUSUKs52AkiRx^z1|L>s2eGc_6)&DDH z9eV}!{HwraKmi{I-Uqyh`u$npTL9JdP2eA>fC5+n z-ay^|Yv6}K477lkQ1?F%JOexg&|bhtfR|DKKM6brJOq3l*ahwcu7DHb<)_3=byN&g z3|zwuP!J~P=lDgG)^%(Ubl>1fD&smuGN)f~BOveAW=u|)C?|5ar5|Lfj%%vN9o)p& zHre;pOaobNpmLFHFWrZ#*nFR6NSo?Y=dvL0XNZ(VU3y^3K@JS(yuXYprlEm;95;o2{Q4zEjSnAG zK%B>rDCzFVdf5+E$~r+e{md+~;csN9T)0lz6n3l&MJ#-|B`t0lka<@K z(Gy9L>x63{T|Py-I{B~1a@Q0E1CjnC2zg128Z;NQVkoW5G1nwBV;_7$L)O!w;mQ>I 
zf~BA`k`1oat>5hZs{iH26sl`MTZ&Vgj_SIp+-Pp{+l(imudyY@NZKBQ7SXBp7o`+z z@ypCgLBDa5o53BX3l0-#4k^L5LuaF~ZQ0vIkAoY>D|~l^GqR?o_wx9sb-TmO9ca?# zxus=LM-9G{V9yZyutPr8Hd4+lP(iG6H}Y*D8*Jjx;Pw15d zhcZJxT&HBZPPw|-5GTu|eh2Fb9BgdrXVZN4h>o(k6gK2yK~aR#D@XPBP>;I9flE9P zSoV}wbvF!66=m$Ve+wE+)#6bovqB9%v_Sh$tv9y?*`1t3WbJdWS#vcNMx)M0BYP~D^U1=eAk JjY Date: Thu, 3 Dec 2020 22:03:33 -0500 Subject: [PATCH 40/49] bert_encoder --- dataloader.py | 10 +++--- decoders/disc.py | 29 ++++++++++----- encoders/lf.py | 19 +++++++--- envn.yml | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ train.py | 2 +- 5 files changed, 136 insertions(+), 18 deletions(-) create mode 100644 envn.yml diff --git a/dataloader.py b/dataloader.py index 74fc7fa..ee5f9bc 100644 --- a/dataloader.py +++ b/dataloader.py @@ -2,7 +2,7 @@ import json from six import iteritems from random import shuffle - +from transformers import BertTokenizer import h5py import hdfdict import numpy as np @@ -71,15 +71,15 @@ def __init__(self, args, subsets): print("Dataloader loading h5 file: {}".format(args.input_ques)) ques_file = h5py.File(args.input_ques, 'r') - if 'image' in args.input_type: + if 'I' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_img)) img_file = h5py.File(args.input_img, 'r') - if 'video' in args.input_type: + if 'V' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_vid)) vid_file = args.input_vid - if 'audio' in args.input_type: + if 'A' in args.input_type: print("Dataloader loading h5 file: {}".format(args.input_audio)) audio_file = h5py.File(args.input_audio, 'r') @@ -442,4 +442,4 @@ def _process_history(self, dtype): hist_len[th_id][round_id] = hlen self.data[dtype + '_hist'] = history - self.data[dtype + '_hist_len'] = hist_len + self.data[dtype + '_hist_len'] = hist_len \ No newline at end of file diff --git a/decoders/disc.py b/decoders/disc.py index 3fadae3..f267073 100644 --- a/decoders/disc.py +++ b/decoders/disc.py @@ -30,19 +30,32 @@ def forward(self, enc_out, batch): options = batch['opt'] options_len = batch['opt_len'] # word embed options - options = options.view(options.size(0) * options.size(1), options.size(2), -1) - options_len = options_len.view(options_len.size(0) * options_len.size(1), -1) - batch_size, num_options, max_opt_len = options.size() - options = options.contiguous().view(-1, num_options * max_opt_len) - options = self.word_embed(options) - options = options.view(batch_size, num_options, max_opt_len, -1) + if self.args.text_encoder == 'BERT': + batch_size, rounds, num_options, num_words = options.size() + options_embeds = torch.zeros([batch_size * rounds, num_options, num_words, self.args.embed_size], + dtype=torch.float) + options = options.view(batch_size*rounds, num_options, -1) + for i in range(batch_size*rounds): + opt_embed = self.word_embed(options[i])['last_hidden_state'].detach().cpu() + opt_embed = self.word_embed(options[i])['last_hidden_state'].detach().cpu() + options_embeds[i, :] = opt_embed + options_embeds = options_embeds.view(batch_size * rounds, num_options, num_words, -1) + + else: + options = options.view(options.size(0) * options.size(1), options.size(2), -1) + batch_size, num_options, max_opt_len = options.size() + options = options.contiguous().view(-1, num_options * max_opt_len) + options_embeds = self.word_embed(options) + options_embeds = options_embeds.view(batch_size, num_options, max_opt_len, -1) + + options_len = options_len.view(options_len.size(0) * options_len.size(1), -1) # score each option scores = [] for opt_id in 
range(num_options): - opt = options[:, opt_id, :, :] + opt = options_embeds[:, opt_id, :, :] opt_len = options_len[:, opt_id] - opt_embed = self.option_rnn(opt, opt_len) + opt_embed = self.option_rnn(opt.to(0), opt_len) scores.append(torch.sum(opt_embed * enc_out, 1)) scores = torch.stack(scores, 1) diff --git a/encoders/lf.py b/encoders/lf.py index cf10cab..8bed2ed 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -2,10 +2,10 @@ from torch import nn from torch.nn import functional as F -from utils import DynamicRNN +from utils import DynamicRNN from encoders.s3dg_video import S3D - +from transformers import BertTokenizer, BertModel class LateFusionEncoder(nn.Module): @@ -33,8 +33,13 @@ def __init__(self, args): super().__init__() self.args = args - self.word_embed = nn.Embedding( - args.vocab_size, args.embed_size, padding_idx=0) + if args.text_encoder == 'lstm': + args.embed_size = 300 + self.word_embed = nn.Embedding( + args.vocab_size, args.embed_size, padding_idx=0) + else: + args.embed_size = 768 + self.word_embed = BertModel.from_pretrained('bert-base-uncased') if self.args.finetune: self.video_embed = S3D( @@ -125,6 +130,10 @@ def forward(self, batch): # embed history hist = hist.view(-1, hist.size(2)) hist_embed = self.word_embed(hist) + + if self.args.text_encoder == 'BERT': + hist_embed = hist_embed['last_hidden_state'] + hist_embed = self.hist_rnn(hist_embed, batch['hist_len']) ques = batch['ques'] @@ -132,6 +141,8 @@ def forward(self, batch): # embed questions ques = ques.view(-1, ques.size(2)) ques_embed = self.word_embed(ques) + if self.args.text_encoder == 'BERT': + ques_embed = ques_embed['last_hidden_state'] ques_embed = self.ques_rnn(ques_embed, batch['ques_len']) if self.args.input_type == 'Q_only': diff --git a/envn.yml b/envn.yml new file mode 100644 index 0000000..015f01a --- /dev/null +++ b/envn.yml @@ -0,0 +1,94 @@ +name: visdial-bert +channels: + - pytorch + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _pytorch_select=0.2=gpu_0 + - blas=1.0=mkl + - ca-certificates=2020.10.14=0 + - certifi=2020.11.8=py38h06a4308_0 + - cffi=1.14.3=py38he30daa8_0 + - cudatoolkit=10.2.89=hfd86e86_1 + - freetype=2.10.4=h5ab3b9f_0 + - intel-openmp=2020.2=254 + - jpeg=9b=h024ee3a_2 + - lcms2=2.11=h396b838_0 + - ld_impl_linux-64=2.33.1=h53a641e_7 + - libedit=3.1.20191231=h14c3975_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.1.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_1 + - libuv=1.40.0=h7b6447c_0 + - lz4-c=1.9.2=heb0550a_3 + - mkl=2020.2=256 + - mkl-service=2.3.0=py38he904b0f_0 + - mkl_fft=1.2.0=py38h23d657b_0 + - mkl_random=1.1.1=py38h0573a6f_0 + - ncurses=6.2=he6710b0_1 + - ninja=1.10.1=py38hfd86e86_0 + - numpy=1.19.2=py38h54aff64_0 + - numpy-base=1.19.2=py38hfa32c7d_0 + - olefile=0.46=py_0 + - openssl=1.1.1h=h7b6447c_0 + - pillow=8.0.1=py38he98fc37_0 + - pip=20.2.4=py38h06a4308_0 + - pycparser=2.20=py_2 + - python=3.8.5=h7579374_1 + - pytorch=1.7.0=py3.8_cuda10.2.89_cudnn7.6.5_0 + - readline=8.0=h7b6447c_0 + - setuptools=50.3.1=py38h06a4308_1 + - six=1.15.0=py38h06a4308_0 + - sqlite=3.33.0=h62c20be_0 + - tk=8.6.10=hbc83047_0 + - torchaudio=0.7.0=py38 + - torchvision=0.8.1=py38_cu102 + - typing_extensions=3.7.4.3=py_0 + - wheel=0.35.1=py_0 + - xz=5.2.5=h7b6447c_0 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.5=h9ceee32_0 + - pip: + - chardet==3.0.4 + - click==7.1.2 + - configparser==5.0.1 + - docker-pycreds==0.4.0 + - ffmpeg-python==0.2.0 + - future==0.18.2 + - gitdb==4.0.5 + - gitpython==3.1.11 + - h5py==3.1.0 + - 
h5py-wrapper==1.1.0 + - hdfdict==0.3.1 + - idna==2.10 + - install==1.3.4 + - jsonpatch==1.26 + - jsonpointer==2.0 + - pandas==1.1.4 + - pathtools==0.1.2 + - promise==2.3 + - protobuf==3.13.0 + - psutil==5.7.3 + - pytest-runner==5.2 + - python-dateutil==2.8.1 + - pytz==2020.4 + - pyyaml==5.3.1 + - pyzmq==20.0.0 + - requests==2.25.0 + - scipy==1.5.4 + - sentry-sdk==0.19.3 + - shortuuid==1.0.1 + - smmap==3.0.4 + - subprocess32==3.5.4 + - torchfile==0.1.0 + - tornado==6.1 + - tqdm==4.51.0 + - urllib3==1.26.2 + - visdom==0.1.8.9 + - wandb==0.10.10 + - watchdog==0.10.3 + - websocket-client==0.57.0 +prefix: /nethome/halamri3/anaconda3/envs/visdial-bert + diff --git a/train.py b/train.py index 067de2b..fe1a30c 100644 --- a/train.py +++ b/train.py @@ -159,7 +159,7 @@ shuffle=False, collate_fn=dataset.collate_fn) -dataset_test = VisDialDataset(args, ['test']) +dataset_test = VisDialDataset(args, ['test']) dataloader_test = DataLoader(dataset_test, batch_size=args.batch_size, shuffle=False, From 092736b4f83ee3a4449874814acfa40ddc99bccd Mon Sep 17 00:00:00 2001 From: halamri3 Date: Fri, 4 Dec 2020 07:29:49 -0500 Subject: [PATCH 41/49] freeze all bert layers --- decoders/disc.py | 8 -------- encoders/lf.py | 3 +++ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/decoders/disc.py b/decoders/disc.py index 50468bc..a1d0c7c 100644 --- a/decoders/disc.py +++ b/decoders/disc.py @@ -31,14 +31,6 @@ def forward(self, enc_out, batch): options = batch['opt'] options_len = batch['opt_len'] # word embed options - options = options.view(options.size( - 0) * options.size(1), options.size(2), -1) - options_len = options_len.view( - options_len.size(0) * options_len.size(1), -1) - batch_size, num_options, max_opt_len = options.size() - options = options.contiguous().view(-1, num_options * max_opt_len) - options = self.word_embed(options) - options = options.view(batch_size, num_options, max_opt_len, -1) if self.args.text_encoder == 'BERT': batch_size, rounds, num_options, num_words = options.size() diff --git a/encoders/lf.py b/encoders/lf.py index f4a118d..f9a96af 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -39,6 +39,9 @@ def __init__(self, args): else: args.embed_size = 768 self.word_embed = BertModel.from_pretrained('bert-base-uncased') + # Freeze all the layers and use bert to encode the text for now + for param in self.word_embed.parameters(): + param.requires_grad = False if self.args.finetune: self.video_embed = S3D( From 559507be8d921125a3e24879e38933d37f0fe7ad Mon Sep 17 00:00:00 2001 From: Huda Abdulhadi D Alamri Date: Fri, 4 Dec 2020 15:37:07 -0500 Subject: [PATCH 42/49] add the option_embed to the current device --- decoders/disc.py | 6 +++--- evaluate.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/decoders/disc.py b/decoders/disc.py index a1d0c7c..e6c0bb8 100644 --- a/decoders/disc.py +++ b/decoders/disc.py @@ -38,8 +38,7 @@ def forward(self, enc_out, batch): dtype=torch.float) options = options.view(batch_size*rounds, num_options, -1) for i in range(batch_size*rounds): - opt_embed = self.word_embed(options[i])['last_hidden_state'].detach().cpu() - opt_embed = self.word_embed(options[i])['last_hidden_state'].detach().cpu() + opt_embed = self.word_embed(options[i])['last_hidden_state'] options_embeds[i, :] = opt_embed options_embeds = options_embeds.view(batch_size * rounds, num_options, num_words, -1) @@ -56,7 +55,8 @@ def forward(self, enc_out, batch): for opt_id in range(num_options): opt = options_embeds[:, opt_id, :, :] opt_len = options_len[:, opt_id] - 
opt_embed = self.option_rnn(opt.to(0), opt_len) + device = opt_len.device + opt_embed = self.option_rnn(opt.to(device), opt_len) scores.append(torch.sum(opt_embed * enc_out, 1)) scores = torch.stack(scores, 1) diff --git a/evaluate.py b/evaluate.py index e736c75..ff413ce 100644 --- a/evaluate.py +++ b/evaluate.py @@ -115,7 +115,7 @@ def repeat_tensors(batch, num_repeat): new_batch[k] = torch.cat((new_batch[k], v[-1].unsqueeze(0)), 0) return new_batch - +''' if args.use_gt: # ------------------------------------------------------------------------ # calculate automatic metrics and finish @@ -174,6 +174,7 @@ def repeat_tensors(batch, num_repeat): # read saved model and args # ---------------------------------------------------------------------------- +''' for checkpoint in checkpoints: print('checkpoint:', checkpoint) model_path = os.path.join(args.load_path, checkpoint) From e7b7d525e70a02d8d4edf68ff7156ef9049076fd Mon Sep 17 00:00:00 2001 From: halamri3 Date: Fri, 4 Dec 2020 16:21:14 -0500 Subject: [PATCH 43/49] unfreeze bert layers --- args.py | 3 ++- encoders/lf.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/args.py b/args.py index 45c9726..1231d59 100644 --- a/args.py +++ b/args.py @@ -17,7 +17,8 @@ def get_args(parser, description='MILNCE'): help='True for lf encoding') parser.add_argument('-decoder', default='disc', choices=['disc'], help='Decoder to use for training') - + parser.add_argument('-finetune_textEncoder', default=0, + help= 'Finetune the text encoder') parser.add_argument_group('Optimization related arguments') parser.add_argument('-num_epochs', default=45, type=int, help='Epochs') parser.add_argument('-batch_size', default=12, type=int, help='Batch size') diff --git a/encoders/lf.py b/encoders/lf.py index f9a96af..82ebf37 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -40,8 +40,12 @@ def __init__(self, args): args.embed_size = 768 self.word_embed = BertModel.from_pretrained('bert-base-uncased') # Freeze all the layers and use bert to encode the text for now - for param in self.word_embed.parameters(): - param.requires_grad = False + if not self.args.finetune_textEncoder: + print('Freezing all bert layers') + for param in self.word_embed.parameters(): + param.requires_grad = False + else: + print('Finetuning text encoder layers') if self.args.finetune: self.video_embed = S3D( From ca6ba06867825dad8f1b739102baddc3ca710f5d Mon Sep 17 00:00:00 2001 From: halamri3 Date: Fri, 4 Dec 2020 17:52:02 -0500 Subject: [PATCH 44/49] freeze bert.encoder 10 layers, bert.embedding --- encoders/lf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/encoders/lf.py b/encoders/lf.py index 82ebf37..7a84f9c 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -37,6 +37,8 @@ def __init__(self, args): self.word_embed = nn.Embedding( args.vocab_size, args.embed_size, padding_idx=0) else: + freeze_layers = True + freeze_embeddings = True args.embed_size = 768 self.word_embed = BertModel.from_pretrained('bert-base-uncased') # Freeze all the layers and use bert to encode the text for now @@ -46,6 +48,20 @@ def __init__(self, args): param.requires_grad = False else: print('Finetuning text encoder layers') + for param in list(self.word_embed.embeddings.parameters()): + param.requires_grad = False + print("Froze Embedding Layer") + layer_indexes = "1,2,3,4,5,6,7,8,9" + layer_indexes = [int(x) for x in freeze_layers.split(",")] + for layer_idx in layer_indexes: + for param in list(self.word_embed.encoder.layer[layer_idx].parameters()): + 
param.requires_grad = False
+                print("Froze Layer: ", layer_idx)
+
+
+
+
+
 
         if self.args.finetune:
             self.video_embed = S3D(

From e604d160ba533cfb5ddae3155b97704495d038ef Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri <halamri3@login-ice.pace.gatech.edu>
Date: Fri, 4 Dec 2020 18:53:36 -0500
Subject: [PATCH 45/49] freeze all layers

---
 encoders/lf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encoders/lf.py b/encoders/lf.py
index 7a84f9c..8606b10 100644
--- a/encoders/lf.py
+++ b/encoders/lf.py
@@ -51,7 +51,7 @@ def __init__(self, args):
             for param in list(self.word_embed.embeddings.parameters()):
                 param.requires_grad = False
             print("Froze Embedding Layer")
-            layer_indexes = "1,2,3,4,5,6,7,8,9"
+            freeze_layers = "1,2,3,4,5,6,7,8,9,10,11"
             layer_indexes = [int(x) for x in freeze_layers.split(",")]
             for layer_idx in layer_indexes:
                 for param in list(self.word_embed.encoder.layer[layer_idx].parameters()):

From 520eee8abb3d0e29a2ff846a0872ae31abcdc8a0 Mon Sep 17 00:00:00 2001
From: Huda Abdulhadi D Alamri <halamri3@login-ice.pace.gatech.edu>
Date: Mon, 7 Dec 2020 13:55:49 -0500
Subject: [PATCH 46/49] update for bert

---
 .args.py.swn     | Bin 0 -> 16384 bytes
 .args.py.swo     | Bin 0 -> 16384 bytes
 args.py          |  7 ++++++-
 decoders/disc.py |  4 ++--
 encoders/lf.py   | 11 +++--------
 evaluate.py      |  7 +++----
 train.py         |  4 ++--
 7 files changed, 16 insertions(+), 17 deletions(-)
 create mode 100644 .args.py.swn
 create mode 100644 .args.py.swo

diff --git a/.args.py.swn b/.args.py.swn
new file mode 100644
index 0000000000000000000000000000000000000000..7ed3bf78d3b4e0cac2fb73f4ba4f780ef1259902
GIT binary patch
literal 16384
[16 KB base85-encoded vim swap-file literal omitted]

literal 0
HcmV?d00001

diff --git a/.args.py.swo b/.args.py.swo
new file mode 100644
index 0000000000000000000000000000000000000000..ae5f46e622cd6d9d08c2db156b0ae175b308d35d
GIT binary patch
literal 16384
[16 KB base85-encoded vim swap-file literal omitted]

literal 0
HcmV?d00001

diff --git a/args.py b/args.py
index 1231d59..d6e2a6a 100644
--- a/args.py
+++ b/args.py
@@ -17,7 +17,7 @@ def get_args(parser, description='MILNCE'):
                         help='True for lf encoding')
     parser.add_argument('-decoder', default='disc', choices=['disc'],
                         help='Decoder to use for training')
-    parser.add_argument('-finetune_textEncoder', default=0,
+    parser.add_argument('-finetune_textEncoder', default=0, type=int,
                         help= 'Finetune the text encoder')
     parser.add_argument_group('Optimization related arguments')
     parser.add_argument('-num_epochs', default=45, type=int, help='Epochs')
     parser.add_argument('-batch_size', default=12, type=int, help='Batch size')
@@ -79,6 +79,11 @@ def get_args(parser, description='MILNCE'):
     parser.add_argument('-set_cuda_device', type=str, default='')
     parser.add_argument("-seed", type=int, default=1,
                         help="random seed for initialization")
+
+
+
+    parser.add_argument('-use_gt', 
action='store_true', help='Whether to use ground truth for retriveing ranks') + parser.add_argument('--split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') # ---------------------------------------------------------------------------- # input arguments and options # ---------------------------------------------------------------------------- diff --git a/decoders/disc.py b/decoders/disc.py index e6c0bb8..1f9da43 100644 --- a/decoders/disc.py +++ b/decoders/disc.py @@ -38,8 +38,8 @@ def forward(self, enc_out, batch): dtype=torch.float) options = options.view(batch_size*rounds, num_options, -1) for i in range(batch_size*rounds): - opt_embed = self.word_embed(options[i])['last_hidden_state'] - options_embeds[i, :] = opt_embed + options_embeds[i,:] = self.word_embed(options[i])['last_hidden_state'] + #options_embeds[i, :] = opt_embed options_embeds = options_embeds.view(batch_size * rounds, num_options, num_words, -1) else: diff --git a/encoders/lf.py b/encoders/lf.py index 8606b10..76eb4c0 100644 --- a/encoders/lf.py +++ b/encoders/lf.py @@ -37,7 +37,6 @@ def __init__(self, args): self.word_embed = nn.Embedding( args.vocab_size, args.embed_size, padding_idx=0) else: - freeze_layers = True freeze_embeddings = True args.embed_size = 768 self.word_embed = BertModel.from_pretrained('bert-base-uncased') @@ -51,18 +50,14 @@ def __init__(self, args): for param in list(self.word_embed.embeddings.parameters()): param.requires_grad = False print("Froze Embedding Layer") - freeze_layers = "1,2,3,4,5,6,7,8,9,10,11" + freeze_layers = "0,1,2,3,4,5,6,7,8,9" layer_indexes = [int(x) for x in freeze_layers.split(",")] for layer_idx in layer_indexes: for param in list(self.word_embed.encoder.layer[layer_idx].parameters()): param.requires_grad = False print("Froze Layer: ", layer_idx) - - - - - - + for name, param in self.word_embed.named_parameters(): + print(name, param.requires_grad) if self.args.finetune: self.video_embed = S3D( dict_path='data/s3d_dict.npy', space_to_depth=True) diff --git a/evaluate.py b/evaluate.py index ff413ce..0cb0601 100644 --- a/evaluate.py +++ b/evaluate.py @@ -34,6 +34,7 @@ # ---------------------------------------------------------------------------- args = parser.parse_args() +model_args = args ''' log_path = os.path.join(args.load_path, 'eval_results.log') logging.basicConfig(filename='eval_results.log') @@ -219,8 +220,7 @@ def repeat_tensors(batch, num_repeat): batch[key] = batch[key].cuda() if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - num_repeat = args.num_gpu - \ - batch["vid_feat"].shape[0] % args.num_gpu + num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out, _ = model(new_batch) @@ -243,8 +243,7 @@ def repeat_tensors(batch, num_repeat): batch[key] = batch[key].cuda() if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - num_repeat = args.num_gpu - \ - batch["vid_feat"].shape[0] % args.num_gpu + num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out, _ = model(new_batch) diff --git a/train.py b/train.py index ec288c5..388df52 100644 --- a/train.py +++ b/train.py @@ -31,8 +31,8 @@ if args.save_path == 'checkpoints/': # args.save_path += start_time - args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}_batch_size_{8}'.format( - 
args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy, args.batch_size) + args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}_batch_size_{8}_finetuneBert_{9}'.format( + args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy, args.batch_size, args.finetune_textEncoder) # ------------------------------------------------------------------------------------- # setting visdom args From 044f1c55c3d052b0e516b67a6943e61f87b545c7 Mon Sep 17 00:00:00 2001 From: halamri3 Date: Mon, 7 Dec 2020 18:27:06 -0500 Subject: [PATCH 47/49] evaluate --- evaluate.py | 24 ++++++++++++++---------- train.py | 1 - 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/evaluate.py b/evaluate.py index 0cb0601..8e95084 100644 --- a/evaluate.py +++ b/evaluate.py @@ -75,8 +75,7 @@ # ---------------------------------------------------------------------------- # setup the model # ---------------------------------------------------------------------------- - - +''' model = AVSD(model_args) model._load_state_dict_(components) print("Loaded model from {}".format(args.load_path)) @@ -84,16 +83,11 @@ if args.gpuid >= 0: model = torch.nn.DataParallel(model, output_device=0, dim=0) model = model.to(device) - +''' # ---------------------------------------------------------------------------- # evaluation # ---------------------------------------------------------------------------- -print("Evaluation start time: {}".format( - datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) -model.eval() - - def convert_list_to_tensor(batch): new_batch = {} for k, v in batch.items(): @@ -176,14 +170,23 @@ def repeat_tensors(batch, num_repeat): # read saved model and args # ---------------------------------------------------------------------------- ''' + +print("Evaluation start time: {}".format( + datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) + for checkpoint in checkpoints: + print('checkpoint:', checkpoint) model_path = os.path.join(args.load_path, checkpoint) components = torch.load(model_path) model_args = components['model_args'] + + model = AVSD(model_args) + model._load_state_dict_(components) model_args.gpuid = args.gpuid model_args.batch_size = args.batch_size + for arg in vars(args): print('{:<20}: {}'.format(arg, getattr(args, arg))) @@ -191,8 +194,7 @@ def repeat_tensors(batch, num_repeat): # setup the model # ---------------------------------------------------------------------------- - model = AVSD(model_args) - model._load_state_dict_(components) + print("Loaded model from {}".format(args.load_path)) if args.gpuid >= 0: @@ -271,3 +273,5 @@ def repeat_tensors(batch, num_repeat): print("Writing ranks to {}".format(args.save_path)) os.makedirs(os.path.dirname(args.save_path), exist_ok=True) json.dump(ranks_json, open(args.save_path, 'w')) + + diff --git a/train.py b/train.py index 388df52..e2e5031 100644 --- a/train.py +++ b/train.py @@ -33,7 +33,6 @@ # args.save_path += start_time args.save_path += 'input_type_{0}_s3d_mixed_5c_fps_{1}_num_frames_{2}_text_encoder_{3}_lr_{4}_unfreeze_layer_{5}_finetune_{6}_use_npy_{7}_batch_size_{8}_finetuneBert_{9}'.format( args.input_type, args.fps, args.num_frames, args.text_encoder, args.lr, args.unfreeze_layers, args.finetune, args.use_npy, args.batch_size, args.finetune_textEncoder) - # 
------------------------------------------------------------------------------------- # setting visdom args # ------------------------------------------------------------------------------------- From 6be2f14afe5d28b508131ef15ab1edc9786a8805 Mon Sep 17 00:00:00 2001 From: halamri3 Date: Sun, 10 Jan 2021 18:02:10 -0500 Subject: [PATCH 48/49] fix a minor bug --- args.py | 5 +-- evaluate.py | 34 +++++++++++---- utils/eval_utils.py | 10 ++++- utils/visualize.py | 100 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 134 insertions(+), 15 deletions(-) diff --git a/args.py b/args.py index d6e2a6a..0291066 100644 --- a/args.py +++ b/args.py @@ -79,9 +79,8 @@ def get_args(parser, description='MILNCE'): parser.add_argument('-set_cuda_device', type=str, default='') parser.add_argument("-seed", type=int, default=1, help="random seed for initialization") - - - + + parser.add_argument('-save_ranks', action='store_true', help='Whether to save retrieved ranks') parser.add_argument('-use_gt', action='store_true', help='Whether to use ground truth for retriveing ranks') parser.add_argument('--split', default='test', choices=['val', 'test', 'train'], help='Split to evaluate on') # ---------------------------------------------------------------------------- diff --git a/evaluate.py b/evaluate.py index 8e95084..e5e361a 100644 --- a/evaluate.py +++ b/evaluate.py @@ -16,7 +16,9 @@ from decoders import Decoder from encoders import Encoder, LateFusionEncoder from models import AVSD -from utils import get_gt_ranks, process_ranks, scores_to_ranks +from utils import get_gt_ranks, process_ranks, scores_to_ranks, visualize +import pprint + parser = argparse.ArgumentParser() VisDialDataset.add_cmdline_args(parser) @@ -24,6 +26,11 @@ args = get_args(parser) +viz = visualize.VisdomVisualize( + env_name=args.visEnvName, + server=args.server, + port=args.serverPort) + # seed for reproducibility torch.manual_seed(1234) torch.backends.cudnn.deterministic = True @@ -174,13 +181,17 @@ def repeat_tensors(batch, num_repeat): print("Evaluation start time: {}".format( datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S'))) +i=0 + for checkpoint in checkpoints: print('checkpoint:', checkpoint) model_path = os.path.join(args.load_path, checkpoint) components = torch.load(model_path) model_args = components['model_args'] - + if i == 0: + viz.showText(pprint.pformat(args, indent=4)) + i +=1 model = AVSD(model_args) model._load_state_dict_(components) model_args.gpuid = args.gpuid @@ -210,6 +221,7 @@ def repeat_tensors(batch, num_repeat): model.eval() if args.use_gt: + viz.save() # ------------------------------------------------------------------------ # calculate automatic metrics and finish # ------------------------------------------------------------------------ @@ -221,16 +233,22 @@ def repeat_tensors(batch, num_repeat): if args.gpuid >= 0: batch[key] = batch[key].cuda() - if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu - batch = repeat_tensors(batch, num_repeat) new_batch = convert_list_to_tensor(batch) dec_out, _ = model(new_batch) ranks = scores_to_ranks(dec_out.data) gt_ranks = get_gt_ranks(ranks, batch['ans_ind'].data) all_ranks.append(gt_ranks) all_ranks = torch.cat(all_ranks, 0) - process_ranks(all_ranks, args.load_path, checkpoint[6:-4]) + + all_metrics = process_ranks(all_ranks, args.load_path, checkpoint[6:-4]) + iter_id = checkpoint[6:-4] + for metric_name, metric_value in all_metrics.items(): + 
print(f"{metric_name}: {metric_value}") + if 'round' in metric_name: + viz.plotLine(iter_id, metric_value, 'Retrieval Round Val Metrics Round -' + metric_name.split('_')[-1], + metric_name) + else: + viz.plotLine(iter_id.split('_')[1], metric_value, 'Retrieval Val Metrics', metric_name) gc.collect() else: # ------------------------------------------------------------------------ @@ -244,9 +262,7 @@ def repeat_tensors(batch, num_repeat): if args.gpuid >= 0: batch[key] = batch[key].cuda() - if not batch["vid_feat"].shape[0] % args.num_gpu == 0: - num_repeat = args.num_gpu - batch["vid_feat"].shape[0] % args.num_gpu - batch = repeat_tensors(batch, num_repeat) + new_batch = convert_list_to_tensor(batch) dec_out, _ = model(new_batch) ranks = scores_to_ranks(dec_out.data) diff --git a/utils/eval_utils.py b/utils/eval_utils.py index 4205e9c..0d69338 100644 --- a/utils/eval_utils.py +++ b/utils/eval_utils.py @@ -39,8 +39,13 @@ def process_ranks(ranks, save_path, epoch): f.write("\tmeanRR: {}\n".format(torch.mean(ranks.reciprocal()))) f.write('\n') f.close() - - + + metrics = { "r@1": num_r1 / num_ques, + "r@5": num_r5 / num_ques, + "r@10": num_r10 / num_ques, + "mean": torch.mean(ranks), + "mrr": torch.mean(ranks.reciprocal()) } + print("\tNo. questions: {}".format(num_ques)) print("\tr@1: {}".format(num_r1 / num_ques)) print("\tr@5: {}".format(num_r5 / num_ques)) @@ -48,6 +53,7 @@ def process_ranks(ranks, save_path, epoch): print("\tmeanR: {}".format(torch.mean(ranks))) print("\tmeanRR: {}".format(torch.mean(ranks.reciprocal()))) + return metrics def scores_to_ranks(scores): # sort in descending order - largest score gets highest rank diff --git a/utils/visualize.py b/utils/visualize.py index d69622d..97ca0fb 100644 --- a/utils/visualize.py +++ b/utils/visualize.py @@ -13,6 +13,8 @@ def __init__(self, env_name='main', server="0.0.0.0", port=8899): ) self.plot_list = {} self.env = env_name + self.is_enabled = True + def plotLine(self, scalar_name, split, title_name, x ,y): if scalar_name not in self.plot_list: @@ -33,4 +35,100 @@ def writeText(self, dict): output = '' for arg in vars(dict): output=output+('{:<20}: {}{}'.format(arg, getattr(dict, arg),"\n")) - self.viz.text(output) \ No newline at end of file + self.viz.text(output) + + +class VisdomVisualize(): + def __init__(self, + env_name='main', + server="http://127.0.0.1", + port=8855, + enable=True): + ''' + Initialize a visdom server on server:port + ''' + print("Initializing visdom env [%s]" % env_name) + self.is_enabled = enable + self.env_name = env_name + if self.is_enabled: + self.viz = Visdom( + port=port, + env=env_name, + server=server, + ) + else: + self.viz = None + self.wins = {} + + def linePlot(self, x, y, key, line_name, xlabel="Epochs"): + ''' + Add or update a line plot on the visdom server self.viz + Argumens: + x : Scalar -> X-coordinate on plot + y : Scalar -> Value at x + key : Name of plot/graph + line_name : Name of line within plot/graph + xlabel : Label for x-axis (default: # Iterations) + Plots and lines are created if they don't exist, otherwise + they are updated. 
+ ''' + key = str(key) + if self.is_enabled: + if key in self.wins.keys(): + self.viz.line( + X = np.array([x]), + Y = np.array([y]), + win = self.wins[key], + update = 'append', + name = line_name, + opts = dict(showlegend=True), + ) + else: + self.wins[key] = self.viz.line( + X = np.array([x]), + Y = np.array([y]), + win = key, + name = line_name, + opts = { + 'xlabel': xlabel, + 'ylabel': key, + 'title': key, + 'showlegend': True, + # 'legend': [line_name], + } + ) + + def showText(self, text, key): + ''' + Created a named text window or updates an existing one with + the name == key + ''' + key = str(key) + if self.is_enabled: + win = self.wins[key] if key in self.wins else None + self.wins[key] = self.viz.text(text, win=win) + + def addText(self, text): + ''' + Adds an unnamed text window without keeping track of win id + ''' + if self.is_enabled: + self.viz.text(text) + + def save(self): + if self.is_enabled: + self.viz.save([self.env_name]) + + def histPlot(self, x, key): + key = str(key) + if self.is_enabled: + if key in self.wins.keys(): + self.viz.histogram( + X = x.cpu().numpy(), + win = self.wins[key], + ) + else: + self.wins[key] = self.viz.histogram( + X = x.cpu().numpy(), + win = key + ) From f1b7a388c5727c652d9643740194302622532e15 Mon Sep 17 00:00:00 2001 From: halamri3 Date: Mon, 1 Feb 2021 12:19:11 -0500 Subject: [PATCH 49/49] update train --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index e2e5031..419c124 100644 --- a/train.py +++ b/train.py @@ -232,7 +232,7 @@ def repeat_tensors(batch, num_repeat): validation_losses = [] model.eval() model.zero_grad() - for _, val_batch in tqdm(enumerate(dataloader_val)): + for _, val_batch in enumerate(dataloader_val): for key in val_batch: if not isinstance(val_batch[key], list): val_batch[key] = Variable(val_batch[key])
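The evaluation changes in PATCH 47 and PATCH 48 rely on scores_to_ranks and get_gt_ranks from utils, and on the metrics dictionary that process_ranks now returns, but only process_ranks appears in this series. The sketch below is illustrative only: it assumes decoder scores of shape (num_questions, num_options), 1-based ranks, and a hypothetical helper name retrieval_metrics; it is written to be consistent with how evaluate.py and utils/eval_utils.py call these functions, not as a copy of the repository's actual implementation.

# Illustrative sketch only; assumed shapes and the retrieval_metrics name are
# not taken from the repository's utils/eval_utils.py.
import torch

def scores_to_ranks(scores):
    # scores: (num_questions, num_options); a larger score means a better option.
    _, sorted_idx = scores.sort(dim=1, descending=True)
    ranks = torch.empty_like(sorted_idx)
    positions = torch.arange(1, scores.size(1) + 1, device=scores.device)
    # The option sorted into position j (0-based) receives rank j + 1.
    ranks.scatter_(1, sorted_idx, positions.expand_as(sorted_idx))
    return ranks

def get_gt_ranks(ranks, ans_ind):
    # ans_ind: (num_questions,) index of the ground-truth option per question.
    return ranks.gather(1, ans_ind.view(-1, 1)).view(-1)

def retrieval_metrics(gt_ranks):
    # Same quantities process_ranks reports: r@1/5/10, mean rank, mean reciprocal rank.
    gt_ranks = gt_ranks.float()
    return {
        'r@1': (gt_ranks <= 1).float().mean().item(),
        'r@5': (gt_ranks <= 5).float().mean().item(),
        'r@10': (gt_ranks <= 10).float().mean().item(),
        'mean': gt_ranks.mean().item(),
        'mrr': gt_ranks.reciprocal().mean().item(),
    }

Under those assumptions, the per-checkpoint plotting added in PATCH 48 only needs this metrics dictionary plus the checkpoint's iteration id to drive the Visdom line plots.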