@@ -168,7 +168,7 @@ def __getitem__(self, idx):
168168 start_time = time .time ()
169169 audio = self .load_file (audio_filename )
170170
171- audio , t_start , t_end , seconds_start , seconds_total = self .pad_crop (audio )
171+ audio , t_start , t_end , seconds_start , seconds_total , padding_mask = self .pad_crop (audio )
172172
173173 # Run augmentations on this sample (including random crop)
174174 if self .augs is not None :
@@ -190,6 +190,7 @@ def __getitem__(self, idx):
190190 info ["timestamps" ] = (t_start , t_end )
191191 info ["seconds_start" ] = seconds_start
192192 info ["seconds_total" ] = seconds_total
193+ info ["padding_mask" ] = padding_mask
193194
194195 end_time = time .time ()
195196
@@ -199,6 +200,9 @@ def __getitem__(self, idx):
199200 custom_metadata = self .custom_metadata_fn (info , audio )
200201 info .update (custom_metadata )
201202
203+ if "__reject__" in info and info ["__reject__" ]:
204+ return self [random .randrange (len (self ))]
205+
202206 return (audio , info )
203207 except Exception as e :
204208 print (f'Couldn\' t load file { audio_filename } : { e } ' )
@@ -339,8 +343,12 @@ def log_and_continue(exn):
339343
340344
def is_valid_sample(sample):
    """Return True when *sample* should be kept in the dataset stream.

    A sample is kept only if all of the following hold:

    * it contains both a ``"json"`` and an ``"audio"`` entry,
    * ``is_silence`` does not flag its audio, and
    * its JSON metadata does not carry a truthy ``"__reject__"`` value.

    Args:
        sample: Mapping for one sample; expected to hold ``"json"`` and
            ``"audio"`` entries.

    Returns:
        bool: ``True`` if the sample passes every check, ``False`` otherwise.
    """
    # The presence checks must run before any indexing: a sample missing
    # either key has to be reported as invalid, not raise KeyError.
    if "json" not in sample or "audio" not in sample:
        return False

    if is_silence(sample["audio"]):
        return False

    metadata = sample["json"]
    is_rejected = "__reject__" in metadata and metadata["__reject__"]
    return not is_rejected
344352
345353class S3DatasetConfig :
346354 def __init__ (
@@ -446,10 +454,11 @@ def wds_preprocess(self, sample):
446454 # Pad/crop and get the relative timestamp
447455 pad_crop = PadCrop_Normalized_T (
448456 self .sample_size , randomize = self .random_crop , sample_rate = self .sample_rate )
449- audio , t_start , t_end , seconds_start , seconds_total = pad_crop (
457+ audio , t_start , t_end , seconds_start , seconds_total , padding_mask = pad_crop (
450458 audio )
451459 sample ["json" ]["seconds_start" ] = seconds_start
452460 sample ["json" ]["seconds_total" ] = seconds_total
461+ sample ["json" ]["padding_mask" ] = padding_mask
453462 else :
454463 t_start , t_end = 0 , 1
455464
0 commit comments