@@ -250,7 +250,11 @@ def run(self):
250
250
if self .do_retry :
251
251
self .do_retry_transfer ()
252
252
else :
253
- self .do_transfer ()
253
+ try :
254
+ self .do_transfer ()
255
+ except NotOnTapeException as _ :
256
+ self .logger .notice ("Doing transfer re-try due to missing file on tape" )
257
+ self .do_retry_transfer ()
254
258
elif self .oper in [self .config .PURGE_OP , self .config .DELETE_OP ]:
255
259
self .archive_prepare ()
256
260
self .do_delete ((self .oper == self .config .DELETE_OP ))
@@ -437,7 +441,7 @@ def do_retry_transfer(self):
437
441
self .set_status ("verifying" )
438
442
check_ok , __ = self .archive .verify (False )
439
443
440
- # For PUT operations what that all the files are on tape
444
+ # For PUT operations, wait until all the files are on tape
441
445
if self .archive .d2t :
442
446
self .set_status ("wait_on_tape" )
443
447
self .wait_on_tape ()
@@ -1068,20 +1072,20 @@ def evict_disk_cache(self):
1068
1072
1069
1073
def wait_on_tape (self ):
1070
1074
""" Check and wait that all the files are on tape, which in our case
1071
- means checking the "m" bit. If file is not on tape then suspend the
1072
- current thread for a period between 1 and 10 minutes depending on the
1073
- index of the failed file.
1075
+ means checking the "m" bit. If a file is not on tape then suspend the
1076
+ current thread for a period of 5 to 60 seconds but abort if the file
1077
+ fails to be archived on tape after 24h
1074
1078
"""
1075
- min_timeout , max_timeout = 5 , 1
1079
+ max_timeout_per_entry = int (self .config .ARCHIVE_MAX_TIMEOUT )
1080
+ min_timeout , max_timeout = 5 , 60
1076
1081
1077
- while True :
1078
- indx = 0 # index of the first file not on tape
1079
- all_on_tape = True
1082
+ for fentry in self .archive .files ():
1083
+ start_ts = time .time ()
1084
+ __ , dst = self .archive .get_endpoints (fentry [1 ])
1085
+ url = client .URL (dst )
1086
+ file_on_tape = False
1080
1087
1081
- for fentry in self .archive .files ():
1082
- indx += 1
1083
- __ , dst = self .archive .get_endpoints (fentry [1 ])
1084
- url = client .URL (dst )
1088
+ while not file_on_tape :
1085
1089
st_stat , resp_stat = self .archive .fs_dst .stat (url .path )
1086
1090
1087
1091
if not st_stat .ok :
@@ -1092,21 +1096,24 @@ def wait_on_tape(self):
1092
1096
# Check file is on tape
1093
1097
if resp_stat .size != 0 and not (resp_stat .flags & StatInfoFlags .BACKUP_EXISTS ):
1094
1098
self .logger .debug ("File {0} is not yet on tape" .format (dst ))
1095
- all_on_tape = False
1096
- break
1097
-
1098
- if all_on_tape :
1099
- break
1100
- else :
1101
- # Set timeout value
1102
- ratio = indx / int (self .archive .header ['num_files' ])
1103
- timeout = int (max_timeout * (1 - ratio ))
1099
+ timeout = randrange (min_timeout , max_timeou )
1100
+ self .logger .info ("Going to sleep for {0} seconds" .format (timeout ))
1101
+ sleep (timeout )
1102
+
1103
+ if time .time () - start_ts > max_timeout_per_entry :
1104
+ self .logger .notice ("Entry not archived within the maximum timeout."
1105
+ " entry={0} archive_max_timeout={1}" .format (
1106
+ fentry [1 ], max_timeout_per_entry ))
1107
+ break
1108
+ else :
1109
+ file_on_tape = True
1110
+ else :
1111
+ file_on_tape = True
1104
1112
1105
- if timeout < min_timeout :
1106
- timeout = min_timeout
1113
+ if not file_on_tape :
1114
+ # Throw exception to re-try the failed transfer
1115
+ raise NotOnTapeException ()
1107
1116
1108
- self .logger .info ("Going to sleep for {0} seconds" .format (timeout ))
1109
- sleep (timeout )
1110
1117
1111
1118
def backup_prepare (self ):
1112
1119
""" Prepare requested backup operation.
0 commit comments