Skip to content

Commit 7a2b065

Browse files
committed
ARCHIVE: Add retry functionality for archive operations that time out while
waiting for the files to be marked as on-tape. The default max timeout is set to 24 hours and can be changed by using the ARCHIVE_MAX_TIMEOUT parameter in the eosarchived configuration. Fixes EOS-6206
1 parent 216a408 commit 7a2b065

File tree

4 files changed

+45
-27
lines changed

4 files changed

+45
-27
lines changed

archive/eosarch/configuration.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(self):
4141
4242
Args:
4343
fn_conf (string): Path to the configuration file, which in normal
44-
conditions should be/etc/eosarchived.conf
44+
conditions should be /etc/eosarchived.conf
4545
"""
4646
try:
4747
LOG_DIR = os.environ["LOG_DIR"]
@@ -89,6 +89,7 @@ def __init__(self):
8989
self.__dict__['OPT_FORCE'] = 'force'
9090
self.__dict__['ARCH_FN'] = ".archive"
9191
self.__dict__['ARCH_INIT'] = ".archive.init"
92+
self.__dict__['ARCHIVE_MAX_TIMEOUT'] = '86400'
9293

9394
try:
9495
with open(archive_conf, 'r') as f:

archive/eosarch/exceptions.py

+5
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@ class CheckEntryException(Exception):
3333
""" Exception raised in case a verify entry operation fails.
3434
"""
3535
pass
36+
37+
class NotOnTapeException(Exception):
38+
""" Exception raised when a file is not on tape after the maximum
39+
configured timeout per entry
40+
"""

archive/eosarch/transfer.py

+33-26
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,11 @@ def run(self):
250250
if self.do_retry:
251251
self.do_retry_transfer()
252252
else:
253-
self.do_transfer()
253+
try:
254+
self.do_transfer()
255+
except NotOnTapeException as _:
256+
self.logger.notice("Doing transfer re-try due to missing file on tape")
257+
self.do_retry_transfer()
254258
elif self.oper in [self.config.PURGE_OP, self.config.DELETE_OP]:
255259
self.archive_prepare()
256260
self.do_delete((self.oper == self.config.DELETE_OP))
@@ -437,7 +441,7 @@ def do_retry_transfer(self):
437441
self.set_status("verifying")
438442
check_ok, __ = self.archive.verify(False)
439443

440-
# For PUT operations what that all the files are on tape
444+
# For PUT operations wait that all the files are on tape
441445
if self.archive.d2t:
442446
self.set_status("wait_on_tape")
443447
self.wait_on_tape()
@@ -1068,20 +1072,20 @@ def evict_disk_cache(self):
10681072

10691073
def wait_on_tape(self):
10701074
""" Check and wait that all the files are on tape, which in our case
1071-
means checking the "m" bit. If file is not on tape then suspend the
1072-
current thread for a period between 1 and 10 minutes depending on the
1073-
index of the failed file.
1075+
means checking the "m" bit. If a file is not on tape then suspend the
1076+
current thread for a period of 5 to 60 seconds but abort if the file
1077+
fails to be archived on tape after 24h
10741078
"""
1075-
min_timeout, max_timeout = 5, 1
1079+
max_timeout_per_entry = int(self.config.ARCHIVE_MAX_TIMEOUT)
1080+
min_timeout, max_timeout = 5, 60
10761081

1077-
while True:
1078-
indx = 0 # index of the first file not on tape
1079-
all_on_tape = True
1082+
for fentry in self.archive.files():
1083+
start_ts = time.time()
1084+
__, dst = self.archive.get_endpoints(fentry[1])
1085+
url = client.URL(dst)
1086+
file_on_tape = False
10801087

1081-
for fentry in self.archive.files():
1082-
indx += 1
1083-
__, dst = self.archive.get_endpoints(fentry[1])
1084-
url = client.URL(dst)
1088+
while not file_on_tape:
10851089
st_stat, resp_stat = self.archive.fs_dst.stat(url.path)
10861090

10871091
if not st_stat.ok:
@@ -1092,21 +1096,24 @@ def wait_on_tape(self):
10921096
# Check file is on tape
10931097
if resp_stat.size != 0 and not (resp_stat.flags & StatInfoFlags.BACKUP_EXISTS):
10941098
self.logger.debug("File {0} is not yet on tape".format(dst))
1095-
all_on_tape = False
1096-
break
1097-
1098-
if all_on_tape:
1099-
break
1100-
else:
1101-
# Set timeout value
1102-
ratio = indx / int(self.archive.header['num_files'])
1103-
timeout = int(max_timeout * (1 - ratio))
1099+
timeout = randrange(min_timeout, max_timeout)
1100+
self.logger.info("Going to sleep for {0} seconds".format(timeout))
1101+
sleep(timeout)
1102+
1103+
if time.time() - start_ts > max_timeout_per_entry:
1104+
self.logger.notice("Entry not archived within the maximum timeout."
1105+
" entry={0} archive_max_timeout={1}".format(
1106+
fentry[1], max_timeout_per_entry))
1107+
break
1108+
else:
1109+
file_on_tape = True
1110+
else:
1111+
file_on_tape = True
11041112

1105-
if timeout < min_timeout:
1106-
timeout = min_timeout
1113+
if not file_on_tape:
1114+
# Throw exception to re-try the failed transfer
1115+
raise NotOnTapeException()
11071116

1108-
self.logger.info("Going to sleep for {0} seconds".format(timeout))
1109-
sleep(timeout)
11101117

11111118
def backup_prepare(self):
11121119
""" Prepare requested backup operation.

archive/eosarchived.conf

+5
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,8 @@ POLL_TIMEOUT=30000
4747

4848
# Join timeout in seconds for running threads inside a process
4949
JOIN_TIMEOUT=1
50+
51+
# Maximum timeout value in seconds for a file entry to be migrated to tape.
52+
# When this timeout expires the transfer process is retried. By default this
53+
# is 86400 seconds (1 day).
54+
#ARCHIVE_MAX_TIMEOUT=86400

0 commit comments

Comments
 (0)