From 5dd04862563f8bbc76874417d83573d3ffd751b3 Mon Sep 17 00:00:00 2001
From: Dan Scales
Date: Tue, 1 Oct 2024 14:58:52 -0700
Subject: [PATCH 1/2] Get rid of error message after COG jobs successfully complete.

Even when the integrated alerts COG jobs successfully complete, there is
a data-updates error message. It turns out this is because we didn't
increase the integrated_alerts job timeout when we added the very long
extra COG steps. So, I just added an extra 7 hours to the timeout for
the IntegratedAlertsSync job.
---
 src/Dockerfile            | 2 +-
 src/datapump/sync/sync.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index c332b78..5ef2992 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -15,7 +15,7 @@ RUN pip install . -t python
 # to change the hash of the file and get TF to realize it needs to be
 # redeployed. Ticket for a better solution:
 # https://gfw.atlassian.net/browse/GTC-1250
-# change 8
+# change 9
 
 RUN yum install -y zip geos-devel
 
diff --git a/src/datapump/sync/sync.py b/src/datapump/sync/sync.py
index b58107f..389330a 100644
--- a/src/datapump/sync/sync.py
+++ b/src/datapump/sync/sync.py
@@ -224,7 +224,11 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]:
                     band_count=1,
                     union_bands=True,
                     compute_stats=False,
-                    timeout_sec=21600,
+                    # This timeout is about 5-6 hours for the date_conf and intensity
+                    # raster jobs (run in series), and then another 6-7 hours for the
+                    # default and intensity COG jobs (run in parallel). The
+                    # generation of the default COG takes the longest.
+                    timeout_sec=13 * 3600,
                 ),
                 tile_cache_parameters=RasterTileCacheParameters(
                     max_zoom=14,

From d9e2ab95f76ac823047abd8dde38b9e52561c0d0 Mon Sep 17 00:00:00 2001
From: Dan Scales
Date: Wed, 2 Oct 2024 15:14:24 -0700
Subject: [PATCH 2/2] Put in retries for ThrottlingException for describe_cluster() call

We seem to get a ThrottlingException on describe_cluster() fairly often
(it can happen several times every couple of nights). This causes the
entire job process to fail (or at least not finish the post-processing).
So, I'm adding a few retries if we get a ClientError that is a
ThrottlingException on the describe_cluster() call in check_analysis().
---
 src/datapump/jobs/geotrellis.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/datapump/jobs/geotrellis.py b/src/datapump/jobs/geotrellis.py
index e33844d..6b9a9d6 100644
--- a/src/datapump/jobs/geotrellis.py
+++ b/src/datapump/jobs/geotrellis.py
@@ -23,6 +23,8 @@
     Partition,
     Partitions,
 )
+from botocore.exceptions import ClientError
+import time
 
 WORKER_INSTANCE_TYPES = ["r5.2xlarge", "r4.2xlarge"]  # "r6g.2xlarge"
 MASTER_INSTANCE_TYPE = "r5.2xlarge"
@@ -145,9 +147,22 @@ def cancel_analysis(self):
         client.terminate_job_flows(JobFlowIds=[self.emr_job_id])
 
     def check_analysis(self) -> JobStatus:
-        cluster_description = get_emr_client().describe_cluster(
-            ClusterId=self.emr_job_id
-        )
+        num_retries = 3
+        for i in range(num_retries):
+            try:
+                cluster_description = get_emr_client().describe_cluster(
+                    ClusterId=self.emr_job_id
+                )
+                break
+            except ClientError as e:
+                # Retry (up to num_retries attempts in total) on a throttling exception
+                if i + 1 < num_retries and e.response['Error']['Code'] == 'ThrottlingException':
+                    print("Throttling exception occurred. Retrying in 30 seconds...")
+                    time.sleep(30)
+                    continue
+                else:
+                    raise
+
         status = cluster_description["Cluster"]["Status"]
 
         LOGGER.info(
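
A note on the arithmetic behind the first patch: the previous 21600-second (6-hour)
timeout covered only the date_conf and intensity raster jobs, and the COG steps add
roughly another 7 hours, which is where 13 * 3600 = 46800 seconds comes from. A minimal
sketch of that breakdown with named constants follows; the constant names are
illustrative only and do not appear in the patch.

    # Illustrative breakdown of the 13-hour timeout_sec value used in the patch.
    # The constant names here are hypothetical; sync.py just writes 13 * 3600.
    RASTER_STEPS_SEC = 6 * 3600  # date_conf + intensity rasters, run in series (~5-6 h)
    COG_STEPS_SEC = 7 * 3600     # default + intensity COGs, run in parallel (~6-7 h)

    INTEGRATED_ALERTS_TIMEOUT_SEC = RASTER_STEPS_SEC + COG_STEPS_SEC
    assert INTEGRATED_ALERTS_TIMEOUT_SEC == 13 * 3600 == 46800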
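
The second patch retries throttling explicitly inside check_analysis(). An alternative,
not taken here, is to let botocore retry throttled calls itself when the EMR client is
constructed; whether that is practical depends on how get_emr_client() builds its
client, which this patch series does not show. A minimal sketch under that assumption:

    # Sketch of an alternative: rely on botocore's "standard" retry mode, which
    # backs off and retries throttling errors (including ThrottlingException)
    # up to max_attempts total attempts. This is not part of the patch; it assumes
    # the client factory could be changed to build a plain boto3 EMR client.
    import boto3
    from botocore.config import Config

    EMR_RETRY_CONFIG = Config(retries={"max_attempts": 5, "mode": "standard"})


    def get_emr_client_with_retries():
        # Hypothetical variant of the existing get_emr_client() helper.
        return boto3.client("emr", config=EMR_RETRY_CONFIG)

The explicit loop in the patch keeps the 30-second backoff visible at the call site and
avoids changing a shared client factory, which is a reasonable trade-off for a narrowly
scoped fix.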