From 5dd04862563f8bbc76874417d83573d3ffd751b3 Mon Sep 17 00:00:00 2001
From: Dan Scales
Date: Tue, 1 Oct 2024 14:58:52 -0700
Subject: [PATCH 1/2] Get rid of error message after COG jobs successfully complete.

Even when the integrated alerts COG jobs successfully complete, there is
a data-updates error message. It turns out this is because we didn't
increase the integrated_alerts job timeout when we added the very long
extra COG steps. So, I just added an extra 7 hours to the timeout for
the IntegratedAlertsSync job.
---
 src/Dockerfile            | 2 +-
 src/datapump/sync/sync.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index c332b78..5ef2992 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -15,7 +15,7 @@ RUN pip install . -t python
 # to change the hash of the file and get TF to realize it needs to be
 # redeployed. Ticket for a better solution:
 # https://gfw.atlassian.net/browse/GTC-1250
-# change 8
+# change 9
 
 RUN yum install -y zip geos-devel
 
diff --git a/src/datapump/sync/sync.py b/src/datapump/sync/sync.py
index b58107f..389330a 100644
--- a/src/datapump/sync/sync.py
+++ b/src/datapump/sync/sync.py
@@ -224,7 +224,11 @@ def build_jobs(self, config: DatapumpConfig) -> List[Job]:
                     band_count=1,
                     union_bands=True,
                     compute_stats=False,
-                    timeout_sec=21600,
+                    # This timeout is about 5-6 hours for the date_conf and intensity
+                    # raster jobs (run in series), and then another 6-7 hours for the
+                    # default and intensity COG jobs (run in parallel). The
+                    # generation of the default COG takes the longest.
+                    timeout_sec=13 * 3600,
                 ),
                 tile_cache_parameters=RasterTileCacheParameters(
                     max_zoom=14,

From d9e2ab95f76ac823047abd8dde38b9e52561c0d0 Mon Sep 17 00:00:00 2001
From: Dan Scales
Date: Wed, 2 Oct 2024 15:14:24 -0700
Subject: [PATCH 2/2] Put in retries for ThrottlingException for describe_cluster() call

We seem to get a ThrottlingException on describe_cluster() fairly often
(it can happen several times every couple of nights). This causes the
entire job process to fail (or at least not finish the post-processing).
So, I'm adding a few retries if we get a ClientError that is a
ThrottlingException on the describe_cluster() call in check_analysis().
---
 src/datapump/jobs/geotrellis.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/datapump/jobs/geotrellis.py b/src/datapump/jobs/geotrellis.py
index e33844d..6b9a9d6 100644
--- a/src/datapump/jobs/geotrellis.py
+++ b/src/datapump/jobs/geotrellis.py
@@ -23,6 +23,8 @@
     Partition,
     Partitions,
 )
+from botocore.exceptions import ClientError
+import time
 
 WORKER_INSTANCE_TYPES = ["r5.2xlarge", "r4.2xlarge"]  # "r6g.2xlarge"
 MASTER_INSTANCE_TYPE = "r5.2xlarge"
@@ -145,9 +147,22 @@ def cancel_analysis(self):
         client.terminate_job_flows(JobFlowIds=[self.emr_job_id])
 
     def check_analysis(self) -> JobStatus:
-        cluster_description = get_emr_client().describe_cluster(
-            ClusterId=self.emr_job_id
-        )
+        num_retries = 3
+        for i in range(num_retries):
+            try:
+                cluster_description = get_emr_client().describe_cluster(
+                    ClusterId=self.emr_job_id
+                )
+                break
+            except ClientError as e:
+                # Retry (up to num_retries attempts in total) on a throttling exception
+                if i + 1 < num_retries and e.response['Error']['Code'] == 'ThrottlingException':
+                    print("Throttling exception occurred. Retrying in 30 seconds...")
+                    time.sleep(30)
+                    continue
+                else:
+                    raise
+
         status = cluster_description["Cluster"]["Status"]
 
         LOGGER.info(
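
A note on the arithmetic behind the first patch: the previous 21600-second (6-hour)
timeout covered only the date_conf and intensity raster jobs, and the COG steps add
roughly another 7 hours, which is where 13 * 3600 = 46800 seconds comes from. A minimal
sketch of that breakdown with named constants follows; the constant names are
illustrative only and do not appear in the patch.

    # Illustrative breakdown of the 13-hour timeout_sec value used in the patch.
    # The constant names here are hypothetical; sync.py just writes 13 * 3600.
    RASTER_STEPS_SEC = 6 * 3600  # date_conf + intensity rasters, run in series (~5-6 h)
    COG_STEPS_SEC = 7 * 3600     # default + intensity COGs, run in parallel (~6-7 h)

    INTEGRATED_ALERTS_TIMEOUT_SEC = RASTER_STEPS_SEC + COG_STEPS_SEC
    assert INTEGRATED_ALERTS_TIMEOUT_SEC == 13 * 3600 == 46800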
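
The second patch retries throttling explicitly inside check_analysis(). An alternative,
not taken here, is to let botocore retry throttled calls itself when the EMR client is
constructed; whether that is practical depends on how get_emr_client() builds its
client, which this patch series does not show. A minimal sketch under that assumption:

    # Sketch of an alternative: rely on botocore's "standard" retry mode, which
    # backs off and retries throttling errors (including ThrottlingException)
    # up to max_attempts total attempts. This is not part of the patch; it assumes
    # the client factory could be changed to build a plain boto3 EMR client.
    import boto3
    from botocore.config import Config

    EMR_RETRY_CONFIG = Config(retries={"max_attempts": 5, "mode": "standard"})


    def get_emr_client_with_retries():
        # Hypothetical variant of the existing get_emr_client() helper.
        return boto3.client("emr", config=EMR_RETRY_CONFIG)

The explicit loop in the patch keeps the 30-second backoff visible at the call site and
avoids changing a shared client factory, which is a reasonable trade-off for a narrowly
scoped fix.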