From debc26a4f62b2a148455d4d427d7d9271ae6ccaa Mon Sep 17 00:00:00 2001
From: ppca
Date: Mon, 20 Nov 2023 09:06:06 -0800
Subject: [PATCH] fix(state-parts-dump-check): bound process_part by timeout (#10215)

Problem:
Occasionally the monitoring gets stuck. A few parts have process_part() called for them, but the call never finishes: it is not retried, does not succeed, and does not fail. My guess is that get_part().await somehow gets stuck waiting forever, and since process_part() had no timeout bound, the tokio task that runs it could hang forever, leaving the monitoring app stuck.
I reproduced this by making the task for part_id = 100 sleep for 1000000 seconds: the program finished all other parts and then hung on that one, which matches what I see on the monitoring node.

Solution:
Bound process_part by a timeout and retry when the timeout elapses. That way the task retries itself no matter why it hung.
Note: the timeout is set to 10 minutes because it starts counting as soon as the task is spawned and measures real (wall-clock) time rather than the task's CPU time, i.e. time keeps elapsing even while the task is not running. So we need timeout * MAX_RETRIES to exceed the time it takes to process all parts.
---
 tools/state-parts-dump-check/src/cli.rs | 38 ++++++++++++++++---------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/tools/state-parts-dump-check/src/cli.rs b/tools/state-parts-dump-check/src/cli.rs
index d276b5b78d2..13884081c82 100644
--- a/tools/state-parts-dump-check/src/cli.rs
+++ b/tools/state-parts-dump-check/src/cli.rs
@@ -19,8 +19,10 @@
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::thread::sleep;
 use std::time::{Duration, Instant};
+use tokio::time::timeout;
 
-const MAX_RETRIES: u32 = 3;
+// This isn't set to a huge number because if we keep retrying for too long, we will miss the next epoch's check
+const MAX_RETRIES: u32 = 5;
 #[derive(clap::Parser)]
 pub struct StatePartsDumpCheckCommand {
@@ -566,28 +568,36 @@ async fn process_part_with_3_retries(
         let chain_id = chain_id.clone();
         let epoch_id = epoch_id.clone();
         let external = external.clone();
-        res = process_part(
-            part_id,
-            chain_id,
-            epoch_id,
-            epoch_height,
-            shard_id,
-            state_root,
-            num_parts,
-            external,
+        // timeout is needed to deal with edge cases where process_part awaits forever, i.e. the get_part().await somehow waits forever
+        // this is set to a long duration because the timer for each task, i.e. process_part, starts when the task is started, i.e. tokio::spawn is called,
+        // and counts actual time instead of CPU time.
+        // The bottom line is this timeout * MAX_RETRIES > time it takes for all parts to be processed, which is typically < 30 mins on monitoring nodes
+        let timeout_duration = tokio::time::Duration::from_secs(600);
+        res = timeout(
+            timeout_duration,
+            process_part(
+                part_id,
+                chain_id,
+                epoch_id,
+                epoch_height,
+                shard_id,
+                state_root,
+                num_parts,
+                external,
+            ),
         )
         .await;
         match res {
-            Ok(_) => {
+            Ok(Ok(_)) => {
                 tracing::info!(shard_id, epoch_height, part_id, "process_part success.",);
                 break;
             }
-            Err(_) if retries < MAX_RETRIES => {
+            _ if retries < MAX_RETRIES => {
                 tracing::info!(shard_id, epoch_height, part_id, "process_part failed. Will retry.",);
                 retries += 1;
                 tokio::time::sleep(Duration::from_secs(5)).await;
             }
-            Err(_) => {
+            _ => {
                 tracing::info!(
                     shard_id,
                     epoch_height,
@@ -598,7 +608,7 @@
             }
         }
     }
-    res
+    res?
 }
 
 async fn process_part(
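
For reference, the retry-with-timeout pattern this patch applies looks roughly like the sketch below when pulled out in isolation. It is illustrative only, not the exact nearcore code: `fetch_part` and `fetch_part_with_retries` are hypothetical stand-ins for `process_part` and `process_part_with_3_retries`, and the constants mirror the values chosen in the patch (600 s per attempt, 5 retries). It assumes the `tokio` crate (with the `time` feature) and `anyhow`.

```rust
use std::time::Duration;
use tokio::time::timeout;

// Stand-in for process_part(): any future that might hang indefinitely.
async fn fetch_part(part_id: u64) -> anyhow::Result<()> {
    let _ = part_id; // ... download and validate the part ...
    Ok(())
}

// Bound every attempt with a wall-clock timeout so a hung await cannot
// stall the loop; a timed-out attempt counts as a failure and triggers a retry.
async fn fetch_part_with_retries(part_id: u64) -> anyhow::Result<()> {
    const MAX_RETRIES: u32 = 5;
    let per_attempt = Duration::from_secs(600);
    let mut retries = 0;
    loop {
        // `timeout` yields Err(Elapsed) if the inner future misses the deadline,
        // otherwise Ok(...) wrapping the inner future's own Result.
        match timeout(per_attempt, fetch_part(part_id)).await {
            Ok(Ok(())) => return Ok(()),
            // Inner errors and timeouts alike fall into the retry branch.
            _ if retries < MAX_RETRIES => {
                retries += 1;
                tokio::time::sleep(Duration::from_secs(5)).await;
            }
            Ok(Err(e)) => return Err(e),
            Err(elapsed) => return Err(anyhow::anyhow!("part {part_id} timed out: {elapsed}")),
        }
    }
}
```

As the commit message explains, the per-attempt timeout has to be generous: it measures elapsed real time rather than CPU time, so timeout * MAX_RETRIES must comfortably exceed the time needed to process all parts.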