From eff772b6348af91428558c0a858b32bd9a75a907 Mon Sep 17 00:00:00 2001
From: Gertjan <gertjan.franken@kuleuven.be>
Date: Wed, 4 Dec 2024 12:20:54 +0000
Subject: [PATCH] Improve worker error reporting and allow more RAM.

By changing the way containers are started, we can now better report on
the cause of a failure.

RAM allowance is increased, because it sometimes caused a 137 (OOM)
which would kill the container.
---
 bci/distribution/worker_manager.py      | 39 ++++++++++++++++++-------
 bci/evaluations/evaluation_framework.py |  9 +++---
 bci/worker.py                           |  8 ++++-
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/bci/distribution/worker_manager.py b/bci/distribution/worker_manager.py
index e619401..3390a1c 100644
--- a/bci/distribution/worker_manager.py
+++ b/bci/distribution/worker_manager.py
@@ -42,6 +42,8 @@ def __run_container(self, params: WorkerParameters, blocking_wait=True) -> None:
         container_name = f'bh_worker_{container_id}'
 
         def start_container_thread():
+            if (host_pwd := os.getenv('HOST_PWD', None)) is None:
+                raise AttributeError('Could not find HOST_PWD environment var')
             try:
                 # Sometimes, it takes a while for Docker to remove the container
                 while True:
@@ -60,17 +62,20 @@ def start_container_thread():
                     for container in active_containers:
                         logger.info(f'Removing old container \'{container.attrs["Name"]}\' to start new one')
                         container.remove(force=True)
-                if (host_pwd := os.getenv('HOST_PWD', None)) is None:
-                    raise AttributeError('Could not find HOST_PWD environment var')
-                self.client.containers.run(
+            except docker.errors.APIError:
+                logger.error("Could not consult list of active containers", exc_info=True)
+
+            container = None
+            try:
+                container = self.client.containers.run(
                     f'bughog/worker:{Global.get_tag()}',
                     name=container_name,
                     hostname=container_name,
                     shm_size='2gb',
                     network='bh_net',
-                    mem_limit='1g',  # To prevent one container from consuming multiple gigs of memory (was the case for a Firefox evaluation)
-                    detach=False,
-                    remove=True,
+                    mem_limit='4g',  # To prevent one container from consuming multiple gigs of memory (was the case for a Firefox evaluation)
+                    mem_reservation='2g',
+                    detach=True,
                     labels=['bh_worker'],
                     command=[params.serialize()],
                     volumes=[
@@ -86,20 +91,34 @@ def start_container_thread():
                         '/dev/shm:/dev/shm',
                     ],
                 )
-                logger.debug(f"Container '{container_name}' finished experiments for '{params.state}'")
-                Clients.push_results_to_all()
+                result = container.wait()
+                if result["StatusCode"] != 0:
+                    logger.error(
+                        f"'{container_name}' exited unexpectedly with status code {result['StatusCode']}. "
+                        "Check the worker logs in ./logs/ for more information."
+                    )
+                else:
+                    logger.debug(f"Container '{container_name}' finished experiments for '{params.state}'")
+                    Clients.push_results_to_all()
+            except docker.errors.APIError:
+                logger.error("Received a docker error", exc_info=True)
             except docker.errors.ContainerError:
                 logger.error(
                     f"Could not run container '{container_name}' or container was unexpectedly removed", exc_info=True
                 )
+                if container is not None:
+                    container_info = container.attrs["State"]
+                    logger.error(f"'{container_name}' exited unexpectedly with {container_info}", exc_info=True)
             finally:
+                if container is not None:
+                    container.remove()
                 self.container_id_pool.put(container_id)
 
         thread = threading.Thread(target=start_container_thread)
         thread.start()
         logger.info(f"Container '{container_name}' started experiments for '{params.state}'")
-        # To avoid race-condition where more than max containers are started
-        time.sleep(3)
+        # Sleep to avoid all workers downloading browser binaries at once, clogging up all IO.
+        time.sleep(5)
 
     def get_nb_of_running_worker_containers(self):
         return len(self.get_runnning_containers())
diff --git a/bci/evaluations/evaluation_framework.py b/bci/evaluations/evaluation_framework.py
index d71de4e..86361c1 100644
--- a/bci/evaluations/evaluation_framework.py
+++ b/bci/evaluations/evaluation_framework.py
@@ -1,7 +1,6 @@
 import logging
 import os
 import re
-import traceback
 from abc import ABC, abstractmethod
 
 from bci.browser.configuration.browser import Browser
@@ -17,7 +16,7 @@ class EvaluationFramework(ABC):
     def __init__(self):
         self.should_stop = False
 
-    def evaluate(self, worker_params: WorkerParameters):
+    def evaluate(self, worker_params: WorkerParameters, is_worker=False):
         test_params = worker_params.create_test_params()
 
         if MongoDB().has_result(test_params):
@@ -42,8 +41,10 @@ def evaluate(self, worker_params: WorkerParameters):
             logger.info(f'Test finalized: {test_params}')
         except Exception as e:
             state.condition = StateCondition.FAILED
-            logger.error('An error occurred during evaluation', exc_info=True)
-            traceback.print_exc()
+            if is_worker:
+                raise e
+            else:
+                logger.error('An error occurred during evaluation', exc_info=True)
         finally:
             browser.post_test_cleanup()
 
diff --git a/bci/worker.py b/bci/worker.py
index a18928f..fe17129 100644
--- a/bci/worker.py
+++ b/bci/worker.py
@@ -22,7 +22,12 @@ def run(params: WorkerParameters):
     evaluation_framework = CustomEvaluationFramework()
     # browser_build, repo_state = get_browser_build_and_repo_state(params)
 
-    evaluation_framework.evaluate(params)
+    try:
+        evaluation_framework.evaluate(params, is_worker=True)
+    except Exception:
+        logger.fatal("An exception occurred during evaluation", exc_info=True)
+        logging.shutdown()
+        os._exit(1)
 
 
 if __name__ == '__main__':
@@ -35,4 +40,5 @@ def run(params: WorkerParameters):
     logger.info('Worker started')
     run(params)
     logger.info('Worker finished, exiting...')
+    logging.shutdown()
     os._exit(0)