import openfl.callbacks as callbacks_module
from openfl.component.aggregator.straggler_handling import CutoffPolicy, StragglerPolicy
- from openfl.databases import TensorDB
+ from openfl.databases import PersistentTensorDB, TensorDB
from openfl.interface.aggregation_functions import WeightedAverage
from openfl.pipelines import NoCompressionPipeline, TensorCodec
from openfl.protocols import base_pb2, utils
+ from openfl.protocols.base_pb2 import NamedTensor
from openfl.utilities import TaskResultKey, TensorKey, change_tags

logger = logging.getLogger(__name__)
@@ -82,6 +83,8 @@ def __init__(
log_memory_usage=False,
write_logs=False,
callbacks: Optional[List] = None,
+ persist_checkpoint=True,
+ persistent_db_path=None,
):
"""Initializes the Aggregator.
@@ -109,6 +112,7 @@ def __init__(
callbacks: List of callbacks to be used during the experiment.
"""
self.round_number = 0
+ self.next_model_round_number = 0

if single_col_cert_common_name:
logger.warning(
@@ -135,6 +139,16 @@ def __init__(
self.quit_job_sent_to = []

self.tensor_db = TensorDB()
+ if persist_checkpoint:
+ persistent_db_path = persistent_db_path or "tensor.db"
+ logger.info(
+ "Persistent checkpoint is enabled, setting persistent db at path %s",
+ persistent_db_path,
+ )
+ self.persistent_db = PersistentTensorDB(persistent_db_path)
+ else:
+ logger.info("Persistent checkpoint is disabled")
+ self.persistent_db = None
# FIXME: I think next line generates an error on the second round
# if it is set to 1 for the aggregator.
self.db_store_rounds = db_store_rounds
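For readers unfamiliar with the new dependency: `PersistentTensorDB` gives the aggregator a tensor store that survives a process restart, in contrast to the in-memory `TensorDB`. The sketch below only illustrates that idea under assumed names and schema (SQLite plus pickled keys and arrays); it is not the actual `openfl.databases.PersistentTensorDB` implementation.

```python
# Hypothetical illustration of a restart-surviving tensor store; the real
# PersistentTensorDB in openfl.databases has its own schema and API.
import pickle
import sqlite3


class TinyPersistentStore:
    def __init__(self, path="tensor.db"):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS tensors (key BLOB PRIMARY KEY, value BLOB)"
        )

    def save_tensor(self, tensor_key, nparray):
        # Pickle the TensorKey tuple and the numpy array so both can be reloaded later
        with self.conn:  # commits on success, rolls back on error
            self.conn.execute(
                "INSERT OR REPLACE INTO tensors (key, value) VALUES (?, ?)",
                (pickle.dumps(tensor_key), pickle.dumps(nparray)),
            )

    def load_tensors(self):
        # Returns {TensorKey: nparray}, mirroring what the recovery path expects
        rows = self.conn.execute("SELECT key, value FROM tensors").fetchall()
        return {pickle.loads(k): pickle.loads(v) for k, v in rows}
```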
@@ -152,8 +166,25 @@ def __init__(
# TODO: Remove. Used in deprecated interactive and native APIs
self.best_tensor_dict: dict = {}
self.last_tensor_dict: dict = {}
+ # these enable getting all tensors for a task
+ self.collaborator_tasks_results = {}  # {TaskResultKey: list of TensorKeys}
+ self.collaborator_task_weight = {}  # {TaskResultKey: data_size}

- if initial_tensor_dict:
+ # maintain a list of collaborators that have completed their tasks and
+ # reported results in a given round
+ self.collaborators_done = []
+ # Initialize a lock for thread safety
+ self.lock = Lock()
+ self.use_delta_updates = use_delta_updates
+
+ self.model = None  # Initialize the model attribute to None
+ if self.persistent_db and self._recover():
+ logger.info("Recovered the state of the aggregator")
+
+ # The model is built by recovery if at least one round has finished
+ if self.model:
+ logger.info("Model was loaded by recovery")
+ elif initial_tensor_dict:
self._load_initial_tensors_from_dict(initial_tensor_dict)
self.model = utils.construct_model_proto(
tensor_dict=initial_tensor_dict,
@@ -166,20 +197,6 @@ def __init__(

self.collaborator_tensor_results = {}  # {TensorKey: nparray}}

- # these enable getting all tensors for a task
- self.collaborator_tasks_results = {}  # {TaskResultKey: list of TensorKeys}
-
- self.collaborator_task_weight = {}  # {TaskResultKey: data_size}
-
- # maintain a list of collaborators that have completed task and
- # reported results in a given round
- self.collaborators_done = []
-
- # Initialize a lock for thread safety
- self.lock = Lock()
-
- self.use_delta_updates = use_delta_updates
-
# Callbacks
self.callbacks = callbacks_module.CallbackList(
callbacks,
@@ -193,6 +210,79 @@ def __init__(
self.callbacks.on_experiment_begin()
self.callbacks.on_round_begin(self.round_number)

+ def _recover(self):
+ """Restores the aggregator state to what it was prior to a restart."""
+ recovered = False
+ # load tensors from the persistent DB
+ tensor_key_dict = self.persistent_db.load_tensors(
+ self.persistent_db.get_tensors_table_name()
+ )
+ if len(tensor_key_dict) > 0:
+ logger.info(f"Recovering {len(tensor_key_dict)} model tensors")
+ recovered = True
+ self.tensor_db.cache_tensor(tensor_key_dict)
+ committed_round_number, self.best_model_score = (
+ self.persistent_db.get_round_and_best_score()
+ )
+ logger.info("Recovery - Setting model proto")
+ to_proto_tensor_dict = {}
+ for tk in tensor_key_dict:
+ tk_name, _, _, _, _ = tk
+ to_proto_tensor_dict[tk_name] = tensor_key_dict[tk]
+ self.model = utils.construct_model_proto(
+ to_proto_tensor_dict, committed_round_number, self.compression_pipeline
+ )
+ # round number is the current round, which is still in progress,
+ # i.e. committed_round_number + 1
+ self.round_number = committed_round_number + 1
+ logger.info(
+ "Recovery - loaded round number %s and best score %s",
+ self.round_number,
+ self.best_model_score,
+ )
+
+ next_round_tensor_key_dict = self.persistent_db.load_tensors(
+ self.persistent_db.get_next_round_tensors_table_name()
+ )
+ if len(next_round_tensor_key_dict) > 0:
+ logger.info(f"Recovering {len(next_round_tensor_key_dict)} next round model tensors")
+ recovered = True
+ self.tensor_db.cache_tensor(next_round_tensor_key_dict)
+
+ logger.debug("Recovery - this is the tensor_db after recovery: %s", self.tensor_db)
+
+ if self.persistent_db.is_task_table_empty():
+ logger.debug("Task table is empty")
+ return recovered
+
+ logger.info("Recovery - Replaying saved task results")
+ task_id = 1
+ while True:
+ task_result = self.persistent_db.get_task_result_by_id(task_id)
+ if not task_result:
+ break
+ recovered = True
+ collaborator_name = task_result["collaborator_name"]
+ round_number = task_result["round_number"]
+ task_name = task_result["task_name"]
+ data_size = task_result["data_size"]
+ serialized_tensors = task_result["named_tensors"]
+ named_tensors = [
+ NamedTensor.FromString(serialized_tensor)
+ for serialized_tensor in serialized_tensors
+ ]
+ logger.info(
+ "Recovery - Replaying task results %s %s %s",
+ collaborator_name,
+ round_number,
+ task_name,
+ )
+ self.process_task_results(
+ collaborator_name, round_number, task_name, data_size, named_tensors
+ )
+ task_id += 1
+ return recovered
+
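Replay works because each collaborator's `NamedTensor` protos are stored as raw bytes and rebuilt unchanged on restart. A minimal sketch of that round-trip, using only the standard protobuf `SerializeToString`/`FromString` calls the diff relies on (the helper names here are illustrative, not part of the PR):

```python
# Illustrative helpers: protobuf messages serialize to bytes, which a
# persistent store can keep as BLOBs and hand back verbatim on recovery.
from openfl.protocols.base_pb2 import NamedTensor


def serialize_for_storage(named_tensors):
    # One bytes object per NamedTensor, suitable for persisting
    return [tensor.SerializeToString() for tensor in named_tensors]


def deserialize_from_storage(serialized_tensors):
    # FromString reconstructs an identical NamedTensor from the stored bytes
    return [NamedTensor.FromString(blob) for blob in serialized_tensors]
```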
def _load_initial_tensors(self):
"""Load all of the tensors required to begin federated learning.
@@ -253,9 +343,12 @@ def _save_model(self, round_number, file_path):
for k, v in og_tensor_dict.items()
]
tensor_dict = {}
+ tensor_tuple_dict = {}
for tk in tensor_keys:
tk_name, _, _, _, _ = tk
- tensor_dict[tk_name] = self.tensor_db.get_tensor_from_cache(tk)
+ tensor_value = self.tensor_db.get_tensor_from_cache(tk)
+ tensor_dict[tk_name] = tensor_value
+ tensor_tuple_dict[tk] = tensor_value
if tensor_dict[tk_name] is None:
logger.info(
"Cannot save model for round %s. Continuing...",
@@ -265,6 +358,19 @@ def _save_model(self, round_number, file_path):
if file_path == self.best_state_path:
self.best_tensor_dict = tensor_dict
if file_path == self.last_state_path:
+ # Transaction to persist/delete all data needed to increment the round
+ if self.persistent_db:
+ if self.next_model_round_number > 0:
+ next_round_tensors = self.tensor_db.get_tensors_by_round_and_tags(
+ self.next_model_round_number, ("model",)
+ )
+ self.persistent_db.finalize_round(
+ tensor_tuple_dict, next_round_tensors, self.round_number, self.best_model_score
+ )
+ logger.info(
+ "Persisted model and cleaned task results for round %s",
+ round_number,
+ )
self.last_tensor_dict = tensor_dict
self.model = utils.construct_model_proto(
tensor_dict, round_number, self.compression_pipeline
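`finalize_round` is the crash-consistency point: the committed model tensors, next-round tensors, round number, best score, and the purge of the replayed task log have to land together or not at all. A minimal sketch of that all-or-nothing behaviour, assuming a SQLite backend with hypothetical table and column names (the real method defines its own schema):

```python
# Hypothetical sketch of an atomic round commit, not the actual finalize_round.
import sqlite3


def finalize_round_sketch(conn: sqlite3.Connection, round_number: int, best_score: float):
    # The connection context manager commits on success and rolls back on any
    # exception, so a crash mid-way leaves the previous round's state intact.
    with conn:
        conn.execute("DELETE FROM task_results")  # replay log no longer needed
        conn.execute(
            "INSERT OR REPLACE INTO key_value (key, value) VALUES ('round_number', ?)",
            (round_number,),
        )
        conn.execute(
            "INSERT OR REPLACE INTO key_value (key, value) VALUES ('best_score', ?)",
            (best_score,),
        )
```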
@@ -364,7 +470,7 @@ def get_tasks(self, collaborator_name):
# if no tasks, tell the collaborator to sleep
if len(tasks) == 0:
tasks = None
- sleep_time = self._get_sleep_time()
+ sleep_time = Aggregator._get_sleep_time()

return tasks, self.round_number, sleep_time, time_to_quit

@@ -394,7 +500,7 @@ def get_tasks(self, collaborator_name):
# been completed
if len(tasks) == 0:
tasks = None
- sleep_time = self._get_sleep_time()
+ sleep_time = Aggregator._get_sleep_time()

return tasks, self.round_number, sleep_time, time_to_quit
@@ -604,6 +710,31 @@ def send_local_task_results(
Returns:
None
"""
+ # Save the task results and their metadata for recovery
+ serialized_tensors = [tensor.SerializeToString() for tensor in named_tensors]
+ if self.persistent_db:
+ self.persistent_db.save_task_results(
+ collaborator_name, round_number, task_name, data_size, serialized_tensors
+ )
+ logger.debug(
+ f"Persisting task results {task_name} from {collaborator_name} round {round_number}"
+ )
+ logger.info(
+ f"Collaborator {collaborator_name} is sending task results "
+ f"for {task_name}, round {round_number}"
+ )
+ self.process_task_results(
+ collaborator_name, round_number, task_name, data_size, named_tensors
+ )
+
+ def process_task_results(
+ self,
+ collaborator_name,
+ round_number,
+ task_name,
+ data_size,
+ named_tensors,
+ ):
if self._time_to_quit() or collaborator_name in self.stragglers:
logger.warning(
f"STRAGGLER: Collaborator {collaborator_name} is reporting results "
@@ -618,11 +749,6 @@ def send_local_task_results(
)
return

- logger.info(
- f"Collaborator {collaborator_name} is sending task results "
- f"for {task_name}, round {round_number}"
- )
-
task_key = TaskResultKey(task_name, collaborator_name, round_number)

# we mustn't have results already
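The replay in `_recover` leans on results being stored in arrival order under monotonically increasing ids, so fetching id 1, 2, 3, ... until a miss reproduces the original sequence. A minimal sketch of such an append-only task log with assumed table and column names (the real `save_task_results`/`get_task_result_by_id` live in `PersistentTensorDB` and also store the serialized tensors):

```python
# Hypothetical append-only replay log; illustrates the id-ordered storage the
# recovery loop assumes, not the actual PersistentTensorDB schema.
import sqlite3


def init_log(conn: sqlite3.Connection):
    conn.execute(
        "CREATE TABLE IF NOT EXISTS task_results ("
        "id INTEGER PRIMARY KEY AUTOINCREMENT, "
        "collaborator_name TEXT, round_number INTEGER, "
        "task_name TEXT, data_size INTEGER)"
    )


def append_task_result(conn, collaborator_name, round_number, task_name, data_size):
    # Each saved result gets the next id, preserving arrival order
    with conn:
        conn.execute(
            "INSERT INTO task_results "
            "(collaborator_name, round_number, task_name, data_size) VALUES (?, ?, ?, ?)",
            (collaborator_name, round_number, task_name, data_size),
        )


def replay(conn):
    # Yield results in insertion order, as _recover does by walking task_id upward
    for row in conn.execute("SELECT * FROM task_results ORDER BY id"):
        yield row
```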
@@ -862,7 +988,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result
new_model_report,
("model",),
)
-
+ self.next_model_round_number = new_model_round_number
# Finally, cache the updated model tensor
self.tensor_db.cache_tensor({final_model_tk: new_model_nparray})