diff --git a/.github/actions/tr_post_test_run/action.yml b/.github/actions/tr_post_test_run/action.yml index 04444af4d9..f5c8849283 100644 --- a/.github/actions/tr_post_test_run/action.yml +++ b/.github/actions/tr_post_test_run/action.yml @@ -16,7 +16,7 @@ runs: if: ${{ always() }} run: | export PYTHONPATH="$PYTHONPATH:." - python tests/end_to_end/utils/summary_helper.py + python tests/end_to_end/utils/summary_helper.py --func_name "print_task_runner_score" echo "Test summary printed" shell: bash diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml new file mode 100644 index 0000000000..646bebbc61 --- /dev/null +++ b/.github/workflows/bandit.yml @@ -0,0 +1,51 @@ +name: Bandit Code Scan + +on: + push: + branches: + - develop + - v1.7.x + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + +jobs: + bandit_scan: + if: github.event.pull_request.draft == false + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: Bandit Scan + runs-on: ubuntu-22.04 + timeout-minutes: 15 + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set Filename Suffix Report Date and Time + run: | + echo "REPORT_DATE=$(date +'%d-%b-%Y_%H-%M-%S')" >> $GITHUB_ENV + + - name: Define SARIF Report Path + run: echo "SARIF_REPORT_PATH=${{ github.workspace }}/results.sarif" >> $GITHUB_ENV + + - name: Perform Bandit Analysis + uses: PyCQA/bandit-action@v1 + with: + configfile: 'DEFAULT' + profile: 'DEFAULT' + tests: 'DEFAULT' + skips: 'DEFAULT' + severity: 'DEFAULT' + confidence: 'DEFAULT' + exclude: '.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg' + baseline: 'DEFAULT' + ini: 'DEFAULT' + targets: '.' + + - name: Upload Bandit SARIF Report as Artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: "bandit-report-summary_${{ env.REPORT_DATE }}" + path: ${{ env.SARIF_REPORT_PATH }} diff --git a/.github/workflows/federated_runtime.yml b/.github/workflows/federated_runtime.yml new file mode 100644 index 0000000000..717b96e490 --- /dev/null +++ b/.github/workflows/federated_runtime.yml @@ -0,0 +1,66 @@ +#--------------------------------------------------------------------------- +# Workflow to run 301_MNIST_Watermarking notebook +# Authors - Noopur, Payal Chaurasiya +#--------------------------------------------------------------------------- +name: Federated Runtime 301 MNIST Watermarking + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + + workflow_dispatch: + +permissions: + contents: read + +jobs: + test_federated_runtime_301_watermarking_notebook: + if: github.event.pull_request.draft == false + runs-on: ubuntu-22.04 + timeout-minutes: 20 + steps: + - name: Checkout OpenFL repository + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 2 # needed for detecting changes + submodules: "true" + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Install dependencies # Without this step, fx command will not work + id: install_dependencies + run: | + python -m pip install --upgrade pip ipython ipykernel + pip install . 
+ pip install -r test-requirements.txt + + - name: Run Federated Runtime 301 MNIST Watermarking via pytest + id: run_tests + run: | + python -m pytest -s tests/end_to_end/test_suites/wf_federated_runtime_tests.py -k test_federated_runtime_301_watermarking + echo "Federated Runtime 301 MNIST Watermarking test run completed" + + - name: Print test summary + id: print_test_summary + if: ${{ always() }} + run: | + export PYTHONPATH="$PYTHONPATH:." + python tests/end_to_end/utils/summary_helper.py --func_name "print_federated_runtime_score" + echo "Test summary printed" + + - name: Tar files + if: ${{ always() }} # collect artifacts regardless of failures + run: | + tar -cvf notebook_301.tar --exclude="__pycache__" $HOME/results --ignore-failed-read + echo "TAR file created" + + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + if: ${{ always() }} # collect artifacts regardless of failures + with: + name: federated_runtime_301_watermarking_${{ github.run_id }} + path: notebook_301.tar diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 36d7fdc41e..afa5ea6a2f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -25,6 +25,6 @@ jobs: - name: Install linters run: | python -m pip install --upgrade pip - pip install -r linters-requirements.txt + pip install -r linters-requirements.txt - name: Lint with OpenFL-specific rules run: bash scripts/lint.sh diff --git a/.github/workflows/pki.yml b/.github/workflows/pki.yml index bf907273ed..f5e176fd6d 100644 --- a/.github/workflows/pki.yml +++ b/.github/workflows/pki.yml @@ -33,20 +33,3 @@ jobs: - name: Test PKI run: | python tests/github/pki_insecure_client.py - test_wrong_common_name: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install . 
- - name: Test PKI - run: | - python tests/github/pki_wrong_cn.py \ No newline at end of file diff --git a/.github/workflows/straggler-handling.yml b/.github/workflows/straggler-handling.yml index 450caf8e8a..64a2f07153 100644 --- a/.github/workflows/straggler-handling.yml +++ b/.github/workflows/straggler-handling.yml @@ -21,7 +21,7 @@ jobs: matrix: os: ['ubuntu-latest', 'windows-latest'] runs-on: ${{ matrix.os }} - timeout-minutes: 15 + timeout-minutes: 30 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/task_runner_basic_e2e.yml b/.github/workflows/task_runner_basic_e2e.yml index b50eedd526..4c4aaa12d7 100644 --- a/.github/workflows/task_runner_basic_e2e.yml +++ b/.github/workflows/task_runner_basic_e2e.yml @@ -31,7 +31,7 @@ jobs: test_with_tls: name: tr_tls runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 30 strategy: matrix: # There are open issues for some of the models, so excluding them for now: @@ -74,7 +74,7 @@ jobs: test_with_non_tls: name: tr_non_tls runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 30 strategy: matrix: # Testing non TLS scenario only for torch_cnn_mnist model and python 3.10 @@ -117,7 +117,7 @@ jobs: test_with_no_client_auth: name: tr_no_client_auth runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 30 strategy: matrix: # Testing non TLS scenario only for torch_cnn_mnist model and python 3.10 @@ -160,7 +160,7 @@ jobs: test_memory_logs: name: tr_tls_memory_logs runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 30 strategy: matrix: # Testing non TLS scenario only for torch_cnn_mnist model and python 3.10 diff --git a/.github/workflows/taskrunner.yml b/.github/workflows/taskrunner.yml index a9093be4c1..088ee60c64 100644 --- a/.github/workflows/taskrunner.yml +++ b/.github/workflows/taskrunner.yml @@ -17,29 +17,19 @@ env: jobs: build: if: github.event.pull_request.draft == false - strategy: - matrix: - os: ['ubuntu-latest', 'windows-latest'] - python-version: ["3.10", "3.11", "3.12"] - runs-on: ${{ matrix.os }} - timeout-minutes: 15 - + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version }} + python-version: "3.10" - name: Install dependencies ubuntu - if: matrix.os == 'ubuntu-latest' - run: | - python -m pip install --upgrade pip - pip install . - - name: Install dependencies windows - if: matrix.os == 'windows-latest' run: | python -m pip install --upgrade pip pip install . 
- - name: Test TaskRunner API + - name: Task Runner API run: | - python -m tests.github.test_hello_federation --template keras_cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model + python -m tests.github.test_hello_federation --template torch_cnn_mnist --fed_workspace aggregator --col1 collaborator1 --col2 collaborator2 --rounds-to-train 3 --save-model output_model \ No newline at end of file diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index c968e85f11..617c004751 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -13,6 +13,9 @@ env: jobs: pytest-coverage: # from pytest_coverage.yml + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] runs-on: ubuntu-latest timeout-minutes: 15 @@ -21,7 +24,7 @@ jobs: - name: Set up Python 3 uses: actions/setup-python@v3 with: - python-version: "3.10" + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/wf_functional_e2e.yml b/.github/workflows/wf_functional_e2e.yml index 923aa73bae..1831949299 100644 --- a/.github/workflows/wf_functional_e2e.yml +++ b/.github/workflows/wf_functional_e2e.yml @@ -29,9 +29,9 @@ env: NUM_COLLABORATORS: ${{ github.event.inputs.num_collaborators || '2' }} jobs: - test_wf_func: + test_wf_functional_local_runtime: if: github.event.pull_request.draft == false - name: wf_func + name: wf_functional_local_runtime runs-on: ubuntu-22.04 timeout-minutes: 15 strategy: @@ -74,7 +74,7 @@ jobs: if: ${{ always() }} run: | export PYTHONPATH="$PYTHONPATH:." - python tests/end_to_end/utils/summary_helper.py + python tests/end_to_end/utils/summary_helper.py --func_name "print_local_runtime_score" echo "Test summary printed" - name: Create Tar (exclude cert and data folders) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 341b93b7f1..5f3ffa1220 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -13,14 +13,18 @@ env: jobs: pytest-coverage: # from pytest_coverage.yml + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] runs-on: windows-latest timeout-minutes: 15 + steps: - uses: actions/checkout@v3 - name: Set up Python 3 uses: actions/setup-python@v3 with: - python-version: "3.10" + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/workflow_interface_101_mnist.yml b/.github/workflows/workflow_interface_101_mnist.yml index 57e1dae46e..e5146fc32e 100644 --- a/.github/workflows/workflow_interface_101_mnist.yml +++ b/.github/workflows/workflow_interface_101_mnist.yml @@ -14,10 +14,10 @@ permissions: contents: read jobs: - run_notebook: + wf_local_101_mnist: if: github.event.pull_request.draft == false runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 30 steps: - name: Checkout OpenFL repository uses: actions/checkout@v4.1.1 @@ -31,16 +31,17 @@ jobs: with: python-version: "3.10" - - name: Install Jupyter Lab Package - run: pip install jupyterlab + - name: Install Papermill Package + run: python -m pip install --upgrade pip ipython ipykernel papermill - name: Run Notebook run: | - jupyter nbconvert --execute --to notebook ./openfl-tutorials/experimental/workflow/101_MNIST.ipynb + cd openfl-tutorials/experimental/workflow + papermill 101_MNIST.ipynb 101_MNIST_output.ipynb --request-save-on-cell-execute --log-output --autosave-cell-every 30 echo "Notebook 
run completed"

      - name: Tar files
-       run: tar -cvf notebook.tar ./openfl-tutorials/experimental/workflow/101_MNIST.nbconvert.ipynb
+       run: tar -cvf notebook.tar ./openfl-tutorials/experimental/workflow/101_MNIST_output.ipynb

      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index eb9b3a0916..e2008b7ec5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,4 +27,10 @@ repos:
         - --in-place
         - --remove-unused-variables
         - --recursive
-        - --ignore-pass-statements
\ No newline at end of file
+        - --ignore-pass-statements
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.4
+    hooks:
+      - id: bandit
+        args: ["-c", "pre_commit.toml"]
+        additional_dependencies: ["bandit[toml]"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 5ed611dd72..275899ed8b 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,9 @@ OpenFL supports popular aggregation algorithms out-of-the-box, with more algorit
 | FedProx | [Li et al., 2020](https://arxiv.org/pdf/1812.06127.pdf) | yes | yes | - |
 | FedCurv | [Shoham et al., 2019](https://arxiv.org/pdf/1910.07796.pdf) | yes | - | - |

+### Enabling Bandit Precommit
+To set up the Bandit scan pre-commit hook in your local environment, follow this guide: [Setup Guide - Precommit](precommit-doc.md)
+
 ## Contributing
 We welcome contributions! Please refer to the [contributing guidelines](https://openfl.readthedocs.io/en/latest/contributing.html).

diff --git a/docs/about/features_index/fed_eval.rst b/docs/about/features_index/fed_eval.rst
index e35c0f5afa..3f8e6b8637 100644
--- a/docs/about/features_index/fed_eval.rst
+++ b/docs/about/features_index/fed_eval.rst
@@ -24,34 +24,486 @@ In general pipeline is as follows:
 Example Using the Task Runner API (Aggregator-based Workflow)
 --------------------------------------------------------------
-To demonstrate usage of the task runner API (aggregator-based workflow) for federated evaluation, consider the `Hello Federation example `_. This sample script creates a simple federation with two collaborator nodes and one aggregator node, and executes based on a user specified workspace template. We provide a ``torch_cnn_mnist_fed_eval`` template, which is a federated evaluation template adapted from ``torch_cnn_mnist``.
+The following steps demonstrate practical end-to-end usage of federated evaluation (FedEval).

-This script can be directly executed as follows:
+*N.B.*: We will use the ``torch_cnn_mnist`` plan for training and, with some minor changes, for evaluation as well.
+
+*Prerequisites*: Ensure that OpenFL version 1.7 is installed, or install the latest from source.
+
+With OpenFL 1.7, the ``fx aggregator start`` command accepts an optional ``--task_group`` argument which, as the help text suggests, selects the task group whose tasks the assigner dispatches to the collaborator(s) for execution; it defaults to 'learning'.
+
+.. code-block:: shell
+
+    Usage: fx aggregator start [OPTIONS]
+
+      Start the aggregator service.
+
+      Args: plan (str): Path to plan config file  authorized_cols (str): Path to authorized collaborators file
+      task_group (str): Selected task-group for assignment - defaults to 'learning'
+
+    Options:
+      -p, --plan PATH             Federated learning plan [plan/plan.yaml]
+      -c, --authorized_cols PATH  Authorized collaborator list [plan/cols.yaml]
+      --task_group TEXT           Selected task-group for assignment - defaults to learning
+      --help                      Show this message and exit.
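+
+For quick reference, this is how the evaluation run later in this guide launches the aggregator (a minimal sketch; the workspace it assumes is built in the steps below):
+
+.. code-block:: shell
+
+    fx aggregator start --task_group evaluation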
+1. **Setup**
+We will use the `torch_cnn_mnist` workspace for training.
+
+Let's first configure a workspace with all necessary certificates:

 .. code-block:: shell

-  $ python test_hello_federation.py --template torch_cnn_mnist_fed_eval
+    fx workspace create --prefix ./cnn_train_eval --template torch_cnn_mnist
+    cd cnn_train_eval
+    fx workspace certify
+    fx aggregator generate-cert-request
+    fx aggregator certify --silent
+
+A successful run will print both the FL plan details and the certificate generation steps to the console:
+
+.. code-block:: shell
+
+    INFO  Parsing Federated Learning Plan : SUCCESS :
+
+    settings:
+      best_state_path: save/best.pbuf
+      db_store_rounds: 2
+      init_state_path: save/init.pbuf
+      last_state_path: save/last.pbuf
+      rounds_to_train: 2
+      write_logs: false
+    template: openfl.component.aggregator.Aggregator
+    assigner:
+      settings:
+        task_groups:
+          - name: learning
+            percentage: 1.0
+            tasks:
+              - aggregated_model_validation
+              - train
+              - locally_tuned_model_validation
+      template: openfl.component.RandomGroupedAssigner
+    collaborator:
+      settings:
+        db_store_rounds: 1
+        delta_updates: false
+        opt_treatment: RESET
+      template: openfl.component.collaborator.Collaborator
+    compression_pipeline:
+      settings: {}
+      template: openfl.pipelines.NoCompressionPipeline
+    data_loader:
+      settings:
+        batch_size: 64
+        collaborator_count: 2
+      template: src.dataloader.PyTorchMNISTInMemory
+    network:
+      settings:
+        agg_addr: devvm###.com
+        agg_port: 55529
+        cert_folder: cert
+        client_reconnect_interval: 5
+        hash_salt: auto
+        require_client_auth: true
+        use_tls: true
+      template: openfl.federation.Network
+    task_runner:
+      settings: {}
+      template: src.taskrunner.TemplateTaskRunner
+    tasks:
+      aggregated_model_validation:
+        function: validate_task
+        kwargs:
+          apply: global
+          metrics:
+            - acc
+      locally_tuned_model_validation:
+        function: validate_task
+        kwargs:
+          apply: local
+          metrics:
+            - acc
+      settings: {}
+      train:
+        function: train_task
+        kwargs:
+          epochs: 1
+          metrics:
+            - loss
+
+    New workspace directory structure:
+    cnn_train_eval
+    ├── requirements.txt
+    ├── .workspace
+    ├── logs
+    ├── data
+    ├── cert
+    ├── README.md
+    ├── src
+    │   ├── __init__.py
+    │   ├── taskrunner.py
+    │   ├── cnn_model.py
+    │   └── dataloader.py
+    ├── plan
+    │   ├── cols.yaml
+    │   ├── plan.yaml
+    │   ├── data.yaml
+    │   └── defaults
+    └── save
+
+    6 directories, 11 files
+
+    ✔️ OK
+    Setting Up Certificate Authority...
+
+    Done.
+
+    ✔️ OK
+    Creating AGGREGATOR certificate key pair with following settings: CN=devvm###.com, SAN=DNS:devvm###.com
+
+    ✔️ OK
+    The CSR Hash for file server/agg_devvm###.com.csr = 3affa56ce391a084961c5f1ba634f223536173665daa6191e705e13557f36d58c844133758f804d1f85d93bfc113fd7b
+
+    Signing AGGREGATOR certificate
+
+    ✔️ OK
+
+2. Initialize the plan
+
+.. code-block:: shell
+
+    cd ~/src/clean/openfl/cnn_train_eval
+    fx plan initialize >~/plan.log 2>&1 &
+    tail -f ~/plan.log
+
+This should initialize the plan with random initial weights in ``init.pbuf``:
+
+.. code-block:: shell
+
+    WARNING  Following parameters omitted from global initial model, local initialization will determine values: []  plan.py:186
+    INFO     Creating Initial Weights File 🠆 save/init.pbuf  plan.py:196
+    ✔️ OK
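+
+As an optional sanity check (plain shell, assuming the workspace layout shown above), confirm that the initial weights file was created:
+
+.. code-block:: shell
+
+    ls -l save/init.pbuf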
+3. Next, run the 'learning' federation with two collaborators
+
+.. code-block:: shell
+
+    ## Create two collaborators
+    cd ~/src/clean/openfl/cnn_train_eval
+    fx collaborator create -n collaborator1 -d 1
+    fx collaborator generate-cert-request -n collaborator1
+    fx collaborator certify -n collaborator1 --silent
+    fx collaborator create -n collaborator2 -d 2
+    fx collaborator generate-cert-request -n collaborator2
+    fx collaborator certify -n collaborator2 --silent
+
+    ## start the fedeval federation
+    fx aggregator start > ~/fx_aggregator.log 2>&1 &
+    fx collaborator start -n collaborator1 > ~/collab1.log 2>&1 &
+    fx collaborator start -n collaborator2 > ~/collab2.log 2>&1 &
+    cd ~
+    tail -f plan.log fx_aggregator.log collab1.log collab2.log
+
+This script will run two collaborators and start the aggregator with the default `--task_group` 'learning'.
+
+The same is defined in the assigner section of the plan, which comes from the defaults:
+
+.. code-block:: yaml
+
+    assigner:
+      settings:
+        task_groups:
+          - name: learning
+            percentage: 1.0
+            tasks:
+              - aggregated_model_validation
+              - train
+              - locally_tuned_model_validation
+
+This will run 2 rounds of training across both collaborators:
+
+.. code-block:: shell
+
+    ==> fx_aggregator.log <==
+    INFO  Sending tasks to collaborator collaborator2 for round 0  aggregator.py:409
-.. literalinclude:: ../../../openfl-workspace/torch_cnn_mnist_fed_eval/plan/plan.yaml
+
+    ==> collab2.log <==
+    INFO  Received Tasks: [name: "aggregated_model_validation"  collaborator.py:184
+    , name: "train"
-.. literalinclude:: ../../../openfl-workspace/workspace/plan/defaults/federated-evaluation/aggregator.yaml
+    , name: "locally_tuned_model_validation"
-.. literalinclude:: ../../../openfl-workspace/workspace/plan/defaults/federated-evaluation/assigner.yaml
+    ]
-.. literalinclude:: ../../../openfl-workspace/workspace/plan/defaults/federated-evaluation/tasks_torch.yaml
+
+Once the learning federation completes, we can note the best model accuracy reported and save the ``best.pbuf`` file for the next step - evaluation:
-Key Changes for Federated Evaluation by baking in defaults for:
+
+.. code-block:: shell
+
+    ==> fx_aggregator.log <==
+    [06:09:27] INFO  Collaborator collaborator1 is sending task results for train, round 1
+
+    [06:09:28] INFO  Collaborator collaborator1 is sending task results for locally_tuned_model_validation, round 1  aggregator.py:629
+    INFO  Round 1: Collaborators that have completed all tasks: ['collaborator2', 'collaborator1']  aggregator.py:1049
+    INFO  Round 1: saved the best model with score 0.960096
+
+    INFO  Saving round 1 model...
+
+    INFO  Experiment Completed. Cleaning up...
+
+In this case we can confirm that after the 2 rounds of training the model reported an accuracy of 0.960096:
+
+.. code-block:: shell
+
+    Round 1: saved the best model with score 0.960096  aggregator.py:955
+
+Let's save this model (``best.pbuf``) for later use:
+
+.. code-block:: shell
-1. **aggregator.settings.rounds_to_train**: Set to 1
-2. **assigner**: Assign to aggregated_model_validation instead of default assignments
-3. **tasks**: Set to aggregated_model_validation instead of default tasks
+
+    cp cnn_train_eval/save/best.pbuf ~/trained_model.pbuf
+    devuser@devvm:~/src/clean/openfl$
+
+Now let's create another workspace for evaluation, using the same plan and steps as in the learning setup above.
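+
+A minimal sketch of that evaluation workspace setup, mirroring the training workspace commands above (the ``cnn_eval`` prefix is chosen to match the paths referenced below):
+
+.. code-block:: shell
+
+    fx workspace create --prefix ./cnn_eval --template torch_cnn_mnist
+    cd cnn_eval
+    fx workspace certify
+    fx aggregator generate-cert-request
+    fx aggregator certify --silent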
+
+After that, we initialize the plan, replace the ``init.pbuf`` with the previously saved ``best.pbuf``, and re-adjust the plan
+to use "evaluation" defaults.
+
+Once all the pieces are in place, we run the aggregator in evaluation mode by supplying `--task_group` as "evaluation", validating the
+accuracy of the previously trained model.
+
+The updated plan, post initialization and with the edits that make it ready for evaluation, is as follows:
+
+.. code-block:: yaml
+
+    aggregator:
+      settings:
+        best_state_path: save/best.pbuf
+        db_store_rounds: 2
+        init_state_path: save/init.pbuf
+        last_state_path: save/last.pbuf
+        rounds_to_train: 1
+        write_logs: false
+      template: openfl.component.aggregator.Aggregator
+    assigner:
+      settings:
+        task_groups:
+          - name: evaluation
+            percentage: 1.0
+            tasks:
+              - aggregated_model_validation
+      template: openfl.component.RandomGroupedAssigner
+    collaborator:
+      settings:
+        db_store_rounds: 1
+        delta_updates: false
+        opt_treatment: RESET
+      template: openfl.component.collaborator.Collaborator
+    compression_pipeline:
+      settings: {}
+      template: openfl.pipelines.NoCompressionPipeline
+    data_loader:
+      settings:
+        batch_size: 64
+        collaborator_count: 2
+      template: src.dataloader.PyTorchMNISTInMemory
+    network:
+      settings:
+        agg_addr: devvm###.com
+        agg_port: 55529
+        cert_folder: cert
+        client_reconnect_interval: 5
+        hash_salt: auto
+        require_client_auth: true
+        use_tls: true
+      template: openfl.federation.Network
+    task_runner:
+      settings: {}
+      template: src.taskrunner.TemplateTaskRunner
+    tasks:
+      aggregated_model_validation:
+        function: validate_task
+        kwargs:
+          apply: global
+          metrics:
+            - acc
+      locally_tuned_model_validation:
+        function: validate_task
+        kwargs:
+          apply: local
+          metrics:
+            - acc
+      settings: {}
+      train:
+        function: train_task
+        kwargs:
+          epochs: 1
+          metrics:
+            - loss
+
+We have made the following changes to the initialized torch_cnn_mnist plan in the new workspace:
+    - Set ``rounds_to_train`` to 1, as evaluation needs just one round of federation run across the collaborators
+    - Removed all other training-related tasks from the assigner settings except "aggregated_model_validation"
+
+Now let's replace the ``init.pbuf`` with the previously saved ``trained_model.pbuf``:
+
+.. code-block:: shell
-**Optional**: modify ``src/pt_cnn.py`` to remove optimizer initialization and definition of loss function as these are not needed for evaluation
+
+    ll cnn_eval/save/init.pbuf
+    -rw------- 1 devuser devuser 1722958 Jan 14 09:44 cnn_eval/save/init.pbuf
+    (venv) devuser@devvm:~/src/clean/openfl$ cp ~/trained_model.pbuf cnn_eval/save/init.pbuf
+    (venv) devuser@devvm:~/src/clean/openfl$ ll cnn_eval/save/init.pbuf
+    -rw------- 1 devuser devuser 1722974 Jan 14 09:52 cnn_eval/save/init.pbuf
+    (venv) devuser@devvm:~/src/clean/openfl$
+
+Notice the size change in ``init.pbuf`` as it is replaced by the trained model we saved from the training run of the federation.
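+
+Optionally, verify the swap by comparing the checksums of the two files (a simple check with standard tooling; paths as above):
+
+.. code-block:: shell
+
+    md5sum ~/trained_model.pbuf cnn_eval/save/init.pbuf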
+Finally, let's run the federation, this time launching the aggregator with the default value of `--task_group` overridden to "evaluation":
+
+.. code-block:: shell
+
+    ## Create two collaborators
+    cd ~/src/clean/openfl/cnn_eval
+    fx collaborator create -n collaborator1 -d 1
+    fx collaborator generate-cert-request -n collaborator1
+    fx collaborator certify -n collaborator1 --silent
+    fx collaborator create -n collaborator2 -d 2
+    fx collaborator generate-cert-request -n collaborator2
+    fx collaborator certify -n collaborator2 --silent
+
+    ## start the fedeval federation
+    fx aggregator start --task_group evaluation > ~/fx_aggregator.log 2>&1 &
+    fx collaborator start -n collaborator1 > ~/collab1.log 2>&1 &
+    fx collaborator start -n collaborator2 > ~/collab2.log 2>&1 &
+    cd ~
+    tail -f plan.log fx_aggregator.log collab1.log collab2.log
+
+Notice that the only change in the federation run steps from the previous training round is the additional `--task_group` argument to aggregator start.
+
+Since the aggregator's task_group is set to "evaluation", it will skip the `round_number_check` and use the supplied init model just for evaluation:
+
+.. code-block:: shell
+
+    INFO  Setting aggregator to assign: evaluation task_group  aggregator.py:101
+    INFO  🧿 Starting the Aggregator Service.  aggregator.py:103
+
+    INFO  Skipping round_number check for evaluation task_group  aggregator.py:215
+    INFO  Starting Aggregator gRPC Server
+
+In each collaborator's log we can see that only the evaluation task is assigned:
+
+.. code-block:: shell
+
+    ==> collab1.log <==
+    INFO  Waiting for tasks...  collaborator.py:234
+    INFO  Received Tasks: [name: "aggregated_model_validation"  collaborator.py:184
+    ]
+
+    ==> collab2.log <==
+    INFO  Waiting for tasks...  collaborator.py:234
+    INFO  Received Tasks: [name: "aggregated_model_validation"  collaborator.py:184
+    ]
+
+After the federation run, since it is an evaluation-only run, the collaborators report the accuracy of the init model which, given a successful
+evaluation, is the same as the previously trained best model's accuracy, in our case 0.960096:
+
+.. code-block:: shell
-This sample script will create a federation based on the `torch_cnn_mnist_fed_eval` template using the `plan.yaml` file defined above, spawning two collaborator nodes and a single aggregator node. The model will be sent to the two collaborator nodes, where each collaborator will perform model validation on its own local data. The accuracy from this model validation will then be send back to the aggregator where it will aggregated into a final accuracy metric. The federation will then be shutdown.
+
+    ==> fx_aggregator.log <==
+    [10:00:15] INFO  Collaborator collaborator2 is sending task results for aggregated_model_validation, round 0  aggregator.py:629
+    INFO  Round 0: Collaborators that have completed all tasks: ['collaborator2']  aggregator.py:1049
+    INFO  Collaborator collaborator1 is sending task results for aggregated_model_validation, round 0  aggregator.py:629
+    INFO  Round 0: Collaborators that have completed all tasks: ['collaborator2', 'collaborator1']  aggregator.py:1049
+    INFO  Round 0: saved the best model with score 0.960096  aggregator.py:955
+    INFO  Saving round 0 model...  aggregator.py:994
+    INFO  Experiment Completed. Cleaning up...  aggregator.py:1005
+    INFO  Sending signal to collaborator collaborator1 to shutdown...  aggregator.py:356
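+
+As a final cross-check, the reported score can be pulled straight from the aggregator log (assuming the log path used above):
+
+.. code-block:: shell
+
+    grep "saved the best model" ~/fx_aggregator.log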
 ---
-Congratulations, you have successfully performed federated evaluation across two decentralized collaborator nodes with minor default reference changes to plan
\ No newline at end of file
+Congratulations, you have successfully performed federated evaluation across two decentralized collaborator nodes using the same plan, with minor evaluation-related changes, leveraging a previously trained OpenFL model protobuf as input.
\ No newline at end of file
diff --git a/docs/about/features_index/taskrunner.rst b/docs/about/features_index/taskrunner.rst
index 2097c72f32..f8730f463b 100644
--- a/docs/about/features_index/taskrunner.rst
+++ b/docs/about/features_index/taskrunner.rst
@@ -44,8 +44,9 @@ Configurable Settings
   - :code:`best_state_path`: (str:path) Defines the weight protobuf file path that will be saved to for the highest accuracy model during the experiment.
   - :code:`last_state_path`: (str:path) Defines the weight protobuf file path that will be saved to during the last round completed in each experiment.
   - :code:`rounds_to_train`: (int) Specifies the number of rounds in a federation. A federated learning round is defined as one complete iteration when the collaborators train the model and send the updated model weights back to the aggregator to form a new global model. Within a round, collaborators can train the model for multiple iterations called epochs.
-  - :code:`write_logs`: (boolean) Metric logging callback feature. By default, logging is done through `tensorboard `_ but users can also use custom metric logging function for each task.
-
+  - :code:`write_logs`: (boolean) Metric logging callback feature. By default, logging is done through `tensorboard `_ but users can also use a custom metric logging function for each task.
+  - :code:`persist_checkpoint`: (boolean) Specifies whether to enable the storage of a persistent checkpoint in non-volatile storage for recovery purposes. When enabled, the aggregator restores its state to what it was prior to the restart, ensuring continuity across restarts.
+  - :code:`persistent_db_path`: (str:path) Defines the persisted database path.

 - :class:`Collaborator ` `openfl.component.Collaborator `_

diff --git a/docs/developer_guide/advanced_topics/overriding_agg_fn.rst b/docs/developer_guide/advanced_topics/overriding_agg_fn.rst
index e2bc2fd396..253a437bf6 100644
--- a/docs/developer_guide/advanced_topics/overriding_agg_fn.rst
+++ b/docs/developer_guide/advanced_topics/overriding_agg_fn.rst
@@ -7,20 +7,9 @@
 Override Aggregation Function
 *****************************

-With the aggregator-based workflow, you can use custom aggregation functions for each task via Python\*\ API or command line interface.
+With the aggregator-based workflow, you can use custom aggregation functions for each task via the command line interface.

-Python API (Deprecated)
-==========
-
-1. Create an implementation of :class:`openfl.interface.aggregation_functions.core.AggregationFunction`.
-
-2. In the ``override_config`` keyword argument of the :func:`openfl.native.run_experiment` native function, pass the implementation as a ``tasks.{task_name}.aggregation_type`` parameter.
-
-.. note::
-    See `Federated PyTorch MNIST Tutorial `_ for an example of the custom aggregation function.
- - Command Line Interface ====================== diff --git a/docs/developer_guide/advanced_topics/overriding_plan_settings.rst b/docs/developer_guide/advanced_topics/overriding_plan_settings.rst deleted file mode 100644 index aae1f7ea02..0000000000 --- a/docs/developer_guide/advanced_topics/overriding_plan_settings.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. # Copyright (C) 2020-2023 Intel Corporation -.. # SPDX-License-Identifier: Apache-2.0 - -.. _overriding_plan_settings: - -*********************** -Updating plan settings -*********************** - -With the director-based workflow, you can use custom plan settings before starting the experiment. Changing plan settings in command line interface is straightforward by modifying plan.yaml. -When using Python API or Director Envoy based interactive API (Deprecated), **override_config** can be used to update plan settings. - - -Python API (Deprecated) -========== - -Modify the plan settings: - -.. code-block:: python - - final_fl_model = fx.run_experiment(collaborators, override_config={ - 'aggregator.settings.rounds_to_train': 5, - 'aggregator.settings.log_metric_callback': write_metric, - }) - - -Director Envoy Based Interactive API Interface (Deprecated) -=========================================================== -Once you create an FL_experiment object, a basic federated learning plan with default settings is created. To check the default plan settings, print the plan as shown below: - -.. code-block:: python - - fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name) - import openfl.native as fx - print(fx.get_plan(fl_plan=fl_experiment.plan)) - -Here is an example of the default plan settings that get displayed: - -.. code-block:: python - - "aggregator.settings.best_state_path": "save/best.pbuf", - "aggregator.settings.db_store_rounds": 2, - "aggregator.settings.init_state_path": "save/init.pbuf", - "aggregator.settings.last_state_path": "save/last.pbuf", - "aggregator.settings.rounds_to_train": 10, - "aggregator.settings.write_logs": true, - "aggregator.template": "openfl.component.Aggregator", - "assigner.settings.task_groups.0.name": "train_and_validate", - "assigner.settings.task_groups.0.percentage": 1.0, - "assigner.settings.task_groups.0.tasks.0": "aggregated_model_validation", - "assigner.settings.task_groups.0.tasks.1": "train", - "assigner.settings.task_groups.0.tasks.2": "locally_tuned_model_validation", - "assigner.template": "openfl.component.RandomGroupedAssigner", - "collaborator.settings.db_store_rounds": 1, - "collaborator.settings.delta_updates": false, - "collaborator.settings.opt_treatment": "RESET", - "collaborator.template": "openfl.component.Collaborator", - "compression_pipeline.settings": {}, - "compression_pipeline.template": "openfl.pipelines.NoCompressionPipeline", - "data_loader.settings": {}, - "data_loader.template": "openfl.federated.DataLoader", - "network.settings.agg_addr": "auto", - "network.settings.agg_port": "auto", - "network.settings.cert_folder": "cert", - "network.settings.client_reconnect_interval": 5, - "network.settings.disable_client_auth": false, - "network.settings.hash_salt": "auto", - "network.settings.tls": true, - "network.template": "openfl.federation.Network", - "task_runner.settings": {}, - "task_runner.template": "openfl.federated.task.task_runner.CoreTaskRunner", - "tasks.settings": {} - - -Use **override_config** with FL_experiment.start to make any changes to the default plan settings. 
It's essentially a dictionary with the keys corresponding to plan parameters along with the corresponding values (or list of values). Any new key entry will be added to the plan and for any existing key present in the plan, the value will be overrriden. - - -.. code-block:: python - - fl_experiment.start(model_provider=MI, - task_keeper=TI, - data_loader=fed_dataset, - rounds_to_train=5, - opt_treatment='CONTINUE_GLOBAL', - override_config={'aggregator.settings.db_store_rounds': 1, 'compression_pipeline.template': 'openfl.pipelines.KCPipeline', 'compression_pipeline.settings.n_clusters': 2}) - - -Since 'aggregator.settings.db_store_rounds' and 'compression_pipeline.template' fields are already present in the plan, the values of these fields get replaced. Field 'compression_pipeline.settings.n_clusters' is a new entry that gets added to the plan: - -.. code-block:: python - - INFO Updating aggregator.settings.db_store_rounds to 1... native.py:102 - - INFO Updating compression_pipeline.template to openfl.pipelines.KCPipeline... native.py:102 - - INFO Did not find compression_pipeline.settings.n_clusters in config. Make sure it should exist. Creating... native.py:105 - - -A full implementation can be found at `Federated_Pytorch_MNIST_Tutorial.ipynb `_ and at `Tensorflow_MNIST.ipynb `_. diff --git a/docs/developer_guide/running_the_federation.notebook.rst b/docs/developer_guide/running_the_federation.notebook.rst deleted file mode 100644 index 44e18e1380..0000000000 --- a/docs/developer_guide/running_the_federation.notebook.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. # Copyright (C) 2020-2023 Intel Corporation -.. # SPDX-License-Identifier: Apache-2.0 - -.. _running_notebook: - -********************************** -Aggregator-Based Workflow Tutorial (Deprecated) -********************************** - -You will start a Jupyter\* \ lab server and receive a URL you can use to access the tutorials. Jupyter notebooks are provided for PyTorch\* \ and TensorFlow\* \ that simulate a federation on a local machine. - -.. note:: - - Follow the procedure to become familiar with the APIs used in aggregator-based workflow and conventions such as *FL Plans*, *Aggregators*, and *Collaborators*. - - -Start the Tutorials -=================== - -1. Start a Python\* \ 3.10 (>=3.10, <3.13) virtual environment and confirm OpenFL is available. - - .. code-block:: python - - fx - - You should see a list of available commands - -2. Start a Jupyter server. This returns a URL to access available tutorials. - - .. code-block:: python - - fx tutorial start - -3. Open the URL (including the token) in your browser. - -4. Choose a tutorial from which to start. Each tutorial is a demonstration of a simulated federated learning. The following are examples of available tutorials: - - - :code:`Federated Keras MNIST Tutorial`: workspace with a simple `Keras `_ CNN model that will download the `MNIST `_ dataset and train in a federation. - - :code:`Federated Pytorch MNIST Tutorial`: workspace with a simple `PyTorch `_ CNN model that will download the `MNIST `_ dataset and train in a federation. - - :code:`Federated PyTorch UNET Tutorial`: workspace with a UNET `PyTorch `_ model that will download the `Hyper-Kvasir `_ dataset and train in a federation. - - :code:`Federated PyTorch TinyImageNet`: workspace with a MobileNet-V2 `PyTorch `_ model that will download the `Tiny-ImageNet `_ dataset and train in a federation. 
- - -Familiarize with the API Concepts in an Aggregator-Based Worklow -================================================================ - -Step 1: Enable the OpenFL Python API -------------------------------------------- - -Add the following lines to your Python script. - - .. code-block:: python - - import openfl.native as fx - from openfl.federated import FederatedModel, FederatedDataSet - -This loads the OpenFL package and import wrappers that adapt your existing data and models to a (simulated) federated context. - -Step 2: Set Up the Experiment ------------------------------ - -For a basic experiment, run the following command. - - .. code-block:: python - - fx.init() - - -This creates a workspace directory containing default FL plan values for your experiments, and sets up a an experiment with two collaborators (the collaborators are creatively named **one** and **two**). - -For an experiment with more collaborators, run the following command. - - .. code-block:: python - - collaborator_list = [str(i) for i in range(NUM_COLLABORATORS)] - fx.init('keras_cnn_mnist', col_names=collaborator_list) - - -.. note:: - - The following are template recommendations for training models: - - - For Keras models, run :code:`fx.init('keras_cnn_mnist')` to start with the *keras_cnn_mnist* template. - - For PyTorch models, run :code:`fx.init('torch_cnn_mnist')` to start with the *torch_cnn_mnist* template. - - -Step 3: Customize the Federated Learning Plan (FL Plan) -------------------------------------------------------- - -For this example, the experiment is set up with the *keras_cnn_mnist* template. - - .. code-block:: python - - fx.init('keras_cnn_mnist') - - -See the FL plan values that can be set with the :code:`fx.get_plan()` command. - - .. code-block:: python - - print(fx.get_plan()) - - { - "aggregator.settings.best_state_path": "save/keras_cnn_mnist_best.pbuf", - "aggregator.settings.init_state_path": "save/keras_cnn_mnist_init.pbuf", - "aggregator.settings.last_state_path": "save/keras_cnn_mnist_last.pbuf", - "aggregator.settings.rounds_to_train": 10, - "aggregator.template": "openfl.component.Aggregator", - ... - } - -Based on this plan values, the experiment will run for 10 rounds. You can customize the experiment to run for 20 rounds either at runtime or ahead of time. - -Set the value at **runtime** with the :code:`override-config` parameter of :code:`fx.run_experiment`. - - .. code-block:: python - - #set values at experiment runtime - fx.run_experiment(experiment_collaborators, override_config={"aggregator.settings.rounds_to_train": 20}) - - -Set the value **ahead of time** with :code:`fx.update_plan()`. - - .. code-block:: python - - #Set values ahead of time with fx.update_plan() - fx.update_plan({"aggregator.settings.rounds_to_train": 20}) - - -Step 4: Wrap the Data and Model -------------------------------- - -Use the :code:`FederatedDataSet` function to wrap in-memory numpy datasets and split the data into N mutually-exclusive chunks for each collaborator participating in the experiment. - - .. code-block:: python - - fl_data = FederatedDataSet(train_images, train_labels, valid_images, valid_labels, batch_size=32, num_classes=classes) - -Similarly, the :code:`FederatedModel` function takes as an argument your model definition. For the first example, you can wrap a Keras model in a function that outputs the compiled model. - -**Example 1:** - - .. 
code-block:: python - - def build_model(feature_shape,classes): - #Defines the MNIST model - model = Sequential() - model.add(Dense(64, input_shape=feature_shape, activation='relu')) - model.add(Dense(64, activation='relu')) - model.add(Dense(classes, activation='softmax')) - - model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy']) - return model - - fl_model = FederatedModel(build_model, data_loader=fl_data) - -For the second example with a PyTorch model, the :code:`FederatedModel` function takes the following parameters: - -- The class that defines the network definition and associated forward function -- The lambda optimizer method that can be set to a newly instantiated network -- The loss function - -**Example 2:** - - .. code-block:: python - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 16, 3) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(16, 32, 3) - self.fc1 = nn.Linear(32 * 5 * 5, 32) - self.fc2 = nn.Linear(32, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(x.size(0),-1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return F.log_softmax(x, dim=1) - - optimizer = lambda x: optim.Adam(x, lr=1e-4) - - def cross_entropy(output, target): - """Binary cross-entropy metric - """ - return F.binary_cross_entropy_with_logits(input=output,target=target) - - fl_model = FederatedModel(build_model=Net, optimizer=optimizer, loss_fn=cross_entropy, data_loader=fl_data) - - -Step 5: Define the Collaborators --------------------------------- - -Define the collaborators taking part in the experiment. The example below uses the collaborator list, created earlier with the the :code:`fx.init()` command. - - .. code-block:: python - - experiment_collaborators = {col_name:col_model for col_name, col_model \ - in zip(collaborator_list, fl_model.setup(len(collaborator_list)))} - -This command creates a model for each collaborator with their data shard. - -.. note:: - - In production deployments of OpenFL, each collaborator will have the data on premise. Splitting data into shards is not necessary. - -Step 6: Run the Experiment --------------------------- - -Run the experiment for five rounds and return the final model once completed. - - .. code-block:: python - - final_fl_model = fx.run_experiment(experiment_collaborators, override_config={"aggregator.settings.rounds_to_train": 5}) \ No newline at end of file diff --git a/docs/get_started/examples.rst b/docs/get_started/examples.rst index 7c2bb92dd2..dcb172d233 100644 --- a/docs/get_started/examples.rst +++ b/docs/get_started/examples.rst @@ -7,12 +7,7 @@ Examples for Running a Federation ================================= -OpenFL currently offers four ways to set up and run experiments with a federation: -the Task Runner API, Python Native API, the Interactive API (Deprecated), and the Workflow API. -the Task Runner API is advised for production scenarios where the workload needs to be verified prior to execution, whereas the python native API provides a clean python interface on top of it intended for simulation purposes. -The Interactive API (Deprecated) introduces a convenient way to set up a federation and brings “long-lived” components in a federation (“Director” and “Envoy”), -while the Task Runner API workflow is advised for scenarios where the workload needs to be verified prior to execution. 
In contrast, the currently experimental Workflow API
-is introduced to provide significant flexility to researchers and developers in the construction of federated learning experiments.
+OpenFL currently offers two ways to set up and run experiments with a federation: the Task Runner API and the Workflow API. The Task Runner API is advised for production scenarios where the workload needs to be verified prior to execution. The experimental Workflow API is introduced to provide significant flexibility to researchers and developers in the construction of federated learning experiments.

 As OpenFL nears it's 2.0 release, we expect to consolidate these APIs and make the Workflow API the primary interface going forward. See our `roadmap `_ for more details.

diff --git a/docs/get_started/examples/python_native_pytorch_mnist.rst b/docs/get_started/examples/python_native_pytorch_mnist.rst
deleted file mode 100644
index 38e6028962..0000000000
--- a/docs/get_started/examples/python_native_pytorch_mnist.rst
+++ /dev/null
@@ -1,173 +0,0 @@
-.. # Copyright (C) 2020-2023 Intel Corporation
-.. # SPDX-License-Identifier: Apache-2.0
-
-.. _python_native_pytorch_mnist:
-
-==========================================
-Python Native API: Federated PyTorch MNIST (Deprecated)
-==========================================
-
-In this tutorial, we will set up a federation and train a basic PyTorch model on the MNIST dataset using the Python Native API.
-See `full notebook `_.
-
-.. note::
-
-    Ensure you have installed the OpenFL package.
-
-    See :ref:`installation` for details.
-
-
-Install additional dependencies if not already installed
-
-.. code-block:: shell
-
-    $ pip install torch torchvision
-
-.. code-block:: python
-
-    import numpy as np
-    import torch
-    import torch.nn as nn
-    import torch.nn.functional as F
-    import torch.optim as optim
-
-    import torchvision
-    import torchvision.transforms as transforms
-    import openfl.native as fx
-    from openfl.federated import FederatedModel,FederatedDataSet
-
-After importing the required packages, the next step is setting up our openfl workspace.
-To do this, simply run the ``fx.init()`` command as follows:
-
-.. code-block:: python
-
-    #Setup default workspace, logging, etc.
-    fx.init('torch_cnn_mnist', log_level='METRIC', log_file='./spam_metric.log')
-
-Now we are ready to define our dataset and model to perform federated learning on.
-The dataset should be composed of a numpy array. We start with a simple fully connected model that is trained on the MNIST dataset.
-
-.. code-block:: python
-
-    def one_hot(labels, classes):
-        return np.eye(classes)[labels]
-
-    transform = transforms.Compose(
-        [transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-
-    trainset = torchvision.datasets.MNIST(root='./data', train=True,
-                                        download=True, transform=transform)
-
-    train_images,train_labels = trainset.train_data, np.array(trainset.train_labels)
-    train_images = torch.from_numpy(np.expand_dims(train_images, axis=1)).float()
-
-    validset = torchvision.datasets.MNIST(root='./data', train=False,
-                                        download=True, transform=transform)
-
-    valid_images,valid_labels = validset.test_data, np.array(validset.test_labels)
-    valid_images = torch.from_numpy(np.expand_dims(valid_images, axis=1)).float()
-    valid_labels = one_hot(valid_labels,10)
-
-..
code-block:: python - - feature_shape = train_images.shape[1] - classes = 10 - - fl_data = FederatedDataSet(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes) - - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 16, 3) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(16, 32, 3) - self.fc1 = nn.Linear(32 * 5 * 5, 32) - self.fc2 = nn.Linear(32, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(x.size(0),-1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return F.log_softmax(x, dim=1) - - optimizer = lambda x: optim.Adam(x, lr=1e-4) - - def cross_entropy(output, target): - """Binary cross-entropy metric - """ - return F.cross_entropy(input=output,target=target) - - -Here we can define metric logging function. It should has the following signature described below. You can use it to write metrics to tensorboard or some another specific logging. - -.. code-block:: python - - from torch.utils.tensorboard import SummaryWriter - - writer = SummaryWriter('./logs/cnn_mnist', flush_secs=5) - - - def write_metric(node_name, task_name, metric_name, metric, round_number): - writer.add_scalar("{}/{}/{}".format(node_name, task_name, metric_name), - metric, round_number) - -.. code-block:: python - - #Create a federated model using the pytorch class, lambda optimizer function, and loss function - fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy,data_loader=fl_data) - -The ``FederatedModel`` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. -It provides built in federated training and validation functions that we will see used below. -Using it's setup function, collaborator models and datasets can be automatically defined for the experiment. - -.. code-block:: python - - collaborator_models = fl_model.setup(num_collaborators=2) - collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]} - -.. code-block:: python - - #Original MNIST dataset - print(f'Original training data size: {len(train_images)}') - print(f'Original validation data size: {len(valid_images)}\n') - - #Collaborator one's data - print(f'Collaborator one\'s training data size: {len(collaborator_models[0].data_loader.X_train)}') - print(f'Collaborator one\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\n') - - #Collaborator two's data - print(f'Collaborator two\'s training data size: {len(collaborator_models[1].data_loader.X_train)}') - print(f'Collaborator two\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\n') - - #Collaborator three's data - #print(f'Collaborator three\'s training data size: {len(collaborator_models[2].data_loader.X_train)}') - #print(f'Collaborator three\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}') - -We can see the current plan values by running the ``fx.get_plan()`` function - -.. code-block:: python - - #Get the current values of the plan. Each of these can be overridden - print(fx.get_plan()) - -Now we are ready to run our experiment. -If we want to pass in custom plan settings, we can easily do that with the override_config parameter - -.. 
code-block:: python - - # Run experiment, return trained FederatedModel - - final_fl_model = fx.run_experiment(collaborators, override_config={ - 'aggregator.settings.rounds_to_train': 5, - 'aggregator.settings.log_metric_callback': write_metric, - }) - -.. code-block:: python - - #Save final model - final_fl_model.save_native('final_pytorch_model') diff --git a/docs/installation.md b/docs/installation.md index 037bf9fb44..d25a99d95d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -61,7 +61,7 @@ This method can be used to run federated learning experiments in an isolated env > **Note:** OpenFL image hosted on `docker.io` has not been updated since the 1.5 release due to a change in namespace. We are working on this issue. In the meantime, use the instructions below to build an image from source. ```bash - docker pull intel/openfl + docker pull ghcr.io/securefederatedai/openfl/openfl:latest ``` * Build from source: diff --git a/docs/openfl.native.rst b/docs/openfl.native.rst deleted file mode 100644 index 33fdbb1914..0000000000 --- a/docs/openfl.native.rst +++ /dev/null @@ -1,16 +0,0 @@ -``openfl.native`` module (Deprecated) -===================================== - -.. currentmodule:: openfl.native - -.. automodule:: openfl.native - -.. autosummary:: - :toctree: _autosummary - :template: custom-module-template.rst - :recursive: - - native - fastestimator - -.. TODO(MasterSkepticista) Shrink API namespace diff --git a/docs/openfl.rst b/docs/openfl.rst index a4dd53dc5c..d8cd780922 100644 --- a/docs/openfl.rst +++ b/docs/openfl.rst @@ -16,7 +16,6 @@ Subpackages openfl.databases openfl.federated openfl.interface - openfl.native openfl.pipelines openfl.plugins openfl.protocols diff --git a/docs/releases.md b/docs/releases.md index 7bc328b7b9..30ed029ea6 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,5 +1,28 @@ # Releases +## 1.7 +[Full Release Notes](https://github.com/securefederatedai/openfl/releases/tag/v1.7) + +### New Features +- [**FederatedRuntime**](https://openfl.readthedocs.io/en/latest/about/features_index/workflowinterface.html#runtimes-future-plans) for Workflow API: enables a seamless transition from a local simulation (via LocalRuntime) to a distributed Federated Learning deployment - all orchestrated from a familiar Jupyter notebook environment. Check out the [FederatedRuntime 101 Tutorial](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST) to try it yourself. The initial version of the FederatedRuntime included in this release is an experimental feature that should be used only in an internal environment. We further recommend that users operate only on artificial or public data that is not considered intellectual property. The experimental tag and restrictions will be removed in future releases of OpenFL. + +- [**Federated XGBoost**](https://github.com/securefederatedai/openfl/tree/develop/openfl-workspace/xgb_higgs): Adding support for XGBoost training in OpenFL via TaskRunner API, illustrated with the Higgs dataset. + +- [**Callbacks**](https://openfl.readthedocs.io/en/latest/openfl.callbacks.html): An abstraction for running user-defined actions in TaskRunner API or Workflow API. Callbacks can be used to perform custom actions at different stages of the Federated Learning process. + +### Enhanced Developer Experience +- **Streamlining OpenFL APIs**: With this release, the OpenFL Team will concentrate on the TaskRunner API and Workflow API. 
Consequently, the Python Native API and Interactive API have been deprecated and are scheduled for removal in future iterations. + +- **FL Workspace Dockerization**: Revised Task Runner API workspace dockerization process, with TEE-ready containers (using Gramine and Intel® Software Guard Extensions). Follow the [updated instructions](https://github.com/securefederatedai/openfl/blob/develop/openfl-docker/README.md) to enhance the privacy and security of your FL experiments. + +- **Federated Evaluation via TaskRunner API**: OpenFL 1.7 further simplifies the creation of Federated Evaluation experiments via the TaskRunner API (see the example [FedEval workspace](https://github.com/securefederatedai/openfl/tree/develop/openfl-workspace/torch_cnn_mnist_fed_eval)). + +- **Keras 3 API**: Upgrading the base TaskRunner classes and example workspaces to Keras 3 for building state-of-the-art FL experiments with TensorFlow (more backends to be included in the upcoming OpenFL releases). + +- **Updated Tutorials**: This includes fixes to existing tutorial and example code, and migrating a selection of key OpenFL tutorials from deprecated APIs to Workflow API. Check out the updated [Tutorials](https://github.com/securefederatedai/openfl/tree/develop/openfl-tutorials/experimental/workflow) folder. + +- **Updated Official Documentation**: The [OpenFL documentation website](https://openfl.readthedocs.io/en/latest/index.html) has been comprehensively reviewed and reorganized to improve navigation and provide clearer content. + ## 1.6 [Full Release Notes](https://github.com/securefederatedai/openfl/releases/tag/v1.6) diff --git a/linters-requirements.txt b/linters-requirements.txt index b4a2191f21..e5afd1d2ee 100644 --- a/linters-requirements.txt +++ b/linters-requirements.txt @@ -1,2 +1,2 @@ pre-commit -ruff +ruff==0.9.2 \ No newline at end of file diff --git a/openfl-docker/README.md b/openfl-docker/README.md index da8540770d..f9eb586ad9 100644 --- a/openfl-docker/README.md +++ b/openfl-docker/README.md @@ -8,7 +8,7 @@ To develop or simulate experiments within a container, build the base image (or ```shell # Pull latest stable base image -$> docker pull intel/openfl +$> docker pull ghcr.io/securefederatedai/openfl/openfl:latest # Or, build a base image from the latest source code $> docker build . -t openfl -f Dockerfile.base \ @@ -17,7 +17,7 @@ $> docker build . -t openfl -f Dockerfile.base \ Run the container: ```shell -user@vm:~/openfl$ docker run -it --rm openfl:latest bash +user@vm:~/openfl$ docker run -it --rm ghcr.io/securefederatedai/openfl/openfl:latest bash user@7b40624c207a:/$ fx OpenFL - Open Federated Learning @@ -86,4 +86,28 @@ docker run --rm \ -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \ --mount type=bind,source=./certs.tar,target=/certs.tar \ example_workspace bash -c "gramine-sgx fx collaborator start ..." 
diff --git a/openfl-docker/gramine_app/fx.manifest.template b/openfl-docker/gramine_app/fx.manifest.template
index 928dff0f56..55e20adf42 100755
--- a/openfl-docker/gramine_app/fx.manifest.template
+++ b/openfl-docker/gramine_app/fx.manifest.template
@@ -64,6 +64,7 @@ sgx.trusted_files = [
 # One should be conservative as to which files are allowed; these can be modified by the enclave.
 sgx.allowed_files = [
   "file:{{ workspace_root }}/save",
+  "file:{{ workspace_root }}/local_state",
   "file:{{ workspace_root }}/logs",
   "file:{{ workspace_root }}/cert",
   "file:{{ workspace_root }}/data",
diff --git a/openfl-tutorials/deprecated/native_api/Federated_FedProx_Keras_MNIST_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_FedProx_Keras_MNIST_Tutorial.ipynb
deleted file mode 100644
index cc0dcc1a9c..0000000000
--- a/openfl-tutorials/deprecated/native_api/Federated_FedProx_Keras_MNIST_Tutorial.ipynb
+++ /dev/null
@@ -1,376 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Federated Keras MNIST Tutorial"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "#Install Tensorflow and MNIST dataset if not installed\n",
-    "!pip install tensorflow==2.7.0\n",
-    "#Alternatively you could use the intel-tensorflow build\n",
-    "# !pip install intel-tensorflow==2.3.0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import tensorflow as tf\n",
-    "import tensorflow.keras as keras\n",
-    "from tensorflow.keras import backend as K\n",
-    "from tensorflow.keras import Sequential\n",
-    "from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D\n",
-    "from tensorflow.keras.utils import to_categorical\n",
-    "from tensorflow.keras.datasets import mnist\n",
-    "\n",
-    "import openfl.native as fx\n",
-    "from openfl.federated import FederatedModel,FederatedDataSet\n",
-    "tf.config.run_functions_eagerly(True)\n",
-    "tf.random.set_seed(0)\n",
-    "np.random.seed(0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def test_intel_tensorflow():\n",
-    "    \"\"\"\n",
-    "    Check if Intel version of TensorFlow is
installed\n", - " \"\"\"\n", - " import tensorflow as tf\n", - "\n", - " print(\"We are using Tensorflow version {}\".format(tf.__version__))\n", - "\n", - " major_version = int(tf.__version__.split(\".\")[0])\n", - " if major_version >= 2:\n", - " from tensorflow.python.util import _pywrap_util_port\n", - " print(\"Intel-optimizations (DNNL) enabled:\",\n", - " _pywrap_util_port.IsMklEnabled())\n", - " else:\n", - " print(\"Intel-optimizations (DNNL) enabled:\")\n", - "\n", - "test_intel_tensorflow()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After importing the required packages, the next step is setting up our openfl workspace. To do this, simply run the `fx.init()` command as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Setup default workspace, logging, etc.\n", - "fx.init('keras_cnn_mnist')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to define our dataset and model to perform federated learning on. The dataset should be composed of a numpy arrayWe start with a simple fully connected model that is trained on the MNIST dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Import and process training, validation, and test images/labels\n", - "\n", - "# Set the ratio of validation imgs, can't be 0.0\n", - "VALID_PERCENT = 0.3\n", - "\n", - "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", - "split_on = int((1 - VALID_PERCENT) * len(X_train))\n", - "\n", - "train_images = X_train[0:split_on,:,:]\n", - "train_labels = to_categorical(y_train)[0:split_on,:]\n", - "\n", - "valid_images = X_train[split_on:,:,:]\n", - "valid_labels = to_categorical(y_train)[split_on:,:]\n", - "\n", - "test_images = X_test\n", - "test_labels = to_categorical(y_test)\n", - "\n", - "def preprocess(images):\n", - " #Normalize\n", - " images = (images / 255) - 0.5\n", - " images = images.reshape(images.shape[0], -1)\n", - "# images = np.expand_dims(images, axis=-1)\n", - " return images\n", - "\n", - "# Preprocess the images.\n", - "train_images = preprocess(train_images)\n", - "valid_images = preprocess(valid_images)\n", - "test_images = preprocess(test_images)\n", - "\n", - "feature_shape = train_images.shape[1:]\n", - "classes = 10\n", - "\n", - "class UnbalancedFederatedDataset(FederatedDataSet):\n", - " def split(self, num_collaborators, shuffle=True, equally=False):\n", - " train_idx = self.split_lognormal(self.y_train, num_collaborators)\n", - " X_train = np.array([self.X_train[idx] for idx in train_idx])\n", - " y_train = np.array([self.y_train[idx] for idx in train_idx])\n", - " \n", - " valid_idx = self.split_lognormal(self.y_valid, num_collaborators)\n", - " X_valid = np.array([self.X_valid[idx] for idx in valid_idx])\n", - " y_valid = np.array([self.y_valid[idx] for idx in valid_idx])\n", - " \n", - " return [\n", - " FederatedDataSet(\n", - " X_train[i],\n", - " y_train[i],\n", - " X_valid[i],\n", - " y_valid[i],\n", - " batch_size=self.batch_size,\n", - " num_classes=self.num_classes\n", - " ) for i in range(num_collaborators)\n", - " ]\n", - " \n", - " def split_lognormal(self, labels, num_collaborators):\n", - " from tqdm import trange\n", - " labels = np.argmax(labels, axis=1)\n", - " idx = [[np.nonzero(labels == (col + j) % self.num_classes)[0][np.arange(5) + (col // 10 * 10 + 5 * j)] \\\n", - " for j in range(2)] for col in range(num_collaborators)]\n", - 
" idx = [np.hstack(tup) for tup in idx]\n", - " assert all([len(i) == 10 for i in idx]), 'All collaborators should have 10 elements at this stage'\n", - " props = np.random.lognormal(0, 2.0, (10,100,2))\n", - " props = np.array([[[len(np.nonzero(labels==label)[0])-1000]] for label in range(10)])*props/np.sum(props,(1,2), keepdims=True)\n", - " #idx = 1000*np.ones(10, dtype=np.int64)\n", - " for user in trange(1000):\n", - " for j in range(2):\n", - " l = (user+j)%10\n", - " num_samples = int(props[l,user//10,j])\n", - " if np.count_nonzero(labels[np.hstack(idx)] == l) + num_samples < len(np.nonzero(labels==l)[0]):\n", - " idx_to_append = np.nonzero(labels == (user + j) % 10)[0][np.arange(num_samples) + np.count_nonzero(labels[np.hstack(idx)] == l)]\n", - " idx[user] = np.append(idx[user], idx_to_append)\n", - " return idx\n", - "\n", - "fl_data = UnbalancedFederatedDataset(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.utilities.optimizers.keras import FedProxOptimizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_model(input_shape,\n", - " num_classes,\n", - " **kwargs):\n", - " \"\"\"\n", - " Define the model architecture.\n", - "\n", - " Args:\n", - " input_shape (numpy.ndarray): The shape of the data\n", - " num_classes (int): The number of classes of the dataset\n", - "\n", - " Returns:\n", - " tensorflow.python.keras.engine.sequential.Sequential: The model defined in Keras\n", - "\n", - " \"\"\"\n", - " model = Sequential()\n", - " \n", - " model.add(tf.keras.Input(shape=input_shape))\n", - " model.add(Dense(num_classes, activation='softmax'))\n", - "\n", - " model.compile(loss=keras.losses.categorical_crossentropy,\n", - " optimizer=FedProxOptimizer(mu=1),\n", - " metrics=['accuracy'])\n", - "\n", - " return model " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create a federated model using the build model function and dataset\n", - "fl_model = FederatedModel(build_model, data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. It provides built in federated training and validation functions that we will see used below. Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=1000)\n", - " \n", - "collaborators = {f'col{col}':collaborator_models[col] for col in range(len(collaborator_models))}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Original MNIST dataset\n", - "print(f'Original training data size: {len(train_images)}')\n", - "print(f'Original validation data size: {len(valid_images)}\\n')\n", - "\n", - "#Collaborator one's data\n", - "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.X_train)}')\n", - "print(f'Collaborator one\\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator two's data\n", - "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.X_train)}')\n", - "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator three's data\n", - "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", - "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the current plan values by running the `fx.get_plan()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Get the current values of the plan. Each of these can be overridden\n", - "import json\n", - "print(json.dumps(fx.get_plan(), indent=4, sort_keys=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to run our experiment. 
If we want to pass in custom plan settings, we can easily do that with the `override_config` parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(collaborators,override_config={'aggregator.settings.rounds_to_train':5, 'collaborator.settings.opt_treatment': 'CONTINUE_GLOBAL'})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model and load into keras\n", - "final_fl_model.save_native('final_model')\n", - "model = tf.keras.models.load_model('./final_model')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Test the final model on our test set\n", - "model.evaluate(test_images,test_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "plt.figure(figsize=(9,6), dpi=150)\n", - "plt.title('Keras MNIST unbalanced split')\n", - "plt.plot([0.07627802075538784, 0.07518334008473902, 0.09541350667830556, 0.13141966053564103, 0.15887578643299638], label='FedAvg')\n", - "plt.plot([0.07627802075538784, 0.07518334008473902, 0.09541350667830556, 0.1314459763141349, 0.15887578643299638], linestyle='--', label='FedProx (mu=1e-2)')\n", - "plt.plot([0.07627802075538784, 0.0751056043850258, 0.09555227747093886, 0.131649036151357, 0.15966261748969554], linestyle='--', label='FedProx (mu=1e-1)')\n", - "plt.plot([0.07627802075538784, 0.07517912408802659, 0.09641592293512076, 0.13676991989742965, 0.1684917744528502], linestyle='--', label='FedProx (mu=1e1)')\n", - "\n", - "plt.legend()\n", - "plt.xticks(range(5))\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/deprecated/native_api/Federated_FedProx_PyTorch_MNIST_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_FedProx_PyTorch_MNIST_Tutorial.ipynb deleted file mode 100644 index 8f6c23f6c9..0000000000 --- a/openfl-tutorials/deprecated/native_api/Federated_FedProx_PyTorch_MNIST_Tutorial.ipynb +++ /dev/null @@ -1,523 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated FedProx PyTorch MNIST Tutorial\n", - "The only difference between this notebook and Federated_Pytorch_MNIST_Tutorial.ipynb is overriding of the `train_epoch` function in model definition. 
[See details](#FedProx)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Install dependencies if not already installed\n", - "!pip install torch torchvision" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "import openfl.native as fx\n", - "from openfl.federated import FederatedModel,FederatedDataSet\n", - "import random\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "def set_seed(seed):\n", - " torch.manual_seed(seed)\n", - " torch.cuda.manual_seed_all(seed)\n", - " torch.backends.cudnn.deterministic = True\n", - " torch.backends.cudnn.benchmark = False\n", - " np.random.seed(seed)\n", - " random.seed(seed)\n", - "set_seed(10)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After importing the required packages, the next step is setting up our openfl workspace. To do this, simply run the `fx.init()` command as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Setup default workspace, logging, etc.\n", - "fx.init('torch_cnn_mnist')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to define our dataset and model to perform federated learning on. The dataset should be composed of a numpy arrayWe start with a simple fully connected model that is trained on the MNIST dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def one_hot(labels, classes):\n", - " return np.eye(classes)[labels]\n", - "\n", - "transform = transforms.Compose(\n", - " [transforms.ToTensor(),\n", - " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])\n", - "\n", - "trainset = torchvision.datasets.MNIST(root='./data', train=True,\n", - " download=True, transform=transform)\n", - "\n", - "train_images,train_labels = trainset.train_data, np.array(trainset.train_labels)\n", - "train_images = torch.from_numpy(np.expand_dims(train_images, axis=1)).float()\n", - "train_labels = one_hot(train_labels,10)\n", - "\n", - "validset = torchvision.datasets.MNIST(root='./data', train=False,\n", - " download=True, transform=transform)\n", - "\n", - "valid_images,valid_labels = validset.test_data, np.array(validset.test_labels)\n", - "valid_images = torch.from_numpy(np.expand_dims(valid_images, axis=1)).float()\n", - "valid_labels = one_hot(valid_labels,10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# FedProx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.utilities.optimizers.torch import FedProxOptimizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_shape = train_images.shape[1]\n", - "classes = 10\n", - "\n", - "fl_data = FederatedDataSet(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes)\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 16, 3)\n", - " self.pool = nn.MaxPool2d(2, 2)\n", - " self.conv2 = nn.Conv2d(16, 32, 
3)\n", - " self.fc1 = nn.Linear(32 * 5 * 5, 32)\n", - " self.fc2 = nn.Linear(32, 84)\n", - " self.fc3 = nn.Linear(84, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.pool(F.relu(self.conv1(x)))\n", - " x = self.pool(F.relu(self.conv2(x)))\n", - " x = x.view(x.size(0),-1)\n", - " x = F.relu(self.fc1(x))\n", - " x = F.relu(self.fc2(x))\n", - " x = self.fc3(x)\n", - " return F.log_softmax(x, dim=1)\n", - " \n", - " def train_epoch(self, batch_generator):\n", - " from openfl.federated.task import PyTorchTaskRunner\n", - " self.optimizer.set_old_weights([p for p in self.parameters()])\n", - " return PyTorchTaskRunner.train_epoch(self, batch_generator)\n", - "\n", - " \n", - "optimizer = lambda x: FedProxOptimizer(x, lr=1e-3, mu=0.1)\n", - "\n", - "def cross_entropy(output, target):\n", - " \"\"\"Binary cross-entropy metric\n", - " \"\"\"\n", - " return F.binary_cross_entropy_with_logits(input=output,target=target.float())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "#Create a federated model using the pytorch class, lambda optimizer function, and loss function\n", - "fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy,data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. It provides built in federated training and validation functions that we will see used below. Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=2)\n", - "collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Original MNIST dataset\n", - "print(f'Original training data size: {len(train_images)}')\n", - "print(f'Original validation data size: {len(valid_images)}\\n')\n", - "\n", - "#Collaborator one's data\n", - "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.X_train)}')\n", - "print(f'Collaborator one\\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator two's data\n", - "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.X_train)}')\n", - "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator three's data\n", - "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", - "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the current plan values by running the `fx.get_plan()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " #Get the current values of the plan. 
Each of these can be overridden\n", - "import json\n", - "print(json.dumps(fx.get_plan(), indent=4, sort_keys=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to run our experiment. If we want to pass in custom plan settings, we can easily do that with the `override_config` parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(\n", - " collaborators,\n", - " {\n", - " 'aggregator.settings.rounds_to_train': 5,\n", - " 'collaborator.settings.opt_treatment': 'CONTINUE_GLOBAL',\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model\n", - "final_fl_model.save_native('final_pytorch_model')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# FedProxAdam" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "classes = 10\n", - "collaborator_num = 300\n", - "NUM_USER = collaborator_num\n", - "\n", - "def one_hot(labels, classes):\n", - " return np.eye(classes)[labels]\n", - "\n", - "\n", - "def softmax(x):\n", - " ex = np.exp(x)\n", - " sum_ex = np.sum(np.exp(x))\n", - " return ex/sum_ex\n", - "\n", - "\n", - "def generate_synthetic(alpha, beta, iid):\n", - "\n", - " dimension = 60\n", - " NUM_CLASS = 10\n", - "\n", - " samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50\n", - " num_samples = np.sum(samples_per_user)\n", - "\n", - " X_split = [[] for _ in range(NUM_USER)]\n", - " y_split = [[] for _ in range(NUM_USER)]\n", - "\n", - " #### define some eprior ####\n", - " mean_W = np.random.normal(0, alpha, NUM_USER)\n", - " mean_b = mean_W\n", - " B = np.random.normal(0, beta, NUM_USER)\n", - " mean_x = np.zeros((NUM_USER, dimension))\n", - "\n", - " diagonal = np.zeros(dimension)\n", - " for j in range(dimension):\n", - " diagonal[j] = np.power((j+1), -1.2)\n", - " cov_x = np.diag(diagonal)\n", - "\n", - " for i in range(NUM_USER):\n", - " if iid == 1:\n", - " mean_x[i] = np.ones(dimension) * B[i] # all zeros\n", - " else:\n", - " mean_x[i] = np.random.normal(B[i], 1, dimension)\n", - "\n", - " if iid == 1:\n", - " W_global = np.random.normal(0, 1, (dimension, NUM_CLASS))\n", - " b_global = np.random.normal(0, 1, NUM_CLASS)\n", - "\n", - " for i in range(NUM_USER):\n", - "\n", - " W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS))\n", - " b = np.random.normal(mean_b[i], 1, NUM_CLASS)\n", - "\n", - " if iid == 1:\n", - " W = W_global\n", - " b = b_global\n", - "\n", - " xx = np.random.multivariate_normal(\n", - " mean_x[i], cov_x, samples_per_user[i])\n", - " yy = np.zeros(samples_per_user[i])\n", - "\n", - " for j in range(samples_per_user[i]):\n", - " tmp = np.dot(xx[j], W) + b\n", - " yy[j] = np.argmax(softmax(tmp))\n", - "\n", - " X_split[i] = xx.tolist()\n", - " y_split[i] = yy.tolist()\n", - "\n", - "# print(\"{}-th users has {} exampls\".format(i, len(y_split[i])))\n", - "\n", - " return X_split, y_split\n", - "\n", - "\n", - "class SyntheticFederatedDataset(FederatedDataSet):\n", - " def __init__(self, batch_size=1, num_classes=None, **kwargs):\n", - " X, y = generate_synthetic(0.0, 0.0, 0)\n", - " X = [np.array([np.array(sample).astype(np.float32)\n", - " for sample in col]) for col in X]\n", - " y = [np.array([np.array(one_hot(int(sample), 
classes))\n", - " for sample in col]) for col in y]\n", - " self.X_train_all = np.array([col[:int(0.9 * len(col))] for col in X])\n", - " self.X_valid_all = np.array([col[int(0.9 * len(col)):] for col in X])\n", - " self.y_train_all = np.array([col[:int(0.9 * len(col))] for col in y])\n", - " self.y_valid_all = np.array([col[int(0.9 * len(col)):] for col in y])\n", - " super().__init__(self.X_train_all[0], self.y_train_all[0], self.X_valid_all[0],\n", - " self.y_valid_all[0], batch_size, num_classes)\n", - "\n", - " def split(self, num_collaborators, shuffle=True, equally=False):\n", - " return [\n", - " FederatedDataSet(\n", - " self.X_train_all[i],\n", - " self.y_train_all[i],\n", - " self.X_valid_all[i],\n", - " self.y_valid_all[i],\n", - " batch_size=self.batch_size,\n", - " num_classes=self.num_classes\n", - " ) for i in range(num_collaborators)\n", - " ]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.utilities.optimizers.torch import FedProxAdam " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.linear1 = nn.Linear(60, 100)\n", - " self.linear2 = nn.Linear(100, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.linear1(x)\n", - " x = self.linear2(x)\n", - " return x\n", - "\n", - " def train_epoch(self, batch_generator):\n", - " from openfl.federated.task import PyTorchTaskRunner\n", - " self.optimizer.set_old_weights(\n", - " [p.clone().detach() for p in self.parameters()])\n", - " return PyTorchTaskRunner.train_epoch(self, batch_generator)\n", - "\n", - "\n", - "def optimizer(x): return FedProxAdam(x, lr=1e-3, mu=0.01)\n", - "# optimizer = lambda x: torch.optim.Adam(x, lr=1e-3)\n", - "\n", - "\n", - "def cross_entropy(output, target):\n", - " \"\"\"Binary cross-entropy metric\n", - " \"\"\"\n", - " return F.cross_entropy(output, torch.max(target, 1)[1])\n", - "# return F.binary_cross_entropy_with_logits(input=output,target=target.float())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fl_data = SyntheticFederatedDataset(batch_size=32, num_classes=classes)\n", - "#Create a federated model using the pytorch class, lambda optimizer function, and loss function\n", - "fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy,data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. It provides built in federated training and validation functions that we will see used below. Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=collaborator_num,device='cpu')\n", - "collaborators = {f'col{i}':collaborator_models[i] for i in range(collaborator_num)}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = np.argmax(collaborators['col3'].data_loader.y_valid, axis =1)\n", - "import matplotlib.pyplot as plt\n", - "plt.hist(a)\n", - "collaborator_models[1].data_loader.y_valid.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the current plan values by running the `fx.get_plan()` function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to run our experiment. If we want to pass in custom plan settings, we can easily do that with the `override_config` parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(\n", - " collaborators,\n", - " {\n", - " 'aggregator.settings.rounds_to_train': 20,\n", - " 'collaborator.settings.opt_treatment': 'CONTINUE_GLOBAL',\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model\n", - "final_fl_model.save_native('final_pytorch_model')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/deprecated/native_api/Federated_Keras_MNIST_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_Keras_MNIST_Tutorial.ipynb deleted file mode 100644 index fbdab4b46e..0000000000 --- a/openfl-tutorials/deprecated/native_api/Federated_Keras_MNIST_Tutorial.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated Keras MNIST Tutorial" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#Install Tensorflow and MNIST dataset if not installed\n", - "!pip install tensorflow==2.13\n", - "\n", - "#Alternatively you could use the intel-tensorflow build\n", - "# !pip install intel-tensorflow==2.13" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import tensorflow as tf\n", - "import tensorflow.keras as keras\n", - "from tensorflow.keras import backend as K\n", - "from tensorflow.keras import Sequential\n", - "from tensorflow.keras.layers import Conv2D, Flatten, Dense\n", - "from tensorflow.keras.utils import to_categorical\n", - "from tensorflow.keras.datasets import mnist\n", - "\n", - "import openfl.native as fx\n", - "from openfl.federated import FederatedModel,FederatedDataSet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def test_intel_tensorflow():\n", - " \"\"\"\n", - " Check if Intel 
version of TensorFlow is installed\n", - " \"\"\"\n", - " import tensorflow as tf\n", - "\n", - " print(\"We are using Tensorflow version {}\".format(tf.__version__))\n", - "\n", - " major_version = int(tf.__version__.split(\".\")[0])\n", - " if major_version >= 2:\n", - " from tensorflow.python.util import _pywrap_util_port\n", - " print(\"Intel-optimizations (DNNL) enabled:\",\n", - " _pywrap_util_port.IsMklEnabled())\n", - " else:\n", - " print(\"Intel-optimizations (DNNL) enabled:\")\n", - "\n", - "test_intel_tensorflow()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After importing the required packages, the next step is setting up our openfl workspace. To do this, simply run the `fx.init()` command as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Setup default workspace, logging, etc.\n", - "fx.init('keras_cnn_mnist')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to define our dataset and model to perform federated learning on. The dataset should be composed of a numpy arrayWe start with a simple fully connected model that is trained on the MNIST dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Import and process training, validation, and test images/labels\n", - "\n", - "# Set the ratio of validation imgs, can't be 0.0\n", - "VALID_PERCENT = 0.3\n", - "\n", - "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", - "split_on = int((1 - VALID_PERCENT) * len(X_train))\n", - "\n", - "train_images = X_train[0:split_on,:,:]\n", - "train_labels = to_categorical(y_train)[0:split_on,:]\n", - "\n", - "valid_images = X_train[split_on:,:,:]\n", - "valid_labels = to_categorical(y_train)[split_on:,:]\n", - "\n", - "test_images = X_test\n", - "test_labels = to_categorical(y_test)\n", - "\n", - "def preprocess(images):\n", - " #Normalize\n", - " images = (images / 255) - 0.5\n", - " #Flatten\n", - " images = images.reshape((-1, 784))\n", - " return images\n", - "\n", - "# Preprocess the images.\n", - "train_images = preprocess(train_images)\n", - "valid_images = preprocess(valid_images)\n", - "test_images = preprocess(test_images)\n", - "\n", - "feature_shape = train_images.shape[1]\n", - "classes = 10\n", - "\n", - "fl_data = FederatedDataSet(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes)\n", - "\n", - "def build_model(feature_shape,classes):\n", - " #Defines the MNIST model\n", - " model = Sequential()\n", - " model.add(Dense(64, input_shape=feature_shape, activation='relu'))\n", - " model.add(Dense(64, activation='relu'))\n", - " model.add(Dense(classes, activation='softmax'))\n", - " \n", - " model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'],)\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create a federated model using the build model function and dataset\n", - "fl_model = FederatedModel(build_model,data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. It provides built in federated training and validation functions that we will see used below. 
Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=2)\n", - "collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Original MNIST dataset\n", - "print(f'Original training data size: {len(train_images)}')\n", - "print(f'Original validation data size: {len(valid_images)}\\n')\n", - "\n", - "#Collaborator one's data\n", - "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.X_train)}')\n", - "print(f'Collaborator one\\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator two's data\n", - "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.X_train)}')\n", - "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator three's data\n", - "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", - "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the current plan values by running the `fx.get_plan()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Get the current values of the plan. Each of these can be overridden\n", - "print(fx.get_plan())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to run our experiment. 
If we want to pass in custom plan settings, we can easily do that with the `override_config` parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(collaborators,override_config={'aggregator.settings.rounds_to_train':5})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model and load into keras\n", - "final_fl_model.save_native('final_model')\n", - "model = tf.keras.models.load_model('./final_model')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Test the final model on our test set\n", - "model.evaluate(test_images,test_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/deprecated/native_api/Federated_PyTorch_TinyImageNet.ipynb b/openfl-tutorials/deprecated/native_api/Federated_PyTorch_TinyImageNet.ipynb deleted file mode 100644 index b526806bc3..0000000000 --- a/openfl-tutorials/deprecated/native_api/Federated_PyTorch_TinyImageNet.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated PyTorch TinyImageNet Tutorial" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook is an example of Transfer Learning \n", - "\n", - "Custom DataLoader is used with OpenFL Python API" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Install dependencies if not already installed\n", - "!pip install torch torchvision\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import glob\n", - "from torch.utils.data import Dataset, DataLoader\n", - "from PIL import Image\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "\n", - "import torchvision\n", - "from torchvision import transforms as T\n", - "\n", - "import openfl.native as fx\n", - "from openfl.federated import FederatedModel, FederatedDataSet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After importing the required packages, the next step is setting up our openfl workspace. To do this, simply run the `fx.init()` command as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Setup default workspace, logging, etc.\n", - "fx.init('torch_cnn_mnist')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to define our dataset and model to perform federated learning on. 
The dataset should be composed of a numpy arrayWe start with a simple fully connected model that is trained on the MNIST dataset. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Download the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget --no-clobber http://cs231n.stanford.edu/tiny-imagenet-200.zip\n", - "!unzip -n tiny-imagenet-200.zip\n", - "TINY_IMAGENET_ROOT = './tiny-imagenet-200/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Describe the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class TinyImageNet(Dataset):\n", - " \"\"\"\n", - " Contains 200 classes for training. Each class has 500 images. \n", - " Parameters\n", - " ----------\n", - " root: string\n", - " Root directory including `train` and `val` subdirectories.\n", - " split: string\n", - " Indicating which split to return as a data set.\n", - " Valid option: [`train`, `val`]\n", - " transform: torchvision.transforms\n", - " A (series) of valid transformation(s).\n", - " \"\"\"\n", - " def __init__(self, root, split='train', transform=None, target_transform=None):\n", - " NUM_IMAGES_PER_CLASS = 500\n", - " self.root = os.path.expanduser(root)\n", - " self.transform = transform\n", - " self.target_transform = target_transform\n", - " self.split_dir = os.path.join(self.root, split)\n", - " self.image_paths = sorted(glob.iglob(os.path.join(self.split_dir, '**', '*.JPEG'), recursive=True))\n", - " \n", - " self.labels = {} # fname - label number mapping\n", - "\n", - " # build class label - number mapping\n", - " with open(os.path.join(self.root, 'wnids.txt'), 'r') as fp:\n", - " self.label_texts = sorted([text.strip() for text in fp.readlines()])\n", - " self.label_text_to_number = {text: i for i, text in enumerate(self.label_texts)}\n", - "\n", - " if split == 'train':\n", - " for label_text, i in self.label_text_to_number.items():\n", - " for cnt in range(NUM_IMAGES_PER_CLASS):\n", - " self.labels[f'{label_text}_{cnt}.JPEG'] = i\n", - " elif split == 'val':\n", - " with open(os.path.join(self.split_dir, 'val_annotations.txt'), 'r') as fp:\n", - " for line in fp.readlines():\n", - " terms = line.split('\\t')\n", - " file_name, label_text = terms[0], terms[1]\n", - " self.labels[file_name] = self.label_text_to_number[label_text]\n", - " \n", - " \n", - " def __len__(self):\n", - " return len(self.image_paths)\n", - "\n", - " def __getitem__(self, index):\n", - " file_path = self.image_paths[index]\n", - " label = self.labels[os.path.basename(file_path)]\n", - " label = self.target_transform(label) if self.target_transform else label\n", - " return self.read_image(file_path), label\n", - "\n", - " def read_image(self, path):\n", - " img = Image.open(path)\n", - " return self.transform(img) if self.transform else img\n", - "\n", - "def one_hot(labels, classes):\n", - " return np.eye(classes)[labels]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "normalize = T.Normalize(mean=[0.485, 0.456, 0.406],\n", - " std=[0.229, 0.224, 0.225])\n", - "\n", - "augmentation = T.RandomApply([\n", - " T.RandomHorizontalFlip(),\n", - " T.RandomRotation(10),\n", - " T.RandomResizedCrop(64)], p=.8)\n", - "\n", - "training_transform = T.Compose([\n", - " T.Lambda(lambda x: x.convert(\"RGB\")),\n", - " augmentation,\n", - " T.ToTensor(),\n", - " normalize])\n", 
- "\n", - "valid_transform = T.Compose([\n", - " T.Lambda(lambda x: x.convert(\"RGB\")),\n", - " T.ToTensor(),\n", - " normalize])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Implement Federated dataset\n", - "We have to implement `split` method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.utilities.data_splitters import EqualNumPyDataSplitter\n", - "from torch.utils.data import Subset\n", - "\n", - "\n", - "train_set = TinyImageNet(TINY_IMAGENET_ROOT, 'train', transform=training_transform)\n", - "valid_set = TinyImageNet(TINY_IMAGENET_ROOT, 'val', transform=valid_transform, \\\n", - " target_transform=lambda target: one_hot(target, 200))\n", - "\n", - "class TinyImageNetFederatedDataset(DataLoader):\n", - " def __init__(self, train_set, valid_set, batch_size):\n", - " self.data_splitter = EqualNumPyDataSplitter()\n", - " self.train_set = train_set\n", - " self.valid_set = valid_set\n", - " self.batch_size = batch_size\n", - " \n", - " def split(self, num_collaborators):\n", - " train_split = self.data_splitter.split([label for _, label in self.train_set], num_collaborators)\n", - " valid_split = self.data_splitter.split([label for _, label in self.valid_set], num_collaborators)\n", - " return [\n", - " TinyImageNetFederatedDataset(\n", - " Subset(self.train_set, train_split[i]),\n", - " Subset(self.valid_set, valid_split[i]),\n", - " self.batch_size\n", - " )\n", - " for i in range(num_collaborators)\n", - " ]\n", - " \n", - " def get_feature_shape(self):\n", - " return self.train_set[0][0].shape\n", - " \n", - " def get_train_loader(self, num_batches=None):\n", - " return DataLoader(self.train_set, batch_size=self.batch_size)\n", - " \n", - " def get_valid_loader(self):\n", - " return DataLoader(self.valid_set)\n", - " \n", - " def get_train_data_size(self):\n", - " return len(self.train_set)\n", - " \n", - " def get_valid_data_size(self):\n", - " return len(self.valid_set)\n", - " \n", - "fl_data = TinyImageNetFederatedDataset(train_set, valid_set, batch_size=32)\n", - "\n", - "num_classes = 200" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Define model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.model = torchvision.models.mobilenet_v2(pretrained=True)\n", - " self.model.requires_grad_(False)\n", - " self.model.classifier[1] = torch.nn.Linear(in_features=1280, \\\n", - " out_features=num_classes, bias=True)\n", - "\n", - " def forward(self, x):\n", - " x = self.model.forward(x)\n", - " return x\n", - "\n", - " \n", - "optimizer = lambda x: optim.Adam(x, lr=1e-4)\n", - "\n", - "def cross_entropy(output, target):\n", - " \"\"\"Binary cross-entropy metric\n", - " \"\"\"\n", - " return F.cross_entropy(input=output,target=target)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create a federated model using the pytorch class, lambda optimizer function, and loss function\n", - "fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy, \\\n", - " data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. 
It provides built in federated training and validation functions that we will see used below. Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=10)\n", - "collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Original TinyImageNet dataset\n", - "print(f'Original training data size: {len(fl_data.train_set)}')\n", - "print(f'Original validation data size: {len(fl_data.valid_set)}\\n')\n", - "\n", - "#Collaborator one's data\n", - "for i, model in enumerate(collaborator_models):\n", - " print(f'Collaborator {i}\\'s training data size: {len(model.data_loader.train_set)}')\n", - " print(f'Collaborator {i}\\'s validation data size: {len(model.data_loader.valid_set)}\\n')\n", - "\n", - "#Collaborator three's data\n", - "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", - "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(collaborators,{'aggregator.settings.rounds_to_train':10})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model\n", - "final_fl_model.save_native('final_model.pth')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/deprecated/native_api/Federated_PyTorch_UNET_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_PyTorch_UNET_Tutorial.ipynb deleted file mode 100644 index 7ee6c2e692..0000000000 --- a/openfl-tutorials/deprecated/native_api/Federated_PyTorch_UNET_Tutorial.ipynb +++ /dev/null @@ -1,545 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated PyTorch UNET Tutorial" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Install dependencies if not already installed\n", - "!pip install torch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First of all we need to set up our OpenFL workspace. To do this, simply run the `fx.init()` command as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import openfl.native as fx\n", - "\n", - "# Setup default workspace, logging, etc. 
Install additional requirements\n", - "fx.init('torch_unet_kvasir')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import installed modules\n", - "import PIL\n", - "import json\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "import numpy as np\n", - "from skimage import io\n", - "from torchvision import transforms as tsf\n", - "import matplotlib.pyplot as plt\n", - "from torch.utils.data import Dataset, DataLoader\n", - "\n", - "from os import listdir\n", - "\n", - "from openfl.federated import FederatedModel, FederatedDataSet\n", - "from openfl.utilities import TensorKey\n", - "from openfl.utilities import validate_file_hash" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Download Kvasir dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget 'https://datasets.simula.no/downloads/hyper-kvasir/hyper-kvasir-segmented-images.zip' -O kvasir.zip\n", - "ZIP_SHA384 = ('66cd659d0e8afd8c83408174'\n", - " '1ade2b75dada8d4648b816f2533c8748b1658efa3d49e205415d4116faade2c5810e241e')\n", - "validate_file_hash('./kvasir.zip', ZIP_SHA384)\n", - "!unzip -n kvasir.zip -d ./data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to define our dataset and model to perform federated learning on." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = './data/segmented-images/'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def read_data(image_path, mask_path):\n", - " \"\"\"\n", - " Read image and mask from disk.\n", - " \"\"\"\n", - " img = io.imread(image_path)\n", - " assert(img.shape[2] == 3)\n", - " mask = io.imread(mask_path)\n", - " return (img, mask[:, :, 0].astype(np.uint8))\n", - "\n", - "\n", - "class KvasirDataset(Dataset):\n", - " \"\"\"\n", - " Kvasir dataset contains 1000 images for all collaborators.\n", - " Args:\n", - " data_path: path to dataset on disk\n", - " collaborator_count: total number of collaborators\n", - " collaborator_num: number of current collaborator\n", - " is_validation: validation option\n", - " \"\"\"\n", - "\n", - " def __init__(self, data_path, collaborator_count, collaborator_num, is_validation):\n", - " self.images_path = './data/segmented-images/images/'\n", - " self.masks_path = './data/segmented-images/masks/'\n", - " self.images_names = [\n", - " img_name\n", - " for img_name in sorted(listdir(self.images_path))\n", - " if len(img_name) > 3 and img_name[-3:] == 'jpg'\n", - " ]\n", - "\n", - " self.images_names = self.images_names[collaborator_num:: collaborator_count]\n", - " self.is_validation = is_validation\n", - " assert(len(self.images_names) > 8)\n", - " validation_size = len(self.images_names) // 8\n", - " if is_validation:\n", - " self.images_names = self.images_names[-validation_size:]\n", - " else:\n", - " self.images_names = self.images_names[: -validation_size]\n", - "\n", - " self.img_trans = tsf.Compose([\n", - " tsf.ToPILImage(),\n", - " tsf.Resize((332, 332)),\n", - " tsf.ToTensor(),\n", - " tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])\n", - " self.mask_trans = tsf.Compose([\n", - " tsf.ToPILImage(),\n", - " tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST),\n", - " tsf.ToTensor()])\n", - "\n", - " 
def __getitem__(self, index):\n", - " name = self.images_names[index]\n", - " img, mask = read_data(self.images_path + name, self.masks_path + name)\n", - " img = self.img_trans(img).numpy()\n", - " mask = self.mask_trans(mask).numpy()\n", - " return img, mask\n", - "\n", - " def __len__(self):\n", - " return len(self.images_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we redefine `FederatedDataSet` methods, if we don't want to use default batch generator from `FederatedDataSet`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class KvasirFederatedDataset(FederatedDataSet):\n", - " def __init__(self, collaborator_count=1, collaborator_num=0, batch_size=1, **kwargs):\n", - " \"\"\"Instantiate the data object\n", - " Args:\n", - " collaborator_count: total number of collaborators\n", - " collaborator_num: number of current collaborator\n", - " batch_size: the batch size of the data loader\n", - " **kwargs: additional arguments, passed to super init\n", - " \"\"\"\n", - " super().__init__([], [], [], [], batch_size, num_classes=2, **kwargs)\n", - "\n", - " self.collaborator_num = int(collaborator_num)\n", - "\n", - " self.batch_size = batch_size\n", - "\n", - " self.training_set = KvasirDataset(\n", - " DATA_PATH, collaborator_count, collaborator_num, is_validation=False\n", - " )\n", - " self.valid_set = KvasirDataset(\n", - " DATA_PATH, collaborator_count, collaborator_num, is_validation=True\n", - " )\n", - "\n", - " self.train_loader = self.get_train_loader()\n", - " self.val_loader = self.get_valid_loader()\n", - "\n", - " def get_valid_loader(self, num_batches=None):\n", - " return DataLoader(self.valid_set, num_workers=8, batch_size=self.batch_size)\n", - "\n", - " def get_train_loader(self, num_batches=None):\n", - " return DataLoader(\n", - " self.training_set, num_workers=8, batch_size=self.batch_size, shuffle=True\n", - " )\n", - "\n", - " def get_train_data_size(self):\n", - " return len(self.training_set)\n", - "\n", - " def get_valid_data_size(self):\n", - " return len(self.valid_set)\n", - "\n", - " def get_feature_shape(self):\n", - " return self.valid_set[0][0].shape\n", - "\n", - " def split(self, collaborator_count, shuffle=True, equally=True):\n", - " return [\n", - " KvasirFederatedDataset(collaborator_count,\n", - " collaborator_num, self.batch_size)\n", - " for collaborator_num in range(collaborator_count)\n", - " ]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our Unet model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def soft_dice_loss(output, target):\n", - " num = target.size(0)\n", - " m1 = output.view(num, -1)\n", - " m2 = target.view(num, -1)\n", - " intersection = m1 * m2\n", - " score = 2.0 * (intersection.sum(1) + 1) / (m1.sum(1) + m2.sum(1) + 1)\n", - " score = 1 - score.sum() / num\n", - " return score\n", - "\n", - "\n", - "def soft_dice_coef(output, target):\n", - " num = target.size(0)\n", - " m1 = output.view(num, -1)\n", - " m2 = target.view(num, -1)\n", - " intersection = m1 * m2\n", - " score = 2.0 * (intersection.sum(1) + 1) / (m1.sum(1) + m2.sum(1) + 1)\n", - " return score.sum()\n", - "\n", - "\n", - "class DoubleConv(nn.Module):\n", - " def __init__(self, in_ch, out_ch):\n", - " super(DoubleConv, self).__init__()\n", - " self.in_ch = in_ch\n", - " self.out_ch = out_ch\n", - " self.conv = nn.Sequential(\n", - " nn.Conv2d(in_ch, out_ch, 3, 
padding=1),\n", - " nn.BatchNorm2d(out_ch),\n", - " nn.ReLU(inplace=True),\n", - " nn.Conv2d(out_ch, out_ch, 3, padding=1),\n", - " nn.BatchNorm2d(out_ch),\n", - " nn.ReLU(inplace=True),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.conv(x)\n", - " return x\n", - "\n", - "\n", - "class Down(nn.Module):\n", - " def __init__(self, in_ch, out_ch):\n", - " super(Down, self).__init__()\n", - " self.mpconv = nn.Sequential(\n", - " nn.MaxPool2d(2),\n", - " DoubleConv(in_ch, out_ch)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.mpconv(x)\n", - " return x\n", - "\n", - "\n", - "class Up(nn.Module):\n", - " def __init__(self, in_ch, out_ch, bilinear=False):\n", - " super(Up, self).__init__()\n", - " self.in_ch = in_ch\n", - " self.out_ch = out_ch\n", - " if bilinear:\n", - " self.Up = nn.Upsample(\n", - " scale_factor=2,\n", - " mode=\"bilinear\",\n", - " align_corners=True\n", - " )\n", - " else:\n", - " self.Up = nn.ConvTranspose2d(in_ch, in_ch // 2, 2, stride=2)\n", - " self.conv = DoubleConv(in_ch, out_ch)\n", - "\n", - " def forward(self, x1, x2):\n", - " x1 = self.Up(x1)\n", - " diffY = x2.size()[2] - x1.size()[2]\n", - " diffX = x2.size()[3] - x1.size()[3]\n", - "\n", - " x1 = F.pad(x1, (diffX // 2, diffX - diffX //\n", - " 2, diffY // 2, diffY - diffY // 2))\n", - "\n", - " x = torch.cat([x2, x1], dim=1)\n", - " x = self.conv(x)\n", - " return x\n", - "\n", - "\n", - "class UNet(nn.Module):\n", - " def __init__(self, n_channels=3, n_classes=1):\n", - " super().__init__()\n", - " self.inc = DoubleConv(n_channels, 64)\n", - " self.down1 = Down(64, 128)\n", - " self.down2 = Down(128, 256)\n", - " self.down3 = Down(256, 512)\n", - " self.down4 = Down(512, 1024)\n", - " self.up1 = Up(1024, 512)\n", - " self.up2 = Up(512, 256)\n", - " self.up3 = Up(256, 128)\n", - " self.up4 = Up(128, 64)\n", - " self.outc = nn.Conv2d(64, n_classes, 1)\n", - "\n", - " def forward(self, x):\n", - " x1 = self.inc(x)\n", - " x2 = self.down1(x1)\n", - " x3 = self.down2(x2)\n", - " x4 = self.down3(x3)\n", - " x5 = self.down4(x4)\n", - " x = self.up1(x5, x4)\n", - " x = self.up2(x, x3)\n", - " x = self.up3(x, x2)\n", - " x = self.up4(x, x1)\n", - " x = self.outc(x)\n", - " x = torch.sigmoid(x)\n", - " return x\n", - "\n", - " def validate(\n", - " self, col_name, round_num, input_tensor_dict, use_tqdm=False, **kwargs\n", - " ):\n", - " \"\"\" Validate. 
Redefine function from PyTorchTaskRunner to use our validation.\"\"\"\n",
-    "        self.rebuild_model(round_num, input_tensor_dict, validation=True)\n",
-    "        self.eval()\n",
-    "        self.to(self.device)\n",
-    "        val_score = 0\n",
-    "        total_samples = 0\n",
-    "\n",
-    "        loader = self.data_loader.get_valid_loader()\n",
-    "        if use_tqdm:\n",
-    "            loader = tqdm.tqdm(loader, desc=\"validate\")\n",
-    "\n",
-    "        with torch.no_grad():\n",
-    "            for data, target in loader:\n",
-    "                samples = target.shape[0]\n",
-    "                total_samples += samples\n",
-    "                data, target = (\n",
-    "                    torch.tensor(data).to(self.device),\n",
-    "                    torch.tensor(target).to(self.device),\n",
-    "                )\n",
-    "                output = self(data)\n",
-    "                # accumulate the soft Dice coefficient over the batch\n",
-    "                val = soft_dice_coef(output, target)\n",
-    "                val_score += val.sum().cpu().numpy()\n",
-    "\n",
-    "        origin = col_name\n",
-    "        suffix = \"validate\"\n",
-    "        if kwargs[\"apply\"] == \"local\":\n",
-    "            suffix += \"_local\"\n",
-    "        else:\n",
-    "            suffix += \"_agg\"\n",
-    "        tags = (\"metric\", suffix)\n",
-    "        output_tensor_dict = {\n",
-    "            TensorKey(\"dice_coef\", origin, round_num, True, tags): np.array(\n",
-    "                val_score / total_samples\n",
-    "            )\n",
-    "        }\n",
-    "        return output_tensor_dict, {}\n",
-    "\n",
-    "\n",
-    "def optimizer(x): return optim.Adam(x, lr=1e-3)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create the `KvasirFederatedDataset`; federated datasets for the collaborators will be created in the `split()` method of this object"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fl_data = KvasirFederatedDataset(batch_size=6)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The `FederatedModel` object is a wrapper around your Keras, TensorFlow or PyTorch model that makes it compatible with OpenFL. It provides a built-in federated training function that will be used during training. Using its `setup` function, collaborator models and datasets can be automatically obtained for the experiment. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create a federated model using the pytorch class, optimizer function, and loss function\n",
-    "fl_model = FederatedModel(build_model=UNet, optimizer=optimizer,\n",
-    "                          loss_fn=soft_dice_loss, data_loader=fl_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "collaborator_models = fl_model.setup(num_collaborators=2)\n",
-    "collaborators = {'one': collaborator_models[0], 'two': collaborator_models[1]}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can see the current FL plan values by running the `fx.get_plan()` function"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get the current values of the FL plan. Each of these can be overridden\n",
-    "print(fx.get_plan())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we are ready to run our experiment. 
If we want to pass in custom FL plan settings, we can easily do that with the `override_config` parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(\n", - " collaborators, override_config={'aggregator.settings.rounds_to_train': 30})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Save final model\n", - "final_fl_model.save_native('final_pytorch_model')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's visually evaluate the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator = collaborator_models[0]\n", - "loader = collaborator.runner.data_loader.get_valid_loader()\n", - "model = final_fl_model.model\n", - "model.eval()\n", - "device = final_fl_model.runner.device\n", - "model.to(device)\n", - "with torch.no_grad():\n", - " for batch, _ in zip(loader, range(5)):\n", - " preds = model(batch[0].to(device))\n", - " for image, pred, target in zip(batch[0], preds, batch[1]):\n", - " plt.figure(figsize=(10, 10))\n", - " plt.subplot(131)\n", - " plt.imshow(image.permute(1, 2, 0).data.cpu().numpy() * 0.5 + 0.5)\n", - " plt.title(\"img\")\n", - " plt.subplot(132)\n", - " plt.imshow(pred[0].data.cpu().numpy())\n", - " plt.title(\"pred\")\n", - " plt.subplot(133)\n", - " plt.imshow(target[0].data.cpu().numpy())\n", - " plt.title(\"targ\")\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.8.10" - }, - "vscode": { - "interpreter": { - "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_Tutorial.ipynb deleted file mode 100644 index 10e83949df..0000000000 --- a/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_Tutorial.ipynb +++ /dev/null @@ -1,267 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated PyTorch MNIST Tutorial" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Install dependencies if not already installed\n", - "!pip install torch torchvision" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "import openfl.native as fx\n", - "from openfl.federated import FederatedModel,FederatedDataSet\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After importing the required packages, the next step is setting up our openfl workspace. 
To do this, simply run the `fx.init()` command as follows:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Setup default workspace, logging, etc.\n",
-    "fx.init('torch_cnn_mnist', log_level='METRIC', log_file='./spam_metric.log')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we are ready to define our dataset and model to perform federated learning on. The dataset should be composed of NumPy arrays. We start with a simple convolutional model that is trained on the MNIST dataset. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "transform = transforms.Compose(\n",
-    "    [transforms.ToTensor(),\n",
-    "     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])\n",
-    "\n",
-    "trainset = torchvision.datasets.MNIST(root='./data', train=True,\n",
-    "                                      download=True, transform=transform)\n",
-    "\n",
-    "train_images,train_labels = trainset.data, np.array(trainset.targets)\n",
-    "train_images = torch.from_numpy(np.expand_dims(train_images, axis=1)).float()\n",
-    "\n",
-    "validset = torchvision.datasets.MNIST(root='./data', train=False,\n",
-    "                                      download=True, transform=transform)\n",
-    "\n",
-    "valid_images,valid_labels = validset.data, np.array(validset.targets)\n",
-    "valid_images = torch.from_numpy(np.expand_dims(valid_images, axis=1)).float()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "feature_shape = train_images.shape[1]\n",
-    "classes = 10\n",
-    "\n",
-    "fl_data = FederatedDataSet(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes)\n",
-    "\n",
-    "class Net(nn.Module):\n",
-    "    def __init__(self):\n",
-    "        super(Net, self).__init__()\n",
-    "        self.conv1 = nn.Conv2d(1, 16, 3)\n",
-    "        self.pool = nn.MaxPool2d(2, 2)\n",
-    "        self.conv2 = nn.Conv2d(16, 32, 3)\n",
-    "        self.fc1 = nn.Linear(32 * 5 * 5, 32)\n",
-    "        self.fc2 = nn.Linear(32, 84)\n",
-    "        self.fc3 = nn.Linear(84, 10)\n",
-    "\n",
-    "    def forward(self, x):\n",
-    "        x = self.pool(F.relu(self.conv1(x)))\n",
-    "        x = self.pool(F.relu(self.conv2(x)))\n",
-    "        x = x.view(x.size(0),-1)\n",
-    "        x = F.relu(self.fc1(x))\n",
-    "        x = F.relu(self.fc2(x))\n",
-    "        x = self.fc3(x)\n",
-    "        return F.log_softmax(x, dim=1)\n",
-    "    \n",
-    "optimizer = lambda x: optim.Adam(x, lr=1e-4)\n",
-    "\n",
-    "def cross_entropy(output, target):\n",
-    "    \"\"\"Cross-entropy metric\n",
-    "    \"\"\"\n",
-    "    return F.cross_entropy(input=output,target=target)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here we can define a metric logging function. It should have the signature described below. You can use it to write metrics to TensorBoard or any other logging tool."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from torch.utils.tensorboard import SummaryWriter\n",
-    "\n",
-    "writer = SummaryWriter('./logs/cnn_mnist', flush_secs=5)\n",
-    "\n",
-    "\n",
-    "def write_metric(node_name, task_name, metric_name, metric, round_number):\n",
-    "    writer.add_scalar(\"{}/{}/{}\".format(node_name, task_name, metric_name),\n",
-    "                      metric, round_number)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "#Create a federated model using the pytorch class, lambda optimizer function, and loss function\n",
-    "fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy,data_loader=fl_data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The `FederatedModel` object is a wrapper around your Keras, TensorFlow or PyTorch model that makes it compatible with OpenFL. It provides built-in federated training and validation functions that we will see used below. Using its `setup` function, collaborator models and datasets can be automatically defined for the experiment. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "collaborator_models = fl_model.setup(num_collaborators=2)\n",
-    "collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Original MNIST dataset\n",
-    "print(f'Original training data size: {len(train_images)}')\n",
-    "print(f'Original validation data size: {len(valid_images)}\\n')\n",
-    "\n",
-    "#Collaborator one's data\n",
-    "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.X_train)}')\n",
-    "print(f'Collaborator one\\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\\n')\n",
-    "\n",
-    "#Collaborator two's data\n",
-    "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.X_train)}')\n",
-    "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\\n')\n",
-    "\n",
-    "#Collaborator three's data\n",
-    "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n",
-    "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can see the current plan values by running the `fx.get_plan()` function"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Get the current values of the plan. Each of these can be overridden\n",
-    "print(fx.get_plan())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we are ready to run our experiment. 
If we want to pass in custom plan settings, we can easily do that with the `override_config` parameter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run experiment, return trained FederatedModel\n",
-    "\n",
-    "final_fl_model = fx.run_experiment(collaborators, override_config={\n",
-    "    'aggregator.settings.rounds_to_train': 5,\n",
-    "    'aggregator.settings.log_metric_callback': write_metric,\n",
-    "})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Save final model\n",
-    "final_fl_model.save_native('final_pytorch_model')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_custom_aggregation_Tutorial.ipynb b/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_custom_aggregation_Tutorial.ipynb
deleted file mode 100644
index 337ceb3259..0000000000
--- a/openfl-tutorials/deprecated/native_api/Federated_Pytorch_MNIST_custom_aggregation_Tutorial.ipynb
+++ /dev/null
@@ -1,708 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Federated PyTorch MNIST Tutorial"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Install dependencies if not already installed\n",
-    "!pip install torch torchvision"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "import torch.nn.functional as F\n",
-    "import torch.optim as optim\n",
-    "\n",
-    "import torchvision\n",
-    "import torchvision.transforms as transforms\n",
-    "import openfl.native as fx\n",
-    "from openfl.federated import FederatedModel,FederatedDataSet\n",
-    "\n",
-    "torch.manual_seed(0)\n",
-    "np.random.seed(0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "After importing the required packages, the next step is setting up our OpenFL workspace. To do this, simply run the `fx.init()` command as follows:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Setup default workspace, logging, etc.\n",
-    "fx.init('torch_cnn_mnist')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we are ready to define our dataset and model to perform federated learning on. The dataset should be composed of NumPy arrays. We start with a simple convolutional model that is trained on the MNIST dataset. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def one_hot(labels, classes):\n", - " return np.eye(classes)[labels]\n", - "\n", - "transform = transforms.Compose(\n", - " [transforms.ToTensor(),\n", - " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])\n", - "\n", - "trainset = torchvision.datasets.MNIST(root='./data', train=True,\n", - " download=True, transform=transform)\n", - "\n", - "train_images,train_labels = trainset.train_data, np.array(trainset.train_labels)\n", - "train_images = torch.from_numpy(np.expand_dims(train_images, axis=1)).float()\n", - "train_labels = one_hot(train_labels,10)\n", - "\n", - "validset = torchvision.datasets.MNIST(root='./data', train=False,\n", - " download=True, transform=transform)\n", - "\n", - "valid_images,valid_labels = validset.test_data, np.array(validset.test_labels)\n", - "valid_images = torch.from_numpy(np.expand_dims(valid_images, axis=1)).float()\n", - "valid_labels = one_hot(valid_labels,10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_shape = train_images.shape[1]\n", - "classes = 10\n", - "\n", - "fl_data = FederatedDataSet(train_images,train_labels,valid_images,valid_labels,batch_size=32,num_classes=classes)\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 16, 3)\n", - " self.pool = nn.MaxPool2d(2, 2)\n", - " self.conv2 = nn.Conv2d(16, 32, 3)\n", - " self.fc1 = nn.Linear(32 * 5 * 5, 32)\n", - " self.fc2 = nn.Linear(32, 84)\n", - " self.fc3 = nn.Linear(84, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.pool(F.relu(self.conv1(x)))\n", - " x = self.pool(F.relu(self.conv2(x)))\n", - " x = x.view(x.size(0),-1)\n", - " x = F.relu(self.fc1(x))\n", - " x = F.relu(self.fc2(x))\n", - " x = self.fc3(x)\n", - " return x\n", - " \n", - "optimizer = lambda x: optim.Adam(x, lr=1e-4)\n", - "\n", - "def cross_entropy(output, target):\n", - " \"\"\"Binary cross-entropy metric\n", - " \"\"\"\n", - " return F.cross_entropy(input=output,target=torch.argmax(target, dim=1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "#Create a federated model using the pytorch class, lambda optimizer function, and loss function\n", - "fl_model = FederatedModel(build_model=Net,optimizer=optimizer,loss_fn=cross_entropy,data_loader=fl_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `FederatedModel` object is a wrapper around your Keras, Tensorflow or PyTorch model that makes it compatible with openfl. It provides built in federated training and validation functions that we will see used below. Using it's `setup` function, collaborator models and datasets can be automatically defined for the experiment. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collaborator_models = fl_model.setup(num_collaborators=10)\n", - "collaborators = {str(i): collaborator_models[i] for i in range(10)}#, 'three':collaborator_models[2]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Original MNIST dataset\n", - "print(f'Original training data size: {len(train_images)}')\n", - "print(f'Original validation data size: {len(valid_images)}\\n')\n", - "\n", - "#Collaborator one's data\n", - "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.X_train)}')\n", - "print(f'Collaborator one\\'s validation data size: {len(collaborator_models[0].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator two's data\n", - "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.X_train)}')\n", - "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.X_valid)}\\n')\n", - "\n", - "#Collaborator three's data\n", - "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", - "#print(f'Collaborator three\\'s validation data size: {len(collaborator_models[2].data_loader.X_valid)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the current plan values by running the `fx.get_plan()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " #Get the current values of the plan. Each of these can be overridden\n", - "print(fx.get_plan())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.interface.aggregation_functions import AggregationFunction\n", - "import numpy as np\n", - "\n", - "class ExponentialSmoothingAveraging(AggregationFunction):\n", - " \"\"\"\n", - " Averaging via exponential smoothing.\n", - " \n", - " In order to use this mechanism properly you should specify `aggregator.settings.db_store_rounds` \n", - " in `override_config` keyword argument of `run_experiment` function. \n", - " It should be equal to the number of rounds you want to include in smoothing window.\n", - " \n", - " Args:\n", - " alpha(float): Smoothing term.\n", - " \"\"\"\n", - " def __init__(self, alpha=0.9):\n", - " self.alpha = alpha\n", - " \n", - " def call(self,\n", - " local_tensors,\n", - " db_iterator,\n", - " tensor_name,\n", - " fl_round,\n", - " tags):\n", - " \"\"\"Aggregate tensors.\n", - "\n", - " Args:\n", - " local_tensors(list[openfl.utilities.LocalTensor]): List of local tensors to aggregate.\n", - " db_iterator: iterator over history of all tensors. Columns:\n", - " - 'tensor_name': name of the tensor.\n", - " Examples for `torch.nn.Module`s: 'conv1.weight', 'fc2.bias'.\n", - " - 'round': 0-based number of round corresponding to this tensor.\n", - " - 'tags': tuple of tensor tags. 
Tags that can appear:\n", - " - 'model' indicates that the tensor is a model parameter.\n", - " - 'trained' indicates that tensor is a part of a training result.\n", - " These tensors are passed to the aggregator node after local learning.\n", - " - 'aggregated' indicates that tensor is a result of aggregation.\n", - " These tensors are sent to collaborators for the next round.\n", - " - 'delta' indicates that value is a difference between rounds\n", - " for a specific tensor.\n", - " also one of the tags is a collaborator name\n", - " if it corresponds to a result of a local task.\n", - "\n", - " - 'nparray': value of the tensor.\n", - " tensor_name: name of the tensor\n", - " fl_round: round number\n", - " tags: tuple of tags for this tensor\n", - " Returns:\n", - " np.ndarray: aggregated tensor\n", - " \"\"\"\n", - " tensors, weights = zip(*[(x.tensor, x.weight) for x in local_tensors])\n", - " tensors, weights = np.array(tensors), np.array(weights)\n", - " average = np.average(tensors, weights=weights, axis=0)\n", - " previous_tensor_values = []\n", - " for record in db_iterator:\n", - " if (\n", - " record['tensor_name'] == tensor_name\n", - " and 'aggregated' in record['tags']\n", - " and 'delta' not in record['tags']\n", - " ):\n", - " previous_tensor_values.append(record['nparray'])\n", - " for i, x in enumerate(previous_tensor_values):\n", - " previous_tensor_values[i] = x * self.alpha * (1 - self.alpha) ** i\n", - " smoothing_term = np.sum(previous_tensor_values, axis=0)\n", - " return self.alpha * average + (1 - self.alpha) * smoothing_term" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.interface.aggregation_functions import AggregationFunction\n", - "import numpy as np\n", - "\n", - "class ClippedAveraging(AggregationFunction):\n", - " def __init__(self, ratio):\n", - " \"\"\"Average clipped tensors.\n", - " \n", - " Args:\n", - " ratio(float): Ratio to multiply with a tensor for clipping\n", - " \"\"\"\n", - " self.ratio = ratio\n", - " \n", - " def call(self,\n", - " local_tensors,\n", - " db_iterator,\n", - " tensor_name,\n", - " fl_round,\n", - " *__):\n", - " \"\"\"Aggregate tensors.\n", - "\n", - " Args:\n", - " local_tensors(list[openfl.utilities.LocalTensor]): List of local tensors to aggregate.\n", - " db_iterator: iterator over history of all tensors. Columns:\n", - " - 'tensor_name': name of the tensor.\n", - " Examples for `torch.nn.Module`s: 'conv1.weight', 'fc2.bias'.\n", - " - 'round': 0-based number of round corresponding to this tensor.\n", - " - 'tags': tuple of tensor tags. 
Tags that can appear:\n", - " - 'model' indicates that the tensor is a model parameter.\n", - " - 'trained' indicates that tensor is a part of a training result.\n", - " These tensors are passed to the aggregator node after local learning.\n", - " - 'aggregated' indicates that tensor is a result of aggregation.\n", - " These tensors are sent to collaborators for the next round.\n", - " - 'delta' indicates that value is a difference between rounds\n", - " for a specific tensor.\n", - " also one of the tags is a collaborator name\n", - " if it corresponds to a result of a local task.\n", - "\n", - " - 'nparray': value of the tensor.\n", - " tensor_name: name of the tensor\n", - " fl_round: round number\n", - " tags: tuple of tags for this tensor\n", - " Returns:\n", - " np.ndarray: aggregated tensor\n", - " \"\"\"\n", - " clipped_tensors = []\n", - " previous_tensor_value = None\n", - " for record in db_iterator:\n", - " if (\n", - " record['round'] == (fl_round - 1)\n", - " and record['tensor_name'] == tensor_name\n", - " and record['tags'] == ('trained',)\n", - " ):\n", - " previous_tensor_value = record['nparray']\n", - " weights = []\n", - " for local_tensor in local_tensors:\n", - " prev_tensor = previous_tensor_value if previous_tensor_value is not None else local_tensor.tensor\n", - " delta = local_tensor.tensor - prev_tensor\n", - " new_tensor = prev_tensor + delta * self.ratio\n", - " clipped_tensors.append(new_tensor)\n", - " weights.append(local_tensor.weight)\n", - "\n", - " return np.average(clipped_tensors, weights=weights, axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.interface.aggregation_functions import AggregationFunction\n", - "\n", - "class ConditionalThresholdAveraging(AggregationFunction):\n", - " def __init__(self, threshold_fn, metric_name='acc', tags=['metric', 'validate_local']):\n", - " \"\"\"Average tensors by metric value on previous round.\n", - " If no tensors match threshold condition, a simple weighted averaging will be performed.\n", - " \n", - " Args:\n", - " threshold_fn(callable): function to define a threshold for each round.\n", - " Has single argument `round_number`. \n", - " Returns threshold value above which collaborators are allowed to participate in aggregation.\n", - " metric_name(str): name of the metric to trace. Can be either 'acc' or 'loss'.\n", - " tags(Tuple[str]): tags of the metric tensor.\n", - " \"\"\"\n", - " self.metric_name = metric_name\n", - " self.threshold_fn = threshold_fn\n", - " self.tags = tags\n", - " self.logged_round = -1\n", - " \n", - " def call(self,\n", - " local_tensors,\n", - " db_iterator,\n", - " tensor_name,\n", - " fl_round,\n", - " *__):\n", - " \"\"\"Aggregate tensors.\n", - "\n", - " Args:\n", - " local_tensors(list[openfl.utilities.LocalTensor]): List of local tensors to aggregate.\n", - " db_iterator: iterator over history of all tensors. Columns:\n", - " - 'tensor_name': name of the tensor.\n", - " Examples for `torch.nn.Module`s: 'conv1.weight', 'fc2.bias'.\n", - " - 'round': 0-based number of round corresponding to this tensor.\n", - " - 'tags': tuple of tensor tags. 
Tags that can appear:\n", - " - 'model' indicates that the tensor is a model parameter.\n", - " - 'trained' indicates that tensor is a part of a training result.\n", - " These tensors are passed to the aggregator node after local learning.\n", - " - 'aggregated' indicates that tensor is a result of aggregation.\n", - " These tensors are sent to collaborators for the next round.\n", - " - 'delta' indicates that value is a difference between rounds\n", - " for a specific tensor.\n", - " also one of the tags is a collaborator name\n", - " if it corresponds to a result of a local task.\n", - "\n", - " - 'nparray': value of the tensor.\n", - " tensor_name: name of the tensor\n", - " fl_round: round number\n", - " tags: tuple of tags for this tensor\n", - " Returns:\n", - " np.ndarray: aggregated tensor\n", - " \"\"\"\n", - " selected_tensors = []\n", - " selected_weights = []\n", - " for record in db_iterator:\n", - " for local_tensor in local_tensors:\n", - " tags = set(self.tags + [local_tensor.col_name])\n", - " if (\n", - " tags <= set(record['tags']) \n", - " and record['round'] == fl_round\n", - " and record['tensor_name'] == self.metric_name\n", - " and record['nparray'] >= self.threshold_fn(fl_round)\n", - " ):\n", - " selected_tensors.append(local_tensor.tensor)\n", - " selected_weights.append(local_tensor.weight)\n", - " if not selected_tensors:\n", - " if self.logged_round < fl_round:\n", - " fx.logger.warning('No collaborators match threshold condition. Performing simple averaging...')\n", - " selected_tensors = [local_tensor.tensor for local_tensor in local_tensors]\n", - " selected_weights = [local_tensor.weight for local_tensor in local_tensors]\n", - " if self.logged_round < fl_round:\n", - " self.logged_round += 1\n", - " return np.average(selected_tensors, weights=selected_weights, axis=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Privileged Aggregation Functions\n", - "Most of the time the `AggregationFunction` interface is sufficient to implement custom methods, but in certain scenarios users may want to store additional information inside the TensorDB Dataframe beyond the aggregated tensor. The `openfl.interface.aggregation_functions.experimental.PrivilegedAggregationFunction` interface is provided for this use, and gives the user direct access to aggregator's TensorDB dataframe (notice the `tensor_db` param in the call function replaces the `db_iterator` from the standard AggregationFunction interface). As the name suggests, this interface is called privileged because with great power comes great responsibility, and modifying the TensorDB dataframe directly can lead to unexpected behavior and experiment failures if entries are arbitrarily deleted.\n", - "\n", - "Note that in-place methods (`.loc`) on the tensor_db dataframe are required for write operations. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.interface.aggregation_functions.experimental import PrivilegedAggregationFunction\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "class PrioritizeLeastImproved(PrivilegedAggregationFunction):\n", - " \"\"\"\n", - " Give collaborator with the least improvement in validation accuracy more influence over future weights\n", - " \n", - " \"\"\"\n", - " \n", - " def call(self,\n", - " local_tensors,\n", - " tensor_db,\n", - " tensor_name,\n", - " fl_round,\n", - " tags):\n", - " \"\"\"Aggregate tensors.\n", - "\n", - " Args:\n", - " local_tensors(list[openfl.utilities.LocalTensor]): List of local tensors to aggregate.\n", - " tensor_db: Aggregator's TensorDB [writable]. Columns:\n", - " - 'tensor_name': name of the tensor.\n", - " Examples for `torch.nn.Module`s: 'conv1.weight', 'fc2.bias'.\n", - " - 'round': 0-based number of round corresponding to this tensor.\n", - " - 'tags': tuple of tensor tags. Tags that can appear:\n", - " - 'model' indicates that the tensor is a model parameter.\n", - " - 'trained' indicates that tensor is a part of a training result.\n", - " These tensors are passed to the aggregator node after local learning.\n", - " - 'aggregated' indicates that tensor is a result of aggregation.\n", - " These tensors are sent to collaborators for the next round.\n", - " - 'delta' indicates that value is a difference between rounds\n", - " for a specific tensor.\n", - " also one of the tags is a collaborator name\n", - " if it corresponds to a result of a local task.\n", - "\n", - " - 'nparray': value of the tensor.\n", - " tensor_name: name of the tensor\n", - " fl_round: round number\n", - " tags: tuple of tags for this tensor\n", - " Returns:\n", - " np.ndarray: aggregated tensor\n", - " \"\"\"\n", - " from openfl.utilities import change_tags\n", - "\n", - " tensors, weights, collaborators = zip(*[(x.tensor, x.weight, x.col_name) for idx,x in enumerate(local_tensors)])\n", - " tensors, weights, collaborators = np.array(tensors), np.array(weights), collaborators\n", - "\n", - " if fl_round > 0:\n", - " metric_tags = ('metric','validate_agg')\n", - " collaborator_accuracy = {}\n", - " previous_col_accuracy = {}\n", - " change_in_accuracy = {}\n", - " for col in collaborators:\n", - " col_metric_tag = change_tags(metric_tags,add_field=col)\n", - " collaborator_accuracy[col] = float(tensor_db[(tensor_db['tensor_name'] == 'acc') &\n", - " (tensor_db['round'] == fl_round) &\n", - " (tensor_db['tags'] == col_metric_tag)]['nparray'])\n", - " previous_col_accuracy[col] = float(tensor_db[(tensor_db['tensor_name'] == 'acc') &\n", - " (tensor_db['round'] == fl_round - 1) &\n", - " (tensor_db['tags'] == col_metric_tag)]['nparray'])\n", - " change_in_accuracy[col] = collaborator_accuracy[col] - previous_col_accuracy[col]\n", - " \n", - " \n", - " least_improved_collaborator = min(change_in_accuracy,key=change_in_accuracy.get)\n", - " \n", - " # Dont add least improved collaborator more than once\n", - " if len(tensor_db[(tensor_db['tags'] == ('least_improved',)) &\n", - " (tensor_db['round'] == fl_round)]) == 0:\n", - " tensor_db.loc[tensor_db.shape[0]] = \\\n", - " ['_','_',fl_round,True,('least_improved',),np.array(least_improved_collaborator)]\n", - " least_improved_weight_factor = 0.1 * len(tensor_db[(tensor_db['tags'] == ('least_improved',)) &\n", - " (tensor_db['nparray'] == np.array(least_improved_collaborator))])\n", - " 
weights[collaborators.index(least_improved_collaborator)] += least_improved_weight_factor\n",
-    "            weights = weights / np.sum(weights)\n",
-    "        \n",
-    "        return np.average(tensors, weights=weights, axis=0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To make the process of writing, reading from, and searching through dataframes easier, we add three methods to the tensor_db dataframe: `store`, `retrieve`, and `search`. Power users can still use all of the built-in pandas dataframe methods, but because some prior knowledge is needed to deal effectively with dataframe column types, iterate through them, and store them in a consistent way that won't break other OpenFL functionality, these three methods provide a convenient way to let researchers focus on algorithms instead of internal framework machinery. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class FedAvgM_Selection(PrivilegedAggregationFunction):\n",
-    "    \"\"\"\n",
-    "    Adapted from the FeTS Challenge 2021\n",
-    "    Federated Brain Tumor Segmentation: Multi-Institutional Privacy-Preserving Collaborative Learning\n",
-    "    Ece Isik-Polat, Gorkem Polat, Altan Kocyigit, and Alptekin Temizel\n",
-    "    \n",
-    "    \"\"\"\n",
-    "    \n",
-    "    def call(\n",
-    "        self,\n",
-    "        local_tensors,\n",
-    "        tensor_db,\n",
-    "        tensor_name,\n",
-    "        fl_round,\n",
-    "        tags):\n",
-    "        \n",
-    "        \"\"\"Aggregate tensors.\n",
-    "\n",
-    "        Args:\n",
-    "            local_tensors(list[openfl.utilities.LocalTensor]): List of local tensors to aggregate.\n",
-    "            tensor_db: Aggregator's TensorDB [writable]. Columns:\n",
-    "                - 'tensor_name': name of the tensor.\n",
-    "                    Examples for `torch.nn.Module`s: 'conv1.weight', 'fc2.bias'.\n",
-    "                - 'round': 0-based number of round corresponding to this tensor.\n",
-    "                - 'tags': tuple of tensor tags. 
Tags that can appear:\n", - " - 'model' indicates that the tensor is a model parameter.\n", - " - 'trained' indicates that tensor is a part of a training result.\n", - " These tensors are passed to the aggregator node after local learning.\n", - " - 'aggregated' indicates that tensor is a result of aggregation.\n", - " These tensors are sent to collaborators for the next round.\n", - " - 'delta' indicates that value is a difference between rounds\n", - " for a specific tensor.\n", - " also one of the tags is a collaborator name\n", - " if it corresponds to a result of a local task.\n", - "\n", - " - 'nparray': value of the tensor.\n", - " tensor_name: name of the tensor\n", - " fl_round: round number\n", - " tags: tuple of tags for this tensor\n", - " Returns:\n", - " np.ndarray: aggregated tensor\n", - " \"\"\"\n", - " #momentum\n", - " tensor_db.store(tensor_name='momentum',nparray=0.9,overwrite=False)\n", - " #aggregator_lr\n", - " tensor_db.store(tensor_name='aggregator_lr',nparray=1.0,overwrite=False)\n", - "\n", - " if fl_round == 0:\n", - " # Just apply FedAvg\n", - "\n", - " tensor_values = [t.tensor for t in local_tensors]\n", - " weight_values = [t.weight for t in local_tensors] \n", - " new_tensor_weight = np.average(tensor_values, weights=weight_values, axis=0) \n", - "\n", - " #if not (tensor_name in weight_speeds):\n", - " if tensor_name not in tensor_db.search(tags=('weight_speeds',))['tensor_name']: \n", - " #weight_speeds[tensor_name] = np.zeros_like(local_tensors[0].tensor) # weight_speeds[tensor_name] = np.zeros(local_tensors[0].tensor.shape)\n", - " tensor_db.store(\n", - " tensor_name=tensor_name, \n", - " tags=('weight_speeds',), \n", - " nparray=np.zeros_like(local_tensors[0].tensor),\n", - " )\n", - " return new_tensor_weight \n", - " else:\n", - " if tensor_name.endswith(\"weight\") or tensor_name.endswith(\"bias\"):\n", - " # Calculate aggregator's last value\n", - " previous_tensor_value = None\n", - " for _, record in tensor_db.iterrows():\n", - " if (record['round'] == fl_round \n", - " and record[\"tensor_name\"] == tensor_name\n", - " and record[\"tags\"] == (\"aggregated\",)): \n", - " previous_tensor_value = record['nparray']\n", - " break\n", - "\n", - " if previous_tensor_value is None:\n", - " logger.warning(\"Error in fedAvgM: previous_tensor_value is None\")\n", - " logger.warning(\"Tensor: \" + tensor_name)\n", - "\n", - " # Just apply FedAvg \n", - " tensor_values = [t.tensor for t in local_tensors]\n", - " weight_values = [t.weight for t in local_tensors] \n", - " new_tensor_weight = np.average(tensor_values, weights=weight_values, axis=0) \n", - " \n", - " if tensor_name not in tensor_db.search(tags=('weight_speeds',))['tensor_name']: \n", - " tensor_db.store(\n", - " tensor_name=tensor_name, \n", - " tags=('weight_speeds',), \n", - " nparray=np.zeros_like(local_tensors[0].tensor),\n", - " )\n", - "\n", - " return new_tensor_weight\n", - " else:\n", - " # compute the average delta for that layer\n", - " deltas = [previous_tensor_value - t.tensor for t in local_tensors]\n", - " weight_values = [t.weight for t in local_tensors]\n", - " average_deltas = np.average(deltas, weights=weight_values, axis=0) \n", - "\n", - " # V_(t+1) = momentum*V_t + Average_Delta_t\n", - " tensor_weight_speed = tensor_db.retrieve(\n", - " tensor_name=tensor_name,\n", - " tags=('weight_speeds',)\n", - " )\n", - " \n", - " momentum = float(tensor_db.retrieve(tensor_name='momentum'))\n", - " aggregator_lr = float(tensor_db.retrieve(tensor_name='aggregator_lr'))\n", - " \n", 
- " new_tensor_weight_speed = momentum * tensor_weight_speed + average_deltas # fix delete (1-momentum)\n", - " \n", - " tensor_db.store(\n", - " tensor_name=tensor_name, \n", - " tags=('weight_speeds',), \n", - " nparray=new_tensor_weight_speed\n", - " )\n", - " # W_(t+1) = W_t-lr*V_(t+1)\n", - " new_tensor_weight = previous_tensor_value - aggregator_lr*new_tensor_weight_speed\n", - "\n", - " return new_tensor_weight\n", - " else:\n", - " # Just apply FedAvg \n", - " tensor_values = [t.tensor for t in local_tensors]\n", - " weight_values = [t.weight for t in local_tensors] \n", - " new_tensor_weight = np.average(tensor_values, weights=weight_values, axis=0)\n", - "\n", - " return new_tensor_weight" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Run experiment, return trained FederatedModel\n", - "final_fl_model = fx.run_experiment(collaborators,\n", - " {\n", - " 'aggregator.settings.rounds_to_train':5,\n", - " 'aggregator.settings.db_store_rounds':5,\n", - " 'tasks.train.aggregation_type': ClippedAveraging(ratio=0.9)\n", - " })" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Save final model\n", - "final_fl_model.save_native('final_pytorch_model')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3.7", - "language": "python", - "name": "py3.7" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/openfl-tutorials/experimental/workflow/101_MNIST.ipynb b/openfl-tutorials/experimental/workflow/101_MNIST.ipynb index 5757c31199..7393d16a46 100644 --- a/openfl-tutorials/experimental/workflow/101_MNIST.ipynb +++ b/openfl-tutorials/experimental/workflow/101_MNIST.ipynb @@ -67,10 +67,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Below code will display the print statement output on screen as well\n", - "import sys\n", - "sys.stdout = open('/dev/stdout', 'w')\n", - "\n", "!pip install git+https://github.com/securefederatedai/openfl.git\n", "!pip install -r workflow_interface_requirements.txt\n", "!pip install torch\n", diff --git a/openfl-tutorials/experimental/workflow/105_Numpy_Linear_Regression_Workflow.ipynb b/openfl-tutorials/experimental/workflow/105_Numpy_Linear_Regression_Workflow.ipynb index c076b87e12..ada59a995f 100644 --- a/openfl-tutorials/experimental/workflow/105_Numpy_Linear_Regression_Workflow.ipynb +++ b/openfl-tutorials/experimental/workflow/105_Numpy_Linear_Regression_Workflow.ipynb @@ -42,10 +42,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Below code will display the print statement output on screen as well\n", - "import sys\n", - "sys.stdout = open('/dev/stdout', 'w')\n", - "\n", "!pip install git+https://github.com/securefederatedai/openfl.git\n", "!pip install -r workflow_interface_requirements.txt\n", "!pip install matplotlib\n", @@ -308,7 +304,7 @@ " self.current_round += 1\n", " if self.current_round < self.rounds:\n", " self.next(self.aggregated_model_validation,\n", - " foreach='collaborators', exclude=['private'])\n", + " foreach='collaborators')\n", " else:\n", " self.next(self.end)\n", "\n", diff --git 
a/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST/README.md b/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST/README.md index e4f771b4c5..bce8042d02 100644 --- a/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST/README.md +++ b/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST/README.md @@ -4,7 +4,7 @@
### 0. If you haven't done so already, create a virtual environment, install OpenFL, and upgrade pip: - - For help with this step, visit the "Install the Package" section of the [OpenFL installation instructions](https://openfl.readthedocs.io/en/latest/get_started/installation.html). + - For help with this step, visit the "Install the Package" section of the [OpenFL installation instructions](https://openfl.readthedocs.io/en/latest/installation.html).
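The two deleted native-API notebooks above were the main end-to-end demonstrations of the `AggregationFunction` interface, so a condensed sketch is worth keeping alongside this diff. It uses the same base class, `call` signature, and return contract shown in the deleted tutorial; the element-wise median rule itself is an illustrative assumption, not something this PR ships.

```python
# A minimal sketch of the AggregationFunction interface exercised by the
# deleted custom-aggregation tutorial above. The median rule is illustrative
# only; any np.ndarray of the aggregated tensor's shape may be returned.
import numpy as np

from openfl.interface.aggregation_functions import AggregationFunction


class MedianAggregation(AggregationFunction):
    """Aggregate local tensors by their element-wise median."""

    def call(self, local_tensors, db_iterator, tensor_name, fl_round, tags):
        # Each LocalTensor carries a collaborator's tensor and its weight;
        # the median deliberately ignores the weights, trading the exactness
        # of FedAvg for robustness to a single badly diverged collaborator.
        tensors = np.array([t.tensor for t in local_tensors])
        return np.median(tensors, axis=0)
```

As with `ClippedAveraging(ratio=0.9)` in the deleted notebook, an instance would be passed to `fx.run_experiment` through `override_config={'tasks.train.aggregation_type': MedianAggregation()}`.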
diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/Bangalore_config.yaml b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/Bangalore_config.yaml similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/Bangalore_config.yaml rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/Bangalore_config.yaml diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/private_attributes.py b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/private_attributes.py similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/private_attributes.py rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/private_attributes.py diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/requirements.txt b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/requirements.txt similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/requirements.txt rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/requirements.txt diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/start_envoy.sh b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/start_envoy.sh similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Bangalore/start_envoy.sh rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Bangalore/start_envoy.sh diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/Chandler_config.yaml b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/Chandler_config.yaml similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/Chandler_config.yaml rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/Chandler_config.yaml diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/private_attributes.py b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/private_attributes.py similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/private_attributes.py rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/private_attributes.py diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/requirements.txt b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/requirements.txt similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/requirements.txt rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/requirements.txt diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/start_envoy.sh 
b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/start_envoy.sh similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/Chandler/start_envoy.sh rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/Chandler/start_envoy.sh diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/README.md b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/README.md similarity index 94% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/README.md rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/README.md index be10df4d4c..e9e1cd5989 100644 --- a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/README.md +++ b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/README.md @@ -4,7 +4,7 @@
### 0. If you haven't done so already, create a virtual environment, install OpenFL, and upgrade pip: - - For help with this step, visit the "Install the Package" section of the [OpenFL installation instructions](https://openfl.readthedocs.io/en/latest/get_started/installation.html). + - For help with this step, visit the "Install the Package" section of the [OpenFL installation instructions](https://openfl.readthedocs.io/en/latest/installation.html).
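Since the rename below touches every file of the 301 tutorial, a brief sketch of how its pieces fit together may help while reviewing: the workspace notebook (diffed further down) drives one director and two envoys. Only the `FederatedRuntime(...)` call with `collaborators`, `director`, and `notebook_path` appears in this PR; the import path and the `director_info` fields are assumptions inferred from the tutorial layout and may differ across OpenFL versions.

```python
# Sketch only: the constructor arguments mirror the notebook hunk further
# below; the import path and director_info keys are assumptions and should
# be checked against your OpenFL version and director/director_config.yaml.
from openfl.experimental.workflow.runtime import FederatedRuntime

director_info = {
    "director_node_fqdn": "localhost",  # hypothetical values; take the real
    "director_port": 50050,             # ones from director_config.yaml
}

federated_runtime = FederatedRuntime(
    collaborators=["Bangalore", "Chandler"],  # the two envoys in this tutorial
    director=director_info,
    notebook_path="./MNIST_Watermarking.ipynb",
)
```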
@@ -22,7 +22,7 @@ - Navigate to the tutorial: ```sh - cd openfl/openfl-tutorials/experimental/workflow/FederatedRuntime/101_MNIST/ + cd openfl/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/ ```
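The functional change hiding among the renames below is in the notebook diff that follows: instead of ending after a single pass, the flow now loops for `n_rounds` rounds through a new `internal_loop` aggregator step. Distilled into a self-contained skeleton (import paths are assumed from the `openfl-tutorials/experimental/workflow` layout, and the step bodies are stubs standing in for the real training, validation, and watermarking logic):

```python
# Skeleton of the round-advancing pattern this PR adds to the watermarking
# flow. Import paths are assumptions; step bodies are intentionally empty.
from openfl.experimental.workflow.interface import FLSpec
from openfl.experimental.workflow.placement import aggregator, collaborator


class RoundLoopSketch(FLSpec):
    def __init__(self, n_rounds=3, **kwargs):
        super().__init__(**kwargs)
        self.round_number = 0
        self.n_rounds = n_rounds

    @aggregator
    def start(self):
        # fan out the first round to every collaborator
        self.next(self.aggregated_model_validation, foreach="collaborators")

    @collaborator
    def aggregated_model_validation(self):
        # per-collaborator validation/training would happen here
        self.next(self.join)

    @aggregator
    def join(self, inputs):
        # model averaging would happen here before the loop decision
        self.next(self.internal_loop)

    @aggregator
    def internal_loop(self):
        # either fan out to all collaborators for another round, or finish
        if self.round_number == self.n_rounds - 1:
            self.next(self.end)
        else:
            self.round_number += 1
            self.next(self.aggregated_model_validation, foreach="collaborators")

    @aggregator
    def end(self):
        pass
```

In the actual notebook, watermark retraining runs immediately before control passes to `internal_loop`, as the hunk below shows.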
@@ -57,7 +57,7 @@ cd Chandler ```sh cd workspace -jupyter lab mnist_watermarking.ipynb +jupyter lab MNIST_Watermarking.ipynb ``` - A Jupyter Server URL will appear in your terminal. In your browser, proceed to that link. Once the webpage loads, click on the pytorch_tinyimagenet.ipynb file. - To run the experiment, select the icon that looks like two triangles to "Restart Kernel and Run All Cells". diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/director_config.yaml b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/director_config.yaml similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/director_config.yaml rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/director_config.yaml diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/private_attributes.py b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/private_attributes.py similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/private_attributes.py rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/private_attributes.py diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/start_director.sh b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/start_director.sh similarity index 100% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/director/start_director.sh rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/director/start_director.sh diff --git a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/workspace/MNIST_Watermarking.ipynb b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/workspace/MNIST_Watermarking.ipynb similarity index 95% rename from openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/workspace/MNIST_Watermarking.ipynb rename to openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/workspace/MNIST_Watermarking.ipynb index 040fb2cb26..0ee4c67681 100644 --- a/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermaking/workspace/MNIST_Watermarking.ipynb +++ b/openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking/workspace/MNIST_Watermarking.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d79eacbd", "metadata": {}, "outputs": [], @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "f7475cba", "metadata": {}, "outputs": [], @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "9bd8ac2d", "metadata": {}, "outputs": [], @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "89cf4866", "metadata": {}, "outputs": [], @@ -245,7 +245,7 @@ " watermark_pretrain_optimizer=None,\n", " watermark_retrain_optimizer=None,\n", " round_number=0,\n", - " n_rounds=1,\n", + " n_rounds=3,\n", " **kwargs,\n", " ):\n", " super().__init__(**kwargs)\n", @@ -425,7 +425,20 @@ " + f\" Acc: {self.watermark_retrain_validation_score:<.6f}\")\n", " 
retrain_round += 1\n", "\n", - " self.next(self.end)\n", + " self.next(self.internal_loop)\n", + " \n", + " @aggregator\n", + " def internal_loop(self):\n", + " \"\"\"\n", + " Internal loop to continue the Federated Learning process.\n", + " \"\"\"\n", + " if self.round_number == self.n_rounds - 1:\n", + " print(f\"\\nCompleted training for all {self.n_rounds} round(s)\")\n", + " self.next(self.end)\n", + " else:\n", + " self.round_number += 1\n", + " print(f\"\\nCompleted round: {self.round_number}\")\n", + " self.next(self.aggregated_model_validation, foreach='collaborators')\n", "\n", " @aggregator\n", " def end(self):\n", @@ -449,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "1715a373", "metadata": {}, "outputs": [], @@ -468,7 +481,7 @@ "federated_runtime = FederatedRuntime(\n", " collaborators=authorized_collaborators,\n", " director=director_info, \n", - " notebook_path='./MNIST_Watermarking.ipynb'\n", + " notebook_path='./MNIST_Watermarking.ipynb',\n", ")" ] }, @@ -552,7 +565,7 @@ ], "metadata": { "kernelspec": { - "display_name": "fed_run", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -566,7 +579,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/openfl-workspace/keras_2dunet/requirements.txt b/openfl-workspace/keras_2dunet/requirements.txt index d790f6ff87..17f7b7c86b 100644 --- a/openfl-workspace/keras_2dunet/requirements.txt +++ b/openfl-workspace/keras_2dunet/requirements.txt @@ -1,4 +1,4 @@ -keras==3.6.0 +keras==3.8.0 nibabel setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability tensorflow==2.18.0 diff --git a/openfl-workspace/keras_cnn_mnist/requirements.txt b/openfl-workspace/keras_cnn_mnist/requirements.txt index 5fa9907811..858a7dc3c8 100644 --- a/openfl-workspace/keras_cnn_mnist/requirements.txt +++ b/openfl-workspace/keras_cnn_mnist/requirements.txt @@ -1,3 +1,3 @@ -keras==3.6.0 +keras==3.8.0 tensorflow==2.18.0 diff --git a/openfl-workspace/keras_nlp/requirements.txt b/openfl-workspace/keras_nlp/requirements.txt index 6bba4a277d..32de452725 100644 --- a/openfl-workspace/keras_nlp/requirements.txt +++ b/openfl-workspace/keras_nlp/requirements.txt @@ -1,2 +1,2 @@ -keras==3.6.0 +keras==3.8.0 tensorflow==2.18.0 diff --git a/openfl-workspace/torch_cnn_mnist/plan/plan.yaml b/openfl-workspace/torch_cnn_mnist/plan/plan.yaml index 6d38003735..cae2fd0028 100644 --- a/openfl-workspace/torch_cnn_mnist/plan/plan.yaml +++ b/openfl-workspace/torch_cnn_mnist/plan/plan.yaml @@ -13,7 +13,7 @@ aggregator: assigner: settings: task_groups: - - name: train_and_validate + - name: learning percentage: 1.0 tasks: - aggregated_model_validation @@ -34,16 +34,8 @@ data_loader: batch_size: 64 collaborator_count: 2 template: src.dataloader.PyTorchMNISTInMemory -network: - settings: - agg_addr: localhost - agg_port: 59583 - cert_folder: cert - client_reconnect_interval: 5 - require_client_auth: true - hash_salt: auto - use_tls: true - template: openfl.federation.Network +network : + defaults : plan/defaults/network.yaml task_runner: settings: {} template: src.taskrunner.TemplateTaskRunner diff --git a/openfl-workspace/torch_unet_kvasir/plan/plan.yaml b/openfl-workspace/torch_unet_kvasir/plan/plan.yaml index b37a8a7d91..6563a31075 100644 --- a/openfl-workspace/torch_unet_kvasir/plan/plan.yaml +++ b/openfl-workspace/torch_unet_kvasir/plan/plan.yaml @@ -5,23 +5,21 @@ 
aggregator : defaults : plan/defaults/aggregator.yaml template : openfl.component.Aggregator settings : - init_state_path : save/torch_unet_kvasir_init.pbuf - best_state_path : save/torch_unet_kvasir_best.pbuf - last_state_path : save/torch_unet_kvasir_last.pbuf + init_state_path : save/init.pbuf + best_state_path : save/best.pbuf + last_state_path : save/last.pbuf rounds_to_train : 40 collaborator : defaults : plan/defaults/collaborator.yaml template : openfl.component.Collaborator settings : - epochs_per_round : 1.0 - polling_interval : 4 delta_updates : false opt_treatment : RESET data_loader : defaults : plan/defaults/data_loader.yaml - template : src.data_loader.PyTorchKvasirDataLoader + template : src.dataloader.PyTorchKvasirDataLoader settings : collaborator_count : 2 data_group_name : kvasir @@ -29,7 +27,7 @@ data_loader : task_runner : defaults : plan/defaults/task_runner.yaml - template : src.fed_unet_runner.PyTorchFederatedUnet + template : src.taskrunner.PyTorchFederatedUnet settings : n_channels : 3 n_classes : 1 diff --git a/openfl-workspace/torch_unet_kvasir/src/data_loader.py b/openfl-workspace/torch_unet_kvasir/src/dataloader.py similarity index 100% rename from openfl-workspace/torch_unet_kvasir/src/data_loader.py rename to openfl-workspace/torch_unet_kvasir/src/dataloader.py diff --git a/openfl-workspace/torch_unet_kvasir/src/fed_unet_runner.py b/openfl-workspace/torch_unet_kvasir/src/taskrunner.py similarity index 100% rename from openfl-workspace/torch_unet_kvasir/src/fed_unet_runner.py rename to openfl-workspace/torch_unet_kvasir/src/taskrunner.py diff --git a/openfl-workspace/workspace/plan/defaults/aggregator.yaml b/openfl-workspace/workspace/plan/defaults/aggregator.yaml index 43d923b996..9fc0481f29 100644 --- a/openfl-workspace/workspace/plan/defaults/aggregator.yaml +++ b/openfl-workspace/workspace/plan/defaults/aggregator.yaml @@ -1,3 +1,5 @@ template : openfl.component.Aggregator settings : db_store_rounds : 2 + persist_checkpoint: True + persistent_db_path: local_state/tensor.db diff --git a/openfl-workspace/workspace/plan/defaults/assigner.yaml b/openfl-workspace/workspace/plan/defaults/assigner.yaml index 0b7e744475..6a5903794f 100644 --- a/openfl-workspace/workspace/plan/defaults/assigner.yaml +++ b/openfl-workspace/workspace/plan/defaults/assigner.yaml @@ -1,7 +1,7 @@ template : openfl.component.RandomGroupedAssigner settings : task_groups : - - name : train_and_validate + - name : learning percentage : 1.0 tasks : - aggregated_model_validation diff --git a/openfl-workspace/workspace/plan/defaults/federated-evaluation/assigner.yaml b/openfl-workspace/workspace/plan/defaults/federated-evaluation/assigner.yaml index 9d583fa0c4..c660659e83 100644 --- a/openfl-workspace/workspace/plan/defaults/federated-evaluation/assigner.yaml +++ b/openfl-workspace/workspace/plan/defaults/federated-evaluation/assigner.yaml @@ -1,7 +1,7 @@ template : openfl.component.RandomGroupedAssigner settings : task_groups : - - name : validate + - name : evaluation percentage : 1.0 tasks : - aggregated_model_validation \ No newline at end of file diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index eaac9fa6a0..c6829e75b9 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -1,7 +1,6 @@ # Copyright 2020-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - """Aggregator module.""" import logging @@ -12,10 +11,11 @@ import openfl.callbacks as callbacks_module from 
openfl.component.straggler_handling_functions import CutoffTimeBasedStragglerHandling
-from openfl.databases import TensorDB
+from openfl.databases import PersistentTensorDB, TensorDB
 from openfl.interface.aggregation_functions import WeightedAverage
 from openfl.pipelines import NoCompressionPipeline, TensorCodec
 from openfl.protocols import base_pb2, utils
+from openfl.protocols.base_pb2 import NamedTensor
 from openfl.utilities import TaskResultKey, TensorKey, change_tags

 logger = logging.getLogger(__name__)
@@ -82,6 +82,9 @@ def __init__(
         log_memory_usage=False,
         write_logs=False,
         callbacks: Optional[List] = None,
+        persist_checkpoint=True,
+        persistent_db_path=None,
+        task_group: str = "learning",
     ):
         """Initializes the Aggregator.

@@ -108,8 +111,11 @@ def __init__(
                 Defaults to 1.
             initial_tensor_dict (dict, optional): Initial tensor dictionary.
             callbacks: List of callbacks to be used during the experiment.
+            persist_checkpoint (bool, optional): Whether to persist the
+                aggregator state so it can be recovered after a restart.
+                Defaults to True.
+            persistent_db_path (str, optional): Path of the persistent
+                checkpoint database. Defaults to "tensor.db".
+            task_group (str, optional): Selected task_group for assignment.
         """
+        self.task_group = task_group
         self.round_number = 0
+        self.next_model_round_number = 0

         if single_col_cert_common_name:
             logger.warning(
@@ -137,6 +143,16 @@ def __init__(
         self.quit_job_sent_to = []

         self.tensor_db = TensorDB()
+        if persist_checkpoint:
+            persistent_db_path = persistent_db_path or "tensor.db"
+            logger.info(
+                "Persistent checkpoint is enabled, setting persistent db at path %s",
+                persistent_db_path,
+            )
+            self.persistent_db = PersistentTensorDB(persistent_db_path)
+        else:
+            logger.info("Persistent checkpoint is disabled")
+            self.persistent_db = None
         # FIXME: I think next line generates an error on the second round
         # if it is set to 1 for the aggregator.
         self.db_store_rounds = db_store_rounds
@@ -154,8 +170,25 @@ def __init__(
         # TODO: Remove. Used in deprecated interactive and native APIs
         self.best_tensor_dict: dict = {}
         self.last_tensor_dict: dict = {}
+        # these enable getting all tensors for a task
+        self.collaborator_tasks_results = {}  # {TaskResultKey: list of TensorKeys}
+        self.collaborator_task_weight = {}  # {TaskResultKey: data_size}
+
+        # maintain a list of collaborators that have completed task and
+        # reported results in a given round
+        self.collaborators_done = []
+        # Initialize a lock for thread safety
+        self.lock = Lock()
+        self.use_delta_updates = use_delta_updates

-        if initial_tensor_dict:
+        self.model = None  # Initialize the model attribute to None
+        if self.persistent_db and self._recover():
+            logger.info("Recovered the aggregator state")
+
+        # The model is built by recovery if at least one round has finished
+        if self.model:
+            logger.info("Model was loaded by recovery")
+        elif initial_tensor_dict:
             self._load_initial_tensors_from_dict(initial_tensor_dict)
             self.model = utils.construct_model_proto(
                 tensor_dict=initial_tensor_dict,
@@ -168,20 +201,6 @@ def __init__(

         self.collaborator_tensor_results = {}  # {TensorKey: nparray}}

-        # these enable getting all tensors for a task
-        self.collaborator_tasks_results = {}  # {TaskResultKey: list of TensorKeys}
-
-        self.collaborator_task_weight = {}  # {TaskResultKey: data_size}
-
-        # maintain a list of collaborators that have completed task and
-        # reported results in a given round
-        self.collaborators_done = []
-
-        # Initialize a lock for thread safety
-        self.lock = Lock()
-
-        self.use_delta_updates = use_delta_updates
-
         # Callbacks
         self.callbacks = callbacks_module.CallbackList(
             callbacks,
@@ -195,6 +214,79 @@ def __init__(
         self.callbacks.on_experiment_begin()
         self.callbacks.on_round_begin(self.round_number)

+    def _recover(self):
+        """Restore the aggregator state to what it was prior to the restart."""
+        recovered = False
+        # Load tensors from the persistent DB
+        tensor_key_dict = self.persistent_db.load_tensors(
+            self.persistent_db.get_tensors_table_name()
+        )
+        if len(tensor_key_dict) > 0:
+            logger.info(f"Recovering {len(tensor_key_dict)} model tensors")
+            recovered = True
+            self.tensor_db.cache_tensor(tensor_key_dict)
+            committed_round_number, self.best_model_score = (
+                self.persistent_db.get_round_and_best_score()
+            )
+            logger.info("Recovery - Setting model proto")
+            to_proto_tensor_dict = {}
+            for tk in tensor_key_dict:
+                tk_name, _, _, _, _ = tk
+                to_proto_tensor_dict[tk_name] = tensor_key_dict[tk]
+            self.model = utils.construct_model_proto(
+                to_proto_tensor_dict, committed_round_number, self.compression_pipeline
+            )
+            # round number is the current round which is still in process,
+            # i.e. committed_round_number + 1
+            self.round_number = committed_round_number + 1
+            logger.info(
+                "Recovery - loaded round number %s and best score %s",
+                self.round_number,
+                self.best_model_score,
+            )
+
+        next_round_tensor_key_dict = self.persistent_db.load_tensors(
+            self.persistent_db.get_next_round_tensors_table_name()
+        )
+        if len(next_round_tensor_key_dict) > 0:
+            logger.info(f"Recovering {len(next_round_tensor_key_dict)} next round model tensors")
+            recovered = True
+            self.tensor_db.cache_tensor(next_round_tensor_key_dict)
+
+        logger.debug("Recovery - this is the tensor_db after recovery: %s", self.tensor_db)
+
+        if self.persistent_db.is_task_table_empty():
+            logger.debug("task table is empty")
+            return recovered
+
+        logger.info("Recovery - Replaying saved task results")
+        task_id = 1
+        while True:
+            task_result = self.persistent_db.get_task_result_by_id(task_id)
+            if not task_result:
+                break
+            recovered = True
+            collaborator_name = task_result["collaborator_name"]
+            round_number = task_result["round_number"]
+            task_name = task_result["task_name"]
+            data_size = task_result["data_size"]
+            serialized_tensors = task_result["named_tensors"]
+            named_tensors = [
+                NamedTensor.FromString(serialized_tensor)
+                for serialized_tensor in serialized_tensors
+            ]
+            logger.info(
+                "Recovery - Replaying task results %s %s %s",
+                collaborator_name,
+                round_number,
+                task_name,
+            )
+            self.process_task_results(
+                collaborator_name, round_number, task_name, data_size, named_tensors
+            )
+            task_id += 1
+        return recovered
+
     def _load_initial_tensors(self):
         """Load all of the tensors required to begin federated learning.
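Note on the recovery path above: it is the read side of the checkpoint scheme, with `PersistentTensorDB.finalize_round()` (added later in this patch and called from `_save_model`) as the write side. A minimal sketch of the round-trip, using only methods this patch introduces; the tensor names, values, and scores are illustrative, not taken from the patch:

```python
import numpy as np

from openfl.databases import PersistentTensorDB
from openfl.utilities import TensorKey

db = PersistentTensorDB("tensor.db")  # the aggregator's default path when none is given

# Tensors committed for the current round, plus the model for the next round.
current = {TensorKey("conv1.weight", "agg", 0, False, ("model",)): np.zeros(3)}
next_round = {TensorKey("conv1.weight", "agg", 1, False, ("model",)): np.ones(3)}

# One transaction: persist both tensor sets, reset the task-results table,
# and record the committed round number and best score.
db.finalize_round(current, next_round, round_number=0, best_score=0.9)

# After a restart, _recover() reloads state through these same accessors.
committed_round, best_score = db.get_round_and_best_score()
model_tensors = db.load_tensors(db.get_tensors_table_name())
assert committed_round == 0 and best_score == 0.9
db.close()
```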
@@ -208,9 +300,13 @@ def _load_initial_tensors(self): self.model, compression_pipeline=self.compression_pipeline ) - if round_number > self.round_number: + # Check selected task_group before updating round number + if self.task_group == "evaluation": + logger.info(f"Skipping round_number check for {self.task_group} task_group") + elif round_number > self.round_number: logger.info(f"Starting training from round {round_number} of previously saved model") self.round_number = round_number + tensor_key_dict = { TensorKey(k, self.uuid, self.round_number, False, ("model",)): v for k, v in tensor_dict.items() @@ -255,9 +351,12 @@ def _save_model(self, round_number, file_path): for k, v in og_tensor_dict.items() ] tensor_dict = {} + tensor_tuple_dict = {} for tk in tensor_keys: tk_name, _, _, _, _ = tk - tensor_dict[tk_name] = self.tensor_db.get_tensor_from_cache(tk) + tensor_value = self.tensor_db.get_tensor_from_cache(tk) + tensor_dict[tk_name] = tensor_value + tensor_tuple_dict[tk] = tensor_value if tensor_dict[tk_name] is None: logger.info( "Cannot save model for round %s. Continuing...", @@ -267,6 +366,19 @@ def _save_model(self, round_number, file_path): if file_path == self.best_state_path: self.best_tensor_dict = tensor_dict if file_path == self.last_state_path: + # Transaction to persist/delete all data needed to increment the round + if self.persistent_db: + if self.next_model_round_number > 0: + next_round_tensors = self.tensor_db.get_tensors_by_round_and_tags( + self.next_model_round_number, ("model",) + ) + self.persistent_db.finalize_round( + tensor_tuple_dict, next_round_tensors, self.round_number, self.best_model_score + ) + logger.info( + "Persist model and clean task result for round %s", + round_number, + ) self.last_tensor_dict = tensor_dict self.model = utils.construct_model_proto( tensor_dict, round_number, self.compression_pipeline @@ -366,7 +478,7 @@ def get_tasks(self, collaborator_name): # if no tasks, tell the collaborator to sleep if len(tasks) == 0: tasks = None - sleep_time = self._get_sleep_time() + sleep_time = Aggregator._get_sleep_time() return tasks, self.round_number, sleep_time, time_to_quit @@ -396,7 +508,7 @@ def get_tasks(self, collaborator_name): # been completed if len(tasks) == 0: tasks = None - sleep_time = self._get_sleep_time() + sleep_time = Aggregator._get_sleep_time() return tasks, self.round_number, sleep_time, time_to_quit @@ -606,6 +718,31 @@ def send_local_task_results( Returns: None """ + # Save task and its metadata for recovery + serialized_tensors = [tensor.SerializeToString() for tensor in named_tensors] + if self.persistent_db: + self.persistent_db.save_task_results( + collaborator_name, round_number, task_name, data_size, serialized_tensors + ) + logger.debug( + f"Persisting task results {task_name} from {collaborator_name} round {round_number}" + ) + logger.info( + f"Collaborator {collaborator_name} is sending task results " + f"for {task_name}, round {round_number}" + ) + self.process_task_results( + collaborator_name, round_number, task_name, data_size, named_tensors + ) + + def process_task_results( + self, + collaborator_name, + round_number, + task_name, + data_size, + named_tensors, + ): if self._time_to_quit() or collaborator_name in self.stragglers: logger.warning( f"STRAGGLER: Collaborator {collaborator_name} is reporting results " @@ -620,11 +757,6 @@ def send_local_task_results( ) return - logger.info( - f"Collaborator {collaborator_name} is sending task results " - f"for {task_name}, round {round_number}" - ) - task_key = 
TaskResultKey(task_name, collaborator_name, round_number) # we mustn't have results already @@ -729,9 +861,9 @@ def _process_named_tensor(self, named_tensor, collaborator_name): tuple(named_tensor.tags), ) tensor_name, origin, round_number, report, tags = tensor_key - assert ( - "compressed" in tags or "lossy_compressed" in tags - ), f"Named tensor {tensor_key} is not compressed" + assert "compressed" in tags or "lossy_compressed" in tags, ( + f"Named tensor {tensor_key} is not compressed" + ) if "compressed" in tags: dec_tk, decompressed_nparray = self.tensor_codec.decompress( tensor_key, @@ -864,7 +996,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result new_model_report, ("model",), ) - + self.next_model_round_number = new_model_round_number # Finally, cache the updated model tensor self.tensor_db.cache_tensor({final_model_tk: new_model_nparray}) @@ -913,9 +1045,9 @@ def _compute_validation_related_task_metrics(self, task_name) -> dict: metrics = {} for tensor_key in self.collaborator_tasks_results[task_key]: tensor_name, origin, round_number, report, tags = tensor_key - assert ( - collaborators_for_task[0] in tags - ), f"Tensor {tensor_key} in task {task_name} has not been processed correctly" + assert collaborators_for_task[0] in tags, ( + f"Tensor {tensor_key} in task {task_name} has not been processed correctly" + ) # Strip the collaborator label, and lookup aggregated tensor new_tags = change_tags(tags, remove_field=collaborators_for_task[0]) agg_tensor_key = TensorKey(tensor_name, origin, round_number, report, new_tags) @@ -944,11 +1076,10 @@ def _compute_validation_related_task_metrics(self, task_name) -> dict: # FIXME: Configurable logic for min/max criteria in saving best. if "validate_agg" in tags: - # Compare the accuracy of the model, potentially save it + # Compare the accuracy of the model, potentially save it. 
if self.best_model_score is None or self.best_model_score < agg_results: logger.info( - f"Round {round_number}: saved the best " - f"model with score {agg_results:f}" + f"Round {round_number}: saved the best model with score {agg_results:f}" ) self.best_model_score = agg_results self._save_model(round_number, self.best_state_path) diff --git a/openfl/component/assigner/random_grouped_assigner.py b/openfl/component/assigner/random_grouped_assigner.py index 7a1e20123c..dea00022a4 100644 --- a/openfl/component/assigner/random_grouped_assigner.py +++ b/openfl/component/assigner/random_grouped_assigner.py @@ -56,9 +56,9 @@ def define_task_assignments(self): Returns: None """ - assert ( - np.abs(1.0 - np.sum([group["percentage"] for group in self.task_groups])) < 0.01 - ), "Task group percentages must sum to 100%" + assert np.abs(1.0 - np.sum([group["percentage"] for group in self.task_groups])) < 0.01, ( + "Task group percentages must sum to 100%" + ) # Start by finding all of the tasks in all specified groups self.all_tasks_in_groups = list( diff --git a/openfl/component/assigner/static_grouped_assigner.py b/openfl/component/assigner/static_grouped_assigner.py index 5ccff16c67..fcb5a59034 100644 --- a/openfl/component/assigner/static_grouped_assigner.py +++ b/openfl/component/assigner/static_grouped_assigner.py @@ -62,8 +62,7 @@ def define_task_assignments(self): unique_authorized_cols = set(self.authorized_cols) assert cols_amount == authorized_cols_amount and unique_cols == unique_authorized_cols, ( - f"Collaborators in each group must be distinct: " - f"{unique_cols}, {unique_authorized_cols}" + f"Collaborators in each group must be distinct: {unique_cols}, {unique_authorized_cols}" ) # Start by finding all of the tasks in all specified groups diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index d4fd380998..4a5a78329a 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -141,7 +141,7 @@ def __init__( if hasattr(DevicePolicy, device_assignment_policy): self.device_assignment_policy = DevicePolicy[device_assignment_policy] else: - logger.error("Unknown device_assignment_policy: " f"{device_assignment_policy.name}.") + logger.error(f"Unknown device_assignment_policy: {device_assignment_policy.name}.") raise NotImplementedError( f"Unknown device_assignment_policy: {device_assignment_policy}." ) @@ -216,8 +216,7 @@ def run_simulation(self): for task in tasks: self.do_task(task, round_number) logger.info( - f"All tasks completed on {self.collaborator_name} " - f"for round {round_number}..." + f"All tasks completed on {self.collaborator_name} for round {round_number}..." ) break @@ -376,21 +375,22 @@ def get_data_for_tensorkey(self, tensor_key): ) if nparray is not None: logger.debug( - f"Found tensor {tensor_name} in local TensorDB " - f"for round {prior_round}" + f"Found tensor {tensor_name} in local TensorDB for round {prior_round}" ) return nparray prior_round -= 1 logger.info(f"Cannot find any prior version of tensor {tensor_name} locally...") - logger.debug( - "Unable to get tensor from local store..." "attempting to retrieve from client" - ) # Determine whether there are additional compression related # dependencies. # Typically, dependencies are only relevant to model layers tensor_dependencies = self.tensor_codec.find_dependencies( tensor_key, self.delta_updates ) + logger.debug( + "Unable to get tensor from local store..." 
+                " attempting to retrieve from aggregator; found"
+                f" {len(tensor_dependencies)} dependencies for tensor_key {tensor_key}"
+            )
         if len(tensor_dependencies) > 0:
             # Resolve dependencies
             # tensor_dependencies[0] corresponds to the prior version
@@ -411,10 +411,9 @@ def get_data_for_tensorkey(self, tensor_key):
                     self.tensor_db.cache_tensor({new_model_tk: nparray})
                 else:
                     logger.info(
-                        "Count not find previous model layer."
-                        "Fetching latest layer from aggregator"
+                        "Could not find previous model layer. Fetching latest layer from aggregator"
                     )
-                    # The original model tensor should be fetched from client
+                    # The original model tensor should be fetched from aggregator
                     nparray = self.get_aggregated_tensor_from_aggregator(
                         tensor_key, require_lossless=True
                     )
@@ -423,6 +422,18 @@ def get_data_for_tensorkey(self, tensor_key):
                 nparray = self.get_aggregated_tensor_from_aggregator(
                     tensor_key, require_lossless=True
                 )
+            else:
+                # We should try fetching the tensor from the aggregator
+                tensor_name, origin, round_number, report, tags = tensor_key
+                tags = (self.collaborator_name,) + tags
+                tensor_key = (tensor_name, origin, round_number, report, tags)
+                logger.info(
+                    "Could not find previous model layer. "
+                    f"Fetching latest layer from aggregator {tensor_key}"
+                )
+                nparray = self.get_aggregated_tensor_from_aggregator(
+                    tensor_key, require_lossless=True
+                )
         else:
             logger.debug("Found tensor %s in local TensorDB", tensor_key)

diff --git a/openfl/component/director/experiment.py b/openfl/component/director/experiment.py
index e2b56cf877..2e40f43247 100644
--- a/openfl/component/director/experiment.py
+++ b/openfl/component/director/experiment.py
@@ -106,7 +106,7 @@ async def start(
         """
         self.status = Status.IN_PROGRESS
         try:
-            logger.info(f"New experiment {self.name} for " f"collaborators {self.collaborators}")
+            logger.info(f"New experiment {self.name} for collaborators {self.collaborators}")

             with ExperimentWorkspace(
                 experiment_name=self.name,
diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py
index c37ba29a7d..f98b7e21ba 100644
--- a/openfl/component/envoy/envoy.py
+++ b/openfl/component/envoy/envoy.py
@@ -237,7 +237,7 @@ def _get_cuda_device_info(self):
             )
         except Exception as exc:
             logger.exception(
-                f"Failed to get cuda device info: {exc}. " f"Check your cuda device monitor plugin."
+                f"Failed to get cuda device info: {exc}. Check your cuda device monitor plugin."
             )
         return cuda_devices_info
diff --git a/openfl/databases/__init__.py b/openfl/databases/__init__.py
index 849fcde7c9..0e64082d5f 100644
--- a/openfl/databases/__init__.py
+++ b/openfl/databases/__init__.py
@@ -2,4 +2,5 @@
 # SPDX-License-Identifier: Apache-2.0


+from openfl.databases.persistent_db import PersistentTensorDB
 from openfl.databases.tensor_db import TensorDB
diff --git a/openfl/databases/persistent_db.py b/openfl/databases/persistent_db.py
new file mode 100644
index 0000000000..7fe0c6463f
--- /dev/null
+++ b/openfl/databases/persistent_db.py
@@ -0,0 +1,365 @@
+import json
+import logging
+import pickle
+import sqlite3
+from threading import Lock
+from typing import Dict, Optional
+
+import numpy as np
+
+from openfl.utilities import TensorKey
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["PersistentTensorDB"]
+
+
+class PersistentTensorDB:
+    """
+    The PersistentTensorDB class implements a database
+    for storing tensors and metadata using SQLite.
+
+    Attributes:
+        conn: The SQLite connection object.
+        lock: A threading Lock object used to ensure thread-safe operations.
+    """
+
+    TENSORS_TABLE = "tensors"
+    NEXT_ROUND_TENSORS_TABLE = "next_round_tensors"
+    TASK_RESULT_TABLE = "task_results"
+    KEY_VALUE_TABLE = "key_value_store"
+
+    def __init__(self, db_path) -> None:
+        """Initializes a new instance of the PersistentTensorDB class."""
+
+        logger.info("Initializing persistent db at %s", db_path)
+        self.conn = sqlite3.connect(db_path, check_same_thread=False)
+        self.lock = Lock()
+
+        cursor = self.conn.cursor()
+        self._create_model_tensors_table(cursor, PersistentTensorDB.TENSORS_TABLE)
+        self._create_model_tensors_table(cursor, PersistentTensorDB.NEXT_ROUND_TENSORS_TABLE)
+        self._create_task_results_table(cursor)
+        self._create_key_value_store(cursor)
+        self.conn.commit()
+
+    def _create_model_tensors_table(self, cursor, table_name) -> None:
+        """Create the database table for storing tensors if it does not exist."""
+        query = f"""
+            CREATE TABLE IF NOT EXISTS {table_name} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                tensor_name TEXT NOT NULL,
+                origin TEXT NOT NULL,
+                round INTEGER NOT NULL,
+                report INTEGER NOT NULL,
+                tags TEXT,
+                nparray BLOB NOT NULL
+            )
+        """
+        cursor.execute(query)
+
+    def _create_task_results_table(self, cursor) -> None:
+        """Creates a table for storing task results."""
+        query = f"""
+            CREATE TABLE IF NOT EXISTS {PersistentTensorDB.TASK_RESULT_TABLE} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                collaborator_name TEXT NOT NULL,
+                round_number INTEGER NOT NULL,
+                task_name TEXT NOT NULL,
+                data_size INTEGER NOT NULL,
+                named_tensors BLOB NOT NULL
+            )
+        """
+        cursor.execute(query)
+
+    def _create_key_value_store(self, cursor) -> None:
+        """Create a key-value store table for storing additional metadata."""
+        query = f"""
+            CREATE TABLE IF NOT EXISTS {PersistentTensorDB.KEY_VALUE_TABLE} (
+                key TEXT PRIMARY KEY,
+                value REAL NOT NULL
+            )
+        """
+        cursor.execute(query)
+
+    def save_task_results(
+        self,
+        collaborator_name: str,
+        round_number: int,
+        task_name: str,
+        data_size: int,
+        named_tensors,
+    ):
+        """
+        Saves task results to the task_results table.
+
+        Args:
+            collaborator_name (str): Collaborator name.
+            round_number (int): Round number.
+            task_name (str): Task name.
+            data_size (int): Data size.
+            named_tensors (List): List of serialized (binary) tensor representations.
+        """
+        serialized_blob = pickle.dumps(named_tensors)
+
+        # Insert into the database
+        insert_query = f"""
+            INSERT INTO {PersistentTensorDB.TASK_RESULT_TABLE}
+            (collaborator_name, round_number, task_name, data_size, named_tensors)
+            VALUES (?, ?, ?, ?, ?);
+        """
+        with self.lock:
+            cursor = self.conn.cursor()
+            cursor.execute(
+                insert_query,
+                (collaborator_name, round_number, task_name, data_size, serialized_blob),
+            )
+            self.conn.commit()
+
+    def get_task_result_by_id(self, task_result_id: int):
+        """
+        Retrieve a task result by its ID.
+
+        Args:
+            task_result_id (int): The ID of the task result to retrieve.
+
+        Returns:
+            A dictionary containing the task result details, or None if not found.
+        """
+        with self.lock:
+            cursor = self.conn.cursor()
+            query = f"""
+                SELECT collaborator_name, round_number, task_name, data_size, named_tensors
+                FROM {PersistentTensorDB.TASK_RESULT_TABLE}
+                WHERE id = ?
+ """ + cursor.execute(query, (task_result_id,)) + result = cursor.fetchone() + if result: + collaborator_name, round_number, task_name, data_size, serialized_blob = result + serialized_tensors = pickle.loads(serialized_blob) + return { + "collaborator_name": collaborator_name, + "round_number": round_number, + "task_name": task_name, + "data_size": data_size, + "named_tensors": serialized_tensors, + } + return None + + def _serialize_array(self, array: np.ndarray) -> bytes: + """Serialize a NumPy array into bytes for storing in SQLite. + note: using pickle since in some cases the array is actually a scalar. + """ + return pickle.dumps(array) + + def _deserialize_array(self, blob: bytes, dtype: Optional[np.dtype] = None) -> np.ndarray: + """Deserialize bytes from SQLite into a NumPy array.""" + try: + return pickle.loads(blob) + except Exception as e: + raise ValueError(f"Failed to deserialize array: {e}") + + def __repr__(self) -> str: + """Returns a string representation of the PersistentTensorDB.""" + with self.lock: + cursor = self.conn.cursor() + cursor.execute("SELECT tensor_name, origin, round, report, tags FROM tensors") + rows = cursor.fetchall() + return f"PersistentTensorDB contents:\n{rows}" + + def finalize_round( + self, + tensor_key_dict: Dict[TensorKey, np.ndarray], + next_round_tensor_key_dict: Dict[TensorKey, np.ndarray], + round_number: int, + best_score: float, + ): + """Finalize a training round by saving tensors, preparing for the next round, + and updating metadata in the database. + + This function performs the following steps as a single transaction: + 1. Persist the tensors of the current round into the database. + 2. Persist the tensors for the next training round into the database. + 3. Reinitialize the task results table to prepare for new tasks. + 4. Update the round number and best score in the key-value store. + + If any step fails, the transaction is rolled back to ensure data integrity. + + Args: + tensor_key_dict (Dict[TensorKey, np.ndarray]): + A dictionary mapping tensor keys to their corresponding + NumPy arrays for the current round. + next_round_tensor_key_dict (Dict[TensorKey, np.ndarray]): + A dictionary mapping tensor keys to their corresponding + NumPy arrays for the next round. + round_number (int): + The current training round number. + best_score (float): + The best score achieved during the current round. + + Raises: + RuntimeError: If an error occurs during the transaction, the transaction is rolled back, + and a RuntimeError is raised with the details of the failure. 
+ """ + with self.lock: + try: + # Begin transaction + cursor = self.conn.cursor() + cursor.execute("BEGIN TRANSACTION") + self._persist_tensors(cursor, PersistentTensorDB.TENSORS_TABLE, tensor_key_dict) + self._persist_next_round_tensors(cursor, next_round_tensor_key_dict) + self._init_task_results_table(cursor) + self._save_round_and_best_score(cursor, round_number, best_score) + # Commit transaction + self.conn.commit() + logger.info( + f"Committed model for round {round_number}, saved {len(tensor_key_dict)}" + f" model tensors and {len(next_round_tensor_key_dict)}" + f" next round model tensors with best_score {best_score}" + ) + except Exception as e: + # Rollback transaction in case of an error + self.conn.rollback() + raise RuntimeError(f"Failed to finalize round: {e}") + + def _persist_tensors( + self, cursor, table_name, tensor_key_dict: Dict[TensorKey, np.ndarray] + ) -> None: + """Insert a dictionary of tensors into the SQLite as part of transaction""" + for tensor_key, nparray in tensor_key_dict.items(): + tensor_name, origin, fl_round, report, tags = tensor_key + serialized_array = self._serialize_array(nparray) + serialized_tags = json.dumps(tags) + query = f""" + INSERT INTO {table_name} (tensor_name, origin, round, report, tags, nparray) + VALUES (?, ?, ?, ?, ?, ?) + """ + cursor.execute( + query, + (tensor_name, origin, fl_round, int(report), serialized_tags, serialized_array), + ) + + def _persist_next_round_tensors( + self, cursor, tensor_key_dict: Dict[TensorKey, np.ndarray] + ) -> None: + """Persisting the last round next_round tensors.""" + drop_table_query = f"DROP TABLE IF EXISTS {PersistentTensorDB.NEXT_ROUND_TENSORS_TABLE}" + cursor.execute(drop_table_query) + self._create_model_tensors_table(cursor, PersistentTensorDB.NEXT_ROUND_TENSORS_TABLE) + self._persist_tensors(cursor, PersistentTensorDB.NEXT_ROUND_TENSORS_TABLE, tensor_key_dict) + + def _init_task_results_table(self, cursor): + """ + Creates a table for storing task results. Drops the table first if it already exists. + """ + drop_table_query = "DROP TABLE IF EXISTS task_results" + cursor.execute(drop_table_query) + self._create_task_results_table(cursor) + + def _save_round_and_best_score(self, cursor, round_number: int, best_score: float) -> None: + """Save the round number and best score as key-value pairs in the database.""" + # Create a table with key-value structure where values can be integer or float + # Insert or update the round_number + cursor.execute( + """ + INSERT OR REPLACE INTO key_value_store (key, value) + VALUES (?, ?) + """, + ("round_number", float(round_number)), + ) + + # Insert or update the best_score + cursor.execute( + """ + INSERT OR REPLACE INTO key_value_store (key, value) + VALUES (?, ?) 
+ """, + ("best_score", float(best_score)), + ) + + def get_tensors_table_name(self) -> str: + return PersistentTensorDB.TENSORS_TABLE + + def get_next_round_tensors_table_name(self) -> str: + return PersistentTensorDB.NEXT_ROUND_TENSORS_TABLE + + def load_tensors(self, tensor_table) -> Dict[TensorKey, np.ndarray]: + """Load all tensors from the SQLite database and return them as a dictionary.""" + tensor_dict = {} + with self.lock: + cursor = self.conn.cursor() + query = f"SELECT tensor_name, origin, round, report, tags, nparray FROM {tensor_table}" + cursor.execute(query) + rows = cursor.fetchall() + for row in rows: + tensor_name, origin, fl_round, report, tags, nparray = row + # Deserialize the JSON string back to a Python list + deserialized_tags = tuple(json.loads(tags)) + tensor_key = TensorKey(tensor_name, origin, fl_round, report, deserialized_tags) + tensor_dict[tensor_key] = self._deserialize_array(nparray) + return tensor_dict + + def get_round_and_best_score(self) -> tuple[int, float]: + """Retrieve the round number and best score from the database.""" + with self.lock: + cursor = self.conn.cursor() + # Fetch the round_number + cursor.execute( + """ + SELECT value FROM key_value_store WHERE key = ? + """, + ("round_number",), + ) + round_number = cursor.fetchone() + if round_number is None: + round_number = -1 + else: + round_number = int(round_number[0]) # Cast to int + + # Fetch the best_score + cursor.execute( + """ + SELECT value FROM key_value_store WHERE key = ? + """, + ("best_score",), + ) + best_score = cursor.fetchone() + if best_score is None: + best_score = 0 + else: + best_score = float(best_score[0]) # Cast to float + return round_number, best_score + + def clean_up(self, remove_older_than: int = 1) -> None: + """Remove old entries from the database.""" + if remove_older_than < 0: + return + with self.lock: + cursor = self.conn.cursor() + query = f"SELECT MAX(round) FROM {PersistentTensorDB.TENSORS_TABLE}" + cursor.execute(query) + current_round = cursor.fetchone()[0] + if current_round is None: + return + cursor.execute( + """ + DELETE FROM tensors + WHERE round <= ? AND report = 0 + """, + (current_round - remove_older_than,), + ) + self.conn.commit() + + def close(self) -> None: + """Close the SQLite database connection.""" + self.conn.close() + + def is_task_table_empty(self) -> bool: + """Check if the task table is empty.""" + with self.lock: + cursor = self.conn.cursor() + cursor.execute("SELECT COUNT(*) FROM task_results") + count = cursor.fetchone()[0] + return count == 0 diff --git a/openfl/databases/tensor_db.py b/openfl/databases/tensor_db.py index 1b9d5ea132..f64f4d783d 100644 --- a/openfl/databases/tensor_db.py +++ b/openfl/databases/tensor_db.py @@ -151,6 +151,39 @@ def get_tensor_from_cache(self, tensor_key: TensorKey) -> Optional[np.ndarray]: return None return np.array(df["nparray"].iloc[0]) + def get_tensors_by_round_and_tags(self, fl_round: int, tags: tuple) -> dict: + """Retrieve all tensors that match the specified round and tags. + + Args: + fl_round (int): The round number to filter tensors. + tags (tuple): The tags to filter tensors. + + Returns: + dict: A dictionary where the keys are TensorKey objects and the values are numpy arrays. 
+ """ + # Filter the DataFrame based on the round and tags + df = self.tensor_db[ + (self.tensor_db["round"] == fl_round) & (self.tensor_db["tags"] == tags) + ] + + # Check if any tensors match the criteria + if len(df) == 0: + return {} + + # Construct a dictionary mapping TensorKey to np.ndarray + tensor_dict = {} + for _, row in df.iterrows(): + tensor_key = TensorKey( + tensor_name=row["tensor_name"], + origin=row["origin"], + round_number=row["round"], + report=row["report"], + tags=row["tags"], + ) + tensor_dict[tensor_key] = np.array(row["nparray"]) + + return tensor_dict + def get_aggregated_tensor( self, tensor_key: TensorKey, @@ -180,9 +213,9 @@ def get_aggregated_tensor( None: if not all values are present. """ if len(collaborator_weight_dict) != 0: - assert ( - np.abs(1.0 - sum(collaborator_weight_dict.values())) < 0.01 - ), f"Collaborator weights do not sum to 1.0: {collaborator_weight_dict}" + assert np.abs(1.0 - sum(collaborator_weight_dict.values())) < 0.01, ( + f"Collaborator weights do not sum to 1.0: {collaborator_weight_dict}" + ) collaborator_names = collaborator_weight_dict.keys() agg_tensor_dict = {} diff --git a/openfl/experimental/workflow/component/aggregator/aggregator.py b/openfl/experimental/workflow/component/aggregator/aggregator.py index 568c3246fa..1e818b528a 100644 --- a/openfl/experimental/workflow/component/aggregator/aggregator.py +++ b/openfl/experimental/workflow/component/aggregator/aggregator.py @@ -461,9 +461,7 @@ def send_task_results( f" for the wrong round: {round_number}. Ignoring..." ) else: - logger.info( - f"Collaborator {collab_name} sent task results" f" for round {round_number}." - ) + logger.info(f"Collaborator {collab_name} sent task results for round {round_number}.") # Unpickle the clone (FLSpec object) clone = dill.loads(clone_bytes) # Update the clone in clones_dict dictionary diff --git a/openfl/experimental/workflow/component/collaborator/collaborator.py b/openfl/experimental/workflow/component/collaborator/collaborator.py index 0cbb1de069..b5d112c8da 100644 --- a/openfl/experimental/workflow/component/collaborator/collaborator.py +++ b/openfl/experimental/workflow/component/collaborator/collaborator.py @@ -161,7 +161,7 @@ def send_task_results(self, next_step: str, clone: Any) -> None: None """ self.logger.info( - f"Round {self.round_number}," f" collaborator {self.name} is sending results..." + f"Round {self.round_number}, collaborator {self.name} is sending results..." 
) self.client.send_task_results(self.name, self.round_number, next_step, dill.dumps(clone)) diff --git a/openfl/experimental/workflow/federated/plan/plan.py b/openfl/experimental/workflow/federated/plan/plan.py index 5e81a91a9d..dc03c2d0a9 100644 --- a/openfl/experimental/workflow/federated/plan/plan.py +++ b/openfl/experimental/workflow/federated/plan/plan.py @@ -169,8 +169,7 @@ def parse( except Exception: Plan.logger.exception( - f"Parsing Federated Learning Plan : " - f"[red]FAILURE[/] : [blue]{plan_config_path}[/].", + f"Parsing Federated Learning Plan : [red]FAILURE[/] : [blue]{plan_config_path}[/].", extra={"markup": True}, ) raise @@ -235,8 +234,7 @@ def import_(template) -> object: class_name = splitext(template)[1].strip(".") module_path = splitext(template)[0] Plan.logger.info( - f"Importing [red]🡆[/] Object [red]{class_name}[/] " - f"from [red]{module_path}[/] Module.", + f"Importing [red]🡆[/] Object [red]{class_name}[/] from [red]{module_path}[/] Module.", extra={"markup": True}, ) module = import_module(module_path) diff --git a/openfl/experimental/workflow/interface/cli/aggregator.py b/openfl/experimental/workflow/interface/cli/aggregator.py index b51b78f480..5db5d30212 100644 --- a/openfl/experimental/workflow/interface/cli/aggregator.py +++ b/openfl/experimental/workflow/interface/cli/aggregator.py @@ -99,7 +99,7 @@ def start_(plan, authorized_cols, secure): "--fqdn", required=False, type=click_types.FQDN, - help=f"The fully qualified domain name of" f" aggregator node [{getfqdn_env()}]", + help=f"The fully qualified domain name of aggregator node [{getfqdn_env()}]", default=getfqdn_env(), ) def _generate_cert_request(fqdn): @@ -118,8 +118,8 @@ def generate_cert_request(fqdn): echo( f"Creating AGGREGATOR certificate key pair with following settings: " - f'CN={style(common_name, fg="red")},' - f' SAN={style(subject_alternative_name, fg="red")}' + f"CN={style(common_name, fg='red')}," + f" SAN={style(subject_alternative_name, fg='red')}" ) server_private_key, server_csr = generate_csr(common_name, server=True) diff --git a/openfl/experimental/workflow/interface/cli/collaborator.py b/openfl/experimental/workflow/interface/cli/collaborator.py index fe0cb32940..57e282faa4 100644 --- a/openfl/experimental/workflow/interface/cli/collaborator.py +++ b/openfl/experimental/workflow/interface/cli/collaborator.py @@ -126,8 +126,8 @@ def generate_cert_request(collaborator_name, silent, skip_package): echo( f"Creating COLLABORATOR certificate key pair with following settings: " - f'CN={style(common_name, fg="red")},' - f' SAN={style(subject_alternative_name, fg="red")}' + f"CN={style(common_name, fg='red')}," + f" SAN={style(subject_alternative_name, fg='red')}" ) client_private_key, client_csr = generate_csr(common_name, server=False) @@ -164,7 +164,7 @@ def generate_cert_request(collaborator_name, silent, skip_package): make_archive(archive_name, archive_type, tmp_dir) rmtree(tmp_dir) - echo(f"Archive {archive_file_name} with certificate signing" f" request created") + echo(f"Archive {archive_file_name} with certificate signing request created") echo( "This file should be sent to the certificate authority" " (typically hosted by the aggregator) for signing" @@ -233,14 +233,14 @@ def register_collaborator(file_name): "-r", "--request-pkg", type=ClickPath(exists=True), - help="The archive containing the certificate signing" " request (*.zip) for a collaborator", + help="The archive containing the certificate signing request (*.zip) for a collaborator", ) @option( "-i", "--import", 
"import_", type=ClickPath(exists=True), - help="Import the archive containing the collaborator's" " certificate (signed by the CA)", + help="Import the archive containing the collaborator's certificate (signed by the CA)", ) def certify_(collaborator_name, silent, request_pkg, import_): """Certify the collaborator.""" diff --git a/openfl/experimental/workflow/interface/cli/plan.py b/openfl/experimental/workflow/interface/cli/plan.py index 9f40db67de..5c30955398 100644 --- a/openfl/experimental/workflow/interface/cli/plan.py +++ b/openfl/experimental/workflow/interface/cli/plan.py @@ -85,8 +85,7 @@ def initialize(context, plan_config, cols_config, data_config, aggregator_addres plan_origin["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() logger.warn( - f"Patching Aggregator Addr in Plan" - f" 🠆 {plan_origin['network']['settings']['agg_addr']}" + f"Patching Aggregator Addr in Plan 🠆 {plan_origin['network']['settings']['agg_addr']}" ) Plan.dump(plan_config, plan_origin) diff --git a/openfl/experimental/workflow/interface/cli/workspace.py b/openfl/experimental/workflow/interface/cli/workspace.py index 1d65d0dde0..fb38786b46 100644 --- a/openfl/experimental/workflow/interface/cli/workspace.py +++ b/openfl/experimental/workflow/interface/cli/workspace.py @@ -258,7 +258,7 @@ def export_(pip_install_options: Tuple[str]): if confirm("Create a default '.workspace' file?"): copy2(WORKSPACE / "workspace" / ".workspace", tmp_dir) else: - echo("To proceed, you must have a '.workspace' " "file in the current directory.") + echo("To proceed, you must have a '.workspace' file in the current directory.") raise # Create Zip archive of directory diff --git a/openfl/experimental/workflow/runtime/federated_runtime.py b/openfl/experimental/workflow/runtime/federated_runtime.py index 604f13ce88..efa90e2a24 100644 --- a/openfl/experimental/workflow/runtime/federated_runtime.py +++ b/openfl/experimental/workflow/runtime/federated_runtime.py @@ -193,8 +193,12 @@ def get_flow_state(self) -> Tuple[bool, Any]: return status, flow_object - def get_envoys(self) -> None: - """Prints the status of Envoys in a formatted way.""" + def get_envoys(self) -> List[str]: + """ + Prints the status of Envoys in a formatted way. + Returns: + online_envoys (List[str]): List of online envoys. + """ # Fetch envoy data envoys = self._dir_client.get_envoys() DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" @@ -204,6 +208,7 @@ def get_envoys(self) -> None: headers = ["Name", "Online", "Last Updated", "Experiment Running", "Experiment Name"] # Prepare the table rows rows = [] + online_envoys = [] for envoy in envoys.envoy_infos: rows.append( [ @@ -214,11 +219,15 @@ def get_envoys(self) -> None: envoy.experiment_name if envoy.experiment_name else "None", ] ) + if envoy.is_online: + online_envoys.append(envoy.envoy_name) + # Use tabulate to format the table result = tabulate(rows, headers=headers, tablefmt="grid") # Display the current timestamp print(f"Status of Envoys connected to Federation at: {now}\n") print(result) + return online_envoys def stream_experiment_stdout(self, experiment_name) -> None: """Stream experiment stdout. 
@@ -232,9 +241,9 @@ def stream_experiment_stdout(self, experiment_name) -> None: print(f"Getting standard output for experiment: {experiment_name}...") for stdout_message_dict in self._dir_client.stream_experiment_stdout(experiment_name): print( - f'Origin: {stdout_message_dict["stdout_origin"]}, ' - f'Task: {stdout_message_dict["task_name"]}' - f'\n{stdout_message_dict["stdout_value"]}' + f"Origin: {stdout_message_dict['stdout_origin']}, " + f"Task: {stdout_message_dict['task_name']}" + f"\n{stdout_message_dict['stdout_value']}" ) def __repr__(self) -> str: diff --git a/openfl/experimental/workflow/workspace_export/export.py b/openfl/experimental/workflow/workspace_export/export.py index 3975e83e44..809be19bd5 100644 --- a/openfl/experimental/workflow/workspace_export/export.py +++ b/openfl/experimental/workflow/workspace_export/export.py @@ -88,7 +88,6 @@ def __init__(self, notebook_path: str, output_workspace: str) -> None: f"{export_filename}.py", ) ).resolve() - print_tree(self.created_workspace_path, level=2) # Generated python script name without .py extension self.script_name = self.script_path.name.split(".")[0].strip() @@ -290,6 +289,8 @@ def export_federated(cls, notebook_path: str, output_workspace: str) -> Tuple[st instance = cls(notebook_path, output_workspace) instance.generate_requirements() instance.generate_plan_yaml() + instance._clean_generated_workspace() + print_tree(output_workspace, level=2) return instance.generate_experiment_archive() @classmethod @@ -304,6 +305,7 @@ def export(cls, notebook_path: str, output_workspace: str) -> None: instance.generate_requirements() instance.generate_plan_yaml() instance.generate_data_yaml() + print_tree(output_workspace, level=2) def generate_experiment_archive(self) -> Tuple[str, str]: """ @@ -357,6 +359,20 @@ def generate_requirements(self) -> None: if i not in line_nos: f.write(line) + def _clean_generated_workspace(self) -> None: + """ + Remove cols.yaml and data.yaml from the generated workspace + as these are not needed in FederatedRuntime (Director based workflow) + + """ + cols_file = self.output_workspace_path.joinpath("plan", "cols.yaml") + data_file = self.output_workspace_path.joinpath("plan", "data.yaml") + + if cols_file.exists(): + cols_file.unlink() + if data_file.exists(): + data_file.unlink() + def generate_plan_yaml(self) -> None: """ Generates plan.yaml @@ -503,8 +519,7 @@ def generate_data_yaml(self) -> None: # noqa: C901 runtime_created = True if not runtime_collab_created: f.write( - f"\nruntime_collaborators = " - f"{runtime_name}._LocalRuntime__collaborators" + f"\nruntime_collaborators = {runtime_name}._LocalRuntime__collaborators" ) runtime_collab_created = True f.write( @@ -512,8 +527,7 @@ def generate_data_yaml(self) -> None: # noqa: C901 f"runtime_collaborators['{collab_name}'].private_attributes" ) data[collab_name] = { - "private_attributes": f"src." 
- f"{self.script_name}.{collab_name}_private_attributes" + "private_attributes": f"src.{self.script_name}.{collab_name}_private_attributes" } self.__write_yaml(data_yaml, data) diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 69ff36c19c..13d446e145 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -163,8 +163,7 @@ def parse( # noqa: C901 if gandlf_config_path is not None: Plan.logger.info( - f"Importing GaNDLF Config into plan " - f"from file [red]{gandlf_config_path}[/].", + f"Importing GaNDLF Config into plan from file [red]{gandlf_config_path}[/].", extra={"markup": True}, ) @@ -201,8 +200,7 @@ def parse( # noqa: C901 except Exception: Plan.logger.exception( - f"Parsing Federated Learning Plan : " - f"[red]FAILURE[/] : [blue]{plan_config_path}[/].", + f"Parsing Federated Learning Plan : [red]FAILURE[/] : [blue]{plan_config_path}[/].", extra={"markup": True}, ) raise @@ -248,8 +246,7 @@ def import_(template): class_name = splitext(template)[1].strip(".") module_path = splitext(template)[0] Plan.logger.info( - f"Importing [red]🡆[/] Object [red]{class_name}[/] " - f"from [red]{module_path}[/] Module.", + f"Importing [red]🡆[/] Object [red]{class_name}[/] from [red]{module_path}[/] Module.", extra={"markup": True}, ) module = import_module(module_path) diff --git a/openfl/federated/task/runner_gandlf.py b/openfl/federated/task/runner_gandlf.py index b352d01439..e8662ce1da 100644 --- a/openfl/federated/task/runner_gandlf.py +++ b/openfl/federated/task/runner_gandlf.py @@ -738,7 +738,7 @@ def to_cpu_numpy(state): # When restoring, we currently assume all values are tensors. if not pt.is_tensor(v): raise ValueError( - "We do not currently support non-tensors " "coming from model.state_dict()" + "We do not currently support non-tensors coming from model.state_dict()" ) # get as a numpy array, making sure is on cpu state[k] = v.cpu().numpy() diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py index e2dd069f72..c7803cb0eb 100644 --- a/openfl/federated/task/runner_keras.py +++ b/openfl/federated/task/runner_keras.py @@ -182,7 +182,16 @@ def train_(self, batch_generator, metrics: list = None, **kwargs): # initialization (build_model). # If metrics are added (i.e. not a subset of what was originally # defined) then the model must be recompiled. - results = self.model.get_metrics_result() + try: + results = self.model.get_metrics_result() + except ValueError: + if "batch_size" in kwargs: + batch_size = kwargs["batch_size"] + else: + batch_size = 1 + # evaluation needed before metrics can be resolved + self.model.evaluate(self.data_loader.get_valid_loader(batch_size), verbose=1) + results = self.model.get_metrics_result() # TODO if there are new metrics in the flplan that were not included # in the originally diff --git a/openfl/federated/task/runner_pt.py b/openfl/federated/task/runner_pt.py index 30b56a4120..9b1a8021f4 100644 --- a/openfl/federated/task/runner_pt.py +++ b/openfl/federated/task/runner_pt.py @@ -714,7 +714,7 @@ def to_cpu_numpy(state): # When restoring, we currently assume all values are tensors. 
if not torch.is_tensor(v): raise ValueError( - "We do not currently support non-tensors " "coming from model.state_dict()" + "We do not currently support non-tensors coming from model.state_dict()" ) # get as a numpy array, making sure is on cpu state[k] = v.cpu().numpy() diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index a5f5101b2e..0b6de32d18 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -29,9 +29,9 @@ def check_precision_loss(logger, converted_data, original_data): reconstructed_json = reconstructed_bytes.decode("utf-8") reconstructed_data = json.loads(reconstructed_json) - assert type(original_data) is type( - reconstructed_data - ), "Reconstructed datatype does not match original." + assert type(original_data) is type(reconstructed_data), ( + "Reconstructed datatype does not match original." + ) # Compare the original and reconstructed data if original_data != reconstructed_data: diff --git a/openfl/interface/aggregator.py b/openfl/interface/aggregator.py index 80ce56e32e..2297cc4d2b 100644 --- a/openfl/interface/aggregator.py +++ b/openfl/interface/aggregator.py @@ -1,18 +1,33 @@ # Copyright 2020-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - """Aggregator module.""" import sys from logging import getLogger from pathlib import Path -from click import Path as ClickPath -from click import confirm, echo, group, option, pass_context, style +from click import ( + Path as ClickPath, +) +from click import ( + confirm, + echo, + group, + option, + pass_context, + style, +) from openfl.cryptography.ca import sign_certificate -from openfl.cryptography.io import get_csr_hash, read_crt, read_csr, read_key, write_crt, write_key +from openfl.cryptography.io import ( + get_csr_hash, + read_crt, + read_csr, + read_key, + write_crt, + write_key, +) from openfl.cryptography.participant import generate_csr from openfl.federated import Plan from openfl.interface.cli_helper import CERT_DIR @@ -52,9 +67,20 @@ def aggregator(context): default="plan/cols.yaml", type=ClickPath(exists=True), ) -def start_(plan, authorized_cols): - """Start the aggregator service.""" +@option( + "--task_group", + required=False, + default="learning", + help="Selected task-group for assignment - defaults to learning", +) +def start_(plan, authorized_cols, task_group): + """Start the aggregator service. 
+
+    Args:
+        plan (str): Path to plan config file
+        authorized_cols (str): Path to authorized collaborators file
+        task_group (str): Selected task-group for assignment - defaults to 'learning'
+    """
     if is_directory_traversal(plan):
         echo("Federated learning plan path is out of the openfl workspace scope.")
         sys.exit(1)
@@ -62,14 +88,21 @@ def start_(plan, authorized_cols):
         echo("Authorized collaborator list file path is out of the openfl workspace scope.")
         sys.exit(1)

-    plan = Plan.parse(
+    # Parse the plan and override the aggregator task_group
+    parsed_plan = Plan.parse(
         plan_config_path=Path(plan).absolute(),
         cols_config_path=Path(authorized_cols).absolute(),
     )

+    # Set task_group in aggregator settings
+    if "settings" not in parsed_plan.config["aggregator"]:
+        parsed_plan.config["aggregator"]["settings"] = {}
+    parsed_plan.config["aggregator"]["settings"]["task_group"] = task_group
+    logger.info(f"Setting aggregator to assign the {task_group} task_group")
+
     logger.info("🧿 Starting the Aggregator Service.")

-    plan.get_server().serve()
+    parsed_plan.get_server().serve()


 @aggregator.command(name="generate-cert-request")
@@ -77,7 +110,7 @@ def start_(plan, authorized_cols):
     "--fqdn",
     required=False,
     type=click_types.FQDN,
-    help=f"The fully qualified domain name of" f" aggregator node [{getfqdn_env()}]",
+    help=f"The fully qualified domain name of aggregator node [{getfqdn_env()}]",
     default=getfqdn_env(),
 )
 def _generate_cert_request(fqdn):
@@ -101,8 +134,8 @@ def generate_cert_request(fqdn):

     echo(
         f"Creating AGGREGATOR certificate key pair with following settings: "
-        f'CN={style(common_name, fg="red")},'
-        f' SAN={style(subject_alternative_name, fg="red")}'
+        f"CN={style(common_name, fg='red')},"
+        f" SAN={style(subject_alternative_name, fg='red')}"
     )

     server_private_key, server_csr = generate_csr(common_name, server=True)
diff --git a/openfl/interface/cli.py b/openfl/interface/cli.py
index 314cfebf3a..75cbbac803 100755
--- a/openfl/interface/cli.py
+++ b/openfl/interface/cli.py
@@ -148,13 +148,12 @@ def format_help(self, ctx, formatter):
             help_str = cmd.get_short_help_str()
             if level == 0:
                 formatter.write(
-                    f'\n{style(name, fg="blue", bold=True):<30}'
-                    f" {style(help_str, bold=True)}" + "\n"
+                    f"\n{style(name, fg='blue', bold=True):<30} {style(help_str, bold=True)}" + "\n"
                 )
                 formatter.write("─" * 80 + "\n")
             if level == 1:
                 formatter.write(
-                    f'  {style("*", fg="green")}' f' {style(name, fg="cyan"):<21} {help_str}' + "\n"
+                    f"  {style('*', fg='green')} {style(name, fg='cyan'):<21} {help_str}" + "\n"
                 )
diff --git a/openfl/interface/collaborator.py b/openfl/interface/collaborator.py
index 862ae4db84..81b76b68f8 100644
--- a/openfl/interface/collaborator.py
+++ b/openfl/interface/collaborator.py
@@ -208,8 +208,8 @@ def generate_cert_request(collaborator_name, silent, skip_package):

     echo(
         f"Creating COLLABORATOR certificate key pair with following settings: "
-        f'CN={style(common_name, fg="red")},'
-        f' SAN={style(subject_alternative_name, fg="red")}'
+        f"CN={style(common_name, fg='red')},"
+        f" SAN={style(subject_alternative_name, fg='red')}"
     )

     client_private_key, client_csr = generate_csr(common_name, server=False)
@@ -246,7 +246,7 @@ def generate_cert_request(collaborator_name, silent, skip_package):
     make_archive(archive_name, archive_type, tmp_dir)
     rmtree(tmp_dir)

-    echo(f"Archive {archive_file_name} with certificate signing" f" request created")
+    echo(f"Archive {archive_file_name} with certificate signing request created")
     echo(
         "This file should be sent to the certificate authority"
         " (typically hosted by
the aggregator) for signing"
     )

@@ -322,14 +322,14 @@ def register_collaborator(file_name):
     "-r",
     "--request-pkg",
     type=ClickPath(exists=True),
-    help="The archive containing the certificate signing" " request (*.zip) for a collaborator",
+    help="The archive containing the certificate signing request (*.zip) for a collaborator",
 )
 @option(
     "-i",
     "--import",
     "import_",
     type=ClickPath(exists=True),
-    help="Import the archive containing the collaborator's" " certificate (signed by the CA)",
+    help="Import the archive containing the collaborator's certificate (signed by the CA)",
 )
 def certify_(collaborator_name, silent, request_pkg, import_):
     """Certify the collaborator."""
diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py
index 4e35391bba..0fd5eb0cd7 100644
--- a/openfl/interface/envoy.py
+++ b/openfl/interface/envoy.py
@@ -147,7 +147,7 @@ def start_(
     for plugin_name, plugin_settings in optional_plugins_section.items():
         template = plugin_settings.get("template")
         if not template:
-            raise Exception("You should put a template" f"for plugin {plugin_name}")
+            raise Exception(f"You should put a template for plugin {plugin_name}")

         module_path, _, class_name = template.rpartition(".")
         plugin_params = plugin_settings.get("params", {})
@@ -221,7 +221,7 @@ def shard_descriptor_from_config(shard_config: dict):
     """
     template = shard_config.get("template")
     if not template:
-        raise Exception("You should define a shard " "descriptor template in the envoy config")
+        raise Exception("You should define a shard descriptor template in the envoy config")

     class_name = template.split(".")[-1]
     module_path = ".".join(template.split(".")[:-1])
     params = shard_config.get("params", {})
diff --git a/openfl/interface/interactive_api/experiment.py b/openfl/interface/interactive_api/experiment.py
index d8096df8ad..0a9b383b2f 100644
--- a/openfl/interface/interactive_api/experiment.py
+++ b/openfl/interface/interactive_api/experiment.py
@@ -103,6 +103,7 @@ def _initialize_plan(self):
         # Create a folder to store plans
         os.makedirs("./plan", exist_ok=True)
         os.makedirs("./save", exist_ok=True)
+        os.makedirs("./local_state", exist_ok=True)
         # Load the default plan
         base_plan_path = WORKSPACE / "workspace/plan/plans/default/base_plan_interactive_api.yaml"
         plan = Plan.parse(base_plan_path, resolve=False)
@@ -115,7 +116,7 @@ def _assert_experiment_submitted(self):
         """Assure experiment is sent to director and accepted."""
         if not self.experiment_submitted:
             self.logger.error("The experiment was not submitted to a Director service.")
-            self.logger.error("Report the experiment first: " "use the Experiment.start() method.")
+            self.logger.error("Report the experiment first: use the Experiment.start() method.")
             return False
         return True

@@ -192,10 +193,10 @@ def stream_metrics(self, tensorboard_logs: bool = True) -> None:
             return
         for metric_message_dict in self.federation.dir_client.stream_metrics(self.experiment_name):
             self.logger.metric(
-                f'Round {metric_message_dict["round"]}, '
-                f'collaborator {metric_message_dict["metric_origin"]} '
-                f'{metric_message_dict["task_name"]} result '
-                f'{metric_message_dict["metric_name"]}:\t{metric_message_dict["metric_value"]:f}'
+                f"Round {metric_message_dict['round']}, "
+                f"collaborator {metric_message_dict['metric_origin']} "
+                f"{metric_message_dict['task_name']} result "
+                f"{metric_message_dict['metric_name']}:\t{metric_message_dict['metric_value']:f}"
             )

             if tensorboard_logs:
@@ -209,7 +210,7 @@ def write_tensorboard_metric(self, metric: dict) -> None:
             self.summary_writer = 
SummaryWriter(f"./logs/{self.experiment_name}", flush_secs=5)

         self.summary_writer.add_scalar(
-            f'{metric["metric_origin"]}/{metric["task_name"]}/{metric["metric_name"]}',
+            f"{metric['metric_origin']}/{metric['task_name']}/{metric['metric_name']}",
             metric["metric_value"],
             metric["round"],
         )
@@ -393,8 +394,7 @@ def define_task_assigner(self, task_keeper, rounds_to_train):  # noqa: C901
     if not is_train_task_exist and rounds_to_train != 1:
         # Since we have only validation tasks, we do not have to train it multiple times
         raise Exception(
-            "Variable rounds_to_train must be equal 1, "
-            "because only validation tasks were given"
+            "Variable rounds_to_train must be equal to 1, because only validation tasks were given"
         )
     if is_train_task_exist and self.is_validate_task_exist:
@@ -816,7 +816,7 @@ def set_aggregation_function(self, aggregation_function: AggregationFunction):
     def decorator_with_args(training_method):
         if not isinstance(aggregation_function, AggregationFunction):
             raise Exception(
-                "aggregation_function must implement " "AggregationFunction interface."
+                "aggregation_function must implement the AggregationFunction interface."
             )
         self.aggregation_functions[training_method.__name__] = aggregation_function
         return training_method
diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py
index 93d09c02a5..503693e581 100644
--- a/openfl/interface/plan.py
+++ b/openfl/interface/plan.py
@@ -169,7 +169,7 @@ def initialize(
     # This is needed to bypass data being locally available
     if input_shape is not None:
         logger.info(
-            "Attempting to generate initial model weights with" f" custom input shape {input_shape}"
+            f"Attempting to generate initial model weights with custom input shape {input_shape}"
         )

     data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape)
@@ -237,7 +237,7 @@ def freeze_plan(plan_config):

     init_state_path = plan.config["aggregator"]["settings"]["init_state_path"]
     if not Path(init_state_path).exists():
-        logger.info("Plan has not been initialized! Run 'fx plan" " initialize' before proceeding")
+        logger.info("Plan has not been initialized!
Run 'fx plan initialize' before proceeding") return Plan.dump(Path(plan_config), plan.config, freeze=True) diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py index d3cb1713c5..522ff99b5f 100644 --- a/openfl/interface/workspace.py +++ b/openfl/interface/workspace.py @@ -66,6 +66,7 @@ def create_dirs(prefix): (prefix / "data").mkdir(parents=True, exist_ok=True) # training data (prefix / "logs").mkdir(parents=True, exist_ok=True) # training logs (prefix / "save").mkdir(parents=True, exist_ok=True) # model weight saves / initialization + (prefix / "local_state").mkdir(parents=True, exist_ok=True) # persistent state (prefix / "src").mkdir(parents=True, exist_ok=True) # model code shutil.copyfile(WORKSPACE / "workspace" / ".workspace", prefix / ".workspace") @@ -354,6 +355,7 @@ def export_() -> str: # os.makedirs(os.path.join(tmp_dir, 'save'), exist_ok=True) os.makedirs(os.path.join(tmp_dir, "logs"), exist_ok=True) os.makedirs(os.path.join(tmp_dir, "data"), exist_ok=True) + os.makedirs(os.path.join(tmp_dir, "local_state"), exist_ok=True) shutil.copytree("src", os.path.join(tmp_dir, "src"), ignore=ignore) shutil.copytree("plan", os.path.join(tmp_dir, "plan"), ignore=ignore) shutil.copytree("save", os.path.join(tmp_dir, "save")) @@ -363,7 +365,7 @@ def export_() -> str: if not os.path.isfile(_ws_identifier_file): openfl_ws_identifier_file = os.path.join(WORKSPACE, "workspace", _ws_identifier_file) logging.warning( - f"`{_ws_identifier_file}` is missing, " f"copying {openfl_ws_identifier_file} as-is." + f"`{_ws_identifier_file}` is missing, copying {openfl_ws_identifier_file} as-is." ) shutil.copy2(openfl_ws_identifier_file, tmp_dir) shutil.copy2(_ws_identifier_file, tmp_dir) @@ -428,10 +430,7 @@ def dockerize_(context, save: bool, rebuild: bool, enclave_key: str, revision: s # Build OpenFL base image. 
logging.info("Building OpenFL Base image") base_image_build_cmd = ( - "DOCKER_BUILDKIT=1 docker build {options} " - "-t {image_name} " - "-f {dockerfile} " - "{build_context}" + "DOCKER_BUILDKIT=1 docker build {options} -t {image_name} -f {dockerfile} {build_context}" ).format( options=options, image_name="openfl", diff --git a/openfl/native/fastestimator.py b/openfl/native/fastestimator.py index a95db185d0..d91b1ebc01 100644 --- a/openfl/native/fastestimator.py +++ b/openfl/native/fastestimator.py @@ -82,7 +82,7 @@ def fit(self): # noqa: C901 tensor_dict=tensor_dict, round_number=0, tensor_pipe=tensor_pipe ) - self.logger.info(f"Creating Initial Weights File" f" 🠆 {init_state_path}") + self.logger.info(f"Creating Initial Weights File 🠆 {init_state_path}") utils.dump_proto(model_proto=model_snap, fpath=init_state_path) diff --git a/openfl/pipelines/tensor_codec.py b/openfl/pipelines/tensor_codec.py index be08bbbc1c..15edde0965 100644 --- a/openfl/pipelines/tensor_codec.py +++ b/openfl/pipelines/tensor_codec.py @@ -114,9 +114,9 @@ def decompress( tensor_name, origin, round_number, report, tags = tensor_key assert len(transformer_metadata) > 0, "metadata must be included for decompression" - assert ("compressed" in tags) or ( - "lossy_compressed" in tags - ), "Cannot decompress an uncompressed tensor" + assert ("compressed" in tags) or ("lossy_compressed" in tags), ( + "Cannot decompress an uncompressed tensor" + ) if require_lossless: assert "compressed" in tags, "Cannot losslessly decompress lossy tensor" @@ -169,8 +169,7 @@ def generate_delta(tensor_key, nparray, base_model_nparray): f"layer shape of ({base_model_nparray.shape})" ) assert "model" not in tags, ( - "The tensorkey should be provided " - "from the layer with new weights, not the base model" + "The tensorkey should be provided from the layer with new weights, not the base model" ) new_tags = change_tags(tags, add_field="delta") delta_tensor_key = TensorKey(tensor_name, origin, round_number, report, new_tags) diff --git a/openfl/plugins/frameworks_adapters/pytorch_adapter.py b/openfl/plugins/frameworks_adapters/pytorch_adapter.py index 60a6db54f4..21ff26c7fb 100644 --- a/openfl/plugins/frameworks_adapters/pytorch_adapter.py +++ b/openfl/plugins/frameworks_adapters/pytorch_adapter.py @@ -271,7 +271,7 @@ def to_cpu_numpy(state): # When restoring, we currently assume all values are tensors. if not pt.is_tensor(v): raise ValueError( - "We do not currently support non-tensors " "coming from model.state_dict()" + "We do not currently support non-tensors coming from model.state_dict()" ) # get as a numpy array, making sure is on cpu state[k] = v.cpu().numpy() diff --git a/openfl/utilities/fed_timer.py b/openfl/utilities/fed_timer.py index 3d8770ec24..320fff5a02 100644 --- a/openfl/utilities/fed_timer.py +++ b/openfl/utilities/fed_timer.py @@ -116,8 +116,7 @@ async def async_execute(self): await asyncio.wait_for(task, timeout=self._max_timeout) except asyncio.TimeoutError: raise asyncio.TimeoutError( - f"Timeout after {self._max_timeout} second(s), " - f"Exception method: ({self._fn_name})" + f"Timeout after {self._max_timeout} second(s), Exception method: ({self._fn_name})" ) except Exception: raise Exception(f"Generic Exception: {self._fn_name}") @@ -151,8 +150,7 @@ def sync_execute(self): # exception. 
if task.is_alive():
             raise TimeoutError(
-                f"Timeout after {self._max_timeout} second(s), "
-                f"Exception method: ({self._fn_name})"
+                f"Timeout after {self._max_timeout} second(s), Exception method: ({self._fn_name})"
             )
         return task.result()
@@ -276,6 +274,6 @@ def wrapper(self, func, *args, **kwargs):
             logger.info(f"({self.task._fn_name}) Elapsed Time: {time.perf_counter() - start}")
         except Exception as e:
             logger.exception(
-                f"An exception of type {type(e).__name__} occurred. " f"Arguments:\n{e.args[0]!r}"
+                f"An exception of type {type(e).__name__} occurred. Arguments:\n{e.args[0]!r}"
             )
             os._exit(status=os.EX_TEMPFAIL)
diff --git a/openfl/utilities/split.py b/openfl/utilities/split.py
index ee2e4654ac..be66934fe8 100644
--- a/openfl/utilities/split.py
+++ b/openfl/utilities/split.py
@@ -92,7 +92,7 @@ def split_tensor_dict_for_holdouts(
             holdout_tensors[tensor_name] = tensors_to_send.pop(tensor_name)
         except KeyError:
             logger.warning(
-                f"tried to remove tensor: {tensor_name} not present " f"in the tensor dict"
+                f"tried to remove tensor: {tensor_name} not present in the tensor dict"
             )
             continue
diff --git a/openfl/utilities/workspace.py b/openfl/utilities/workspace.py
index e19b03cf16..15e7a3a339 100644
--- a/openfl/utilities/workspace.py
+++ b/openfl/utilities/workspace.py
@@ -116,12 +116,10 @@ def __exit__(self, exc_type, exc_value, traceback):

         if self.remove_archive:
             logger.debug(
-                "Exiting from the workspace context manager"
-                f" for {self.experiment_name} experiment"
+                f"Exiting from the workspace context manager for {self.experiment_name} experiment"
             )
             logger.debug(
-                "Exiting from the workspace context manager"
-                f" for {self.experiment_name} experiment"
+                f"Exiting from the workspace context manager for {self.experiment_name} experiment"
             )
             logger.debug("Archive still exists: %s", self.data_file_path.exists())
             self.data_file_path.unlink(missing_ok=False)
diff --git a/pre_commit.toml b/pre_commit.toml
new file mode 100644
index 0000000000..6a748a6b02
--- /dev/null
+++ b/pre_commit.toml
@@ -0,0 +1,7 @@
+[tool.bandit]
+# Exclude specific directories or files from the scan
+# exclude = ["tests/", "docs/"]
+
+# Set the severity and confidence levels
+severity = "HIGH"
+confidence = "HIGH"
diff --git a/precommit-doc.md b/precommit-doc.md
new file mode 100644
index 0000000000..9782a5d01e
--- /dev/null
+++ b/precommit-doc.md
@@ -0,0 +1,66 @@
+## Pre-commit with Bandit
+
+To ensure code quality and security, we use [pre-commit](https://pre-commit.com/) with [Bandit](https://bandit.readthedocs.io/en/latest/) to automatically scan for security issues before commits.
+
+Follow the steps below to set up and use pre-commit in your local development environment.
+
+### Setup
+
+1. **Clone the repository**:
+
+   ```sh
+   git clone https://github.com/intel-innersource/frameworks.ai.openfl.openfl-security.git
+   cd frameworks.ai.openfl.openfl-security
+   ```
+
+2. **Run the setup script**:
+
+   We have provided a `precommit-setup.sh` script to simplify the installation process. This script will install pre-commit and set up the pre-commit hooks.
+
+   ```sh
+   ./precommit-setup.sh
+   ```
+
+   The `precommit-setup.sh` script performs the following actions:
+   - Checks for prerequisites (python, pip) on the local machine.
+   - Installs pre-commit if it is not already installed.
+   - Installs the pre-commit hooks defined in the .pre-commit-config.yaml file.
+
+3.
**Verify the installation**:
+
+   After running the setup script, you can verify that pre-commit is installed and the hooks are set up correctly by running:
+
+   ```sh
+   pre-commit --version
+   pre-commit install
+   ```
+
+### Usage
+
+Once the pre-commit hooks are installed, Bandit scans will automatically run before each commit. If any issues are found, the commit will be aborted, and you will need to fix the issues before committing again.
+
+1. **Make changes to your code**:
+
+   Edit your files as needed.
+
+2. **Stage your changes**:
+
+   ```sh
+   git add <file>
+   ```
+
+3. **Commit your changes**:
+
+   ```sh
+   git commit -m "Your commit message"
+   ```
+
+   During the commit process, pre-commit will automatically run the Bandit scan. If the scan passes, the commit proceeds; otherwise it is aborted so you can address the reported issues.
+
+### How to bypass pre-commit hooks?
+
+To exclude the Bandit pre-commit hook when making a Git commit, you can use the `--no-verify` option. This bypasses any pre-commit hooks that are set up in your repository.
+
+```sh
+git commit --no-verify -m "Your commit message"
+```
diff --git a/precommit-setup.sh b/precommit-setup.sh
new file mode 100644
index 0000000000..9b55289bc9
--- /dev/null
+++ b/precommit-setup.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Function to add the installation path to PATH
+add_to_path() {
+    if [[ ":$PATH:" != *":$1:"* ]]; then
+        export PATH="$PATH:$1"
+        echo "Added $1 to PATH"
+    else
+        echo "$1 is already in PATH"
+    fi
+}
+
+# Function to check if Python and pip are installed
+check_python_and_pip() {
+    if ! command -v python3 &> /dev/null; then
+        echo "Python3 is not installed. Please install Python3 and try again."
+        exit 1
+    fi
+
+    if ! command -v pip &> /dev/null; then
+        echo "pip is not installed. Please install pip and try again."
+        exit 1
+    fi
+}
+
+# Function to install pre-commit
+install_precommit() {
+    if ! command -v pre-commit &> /dev/null; then
+        echo "pre-commit not found, installing..."
+ pip install --user pre-commit + else + echo "pre-commit is already installed" + fi +} + +# Check if Python and pip are installed +check_python_and_pip + +# Detect the operating system +OS="$(uname -s)" +case "$OS" in + Linux*) + echo "Detected Linux" + INSTALL_PATH="$HOME/.local/bin" + install_precommit + add_to_path "$INSTALL_PATH" + ;; + Darwin*) + echo "Detected MacOS" + INSTALL_PATH="$HOME/.local/bin" + install_precommit + add_to_path "$INSTALL_PATH" + ;; + CYGWIN*|MINGW32*|MSYS*|MINGW*) + echo "Detected Windows" + INSTALL_PATH="$HOME/AppData/Local/Packages/PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0/LocalCache/local-packages/Python312/Scripts" + install_precommit + add_to_path "$INSTALL_PATH" + ;; + *) + echo "Unknown OS" + exit 1 + ;; +esac + +# Add the installation path to the shell profile for persistence +if [[ "$OS" == "Linux" || "$OS" == "Darwin" ]]; then + SHELL_PROFILE="$HOME/.bashrc" + if [[ -f "$HOME/.zshrc" ]]; then + SHELL_PROFILE="$HOME/.zshrc" + fi + echo "export PATH=\$PATH:$INSTALL_PATH" >> "$SHELL_PROFILE" + source "$SHELL_PROFILE" +elif [[ "$OS" == "CYGWIN"* || "$OS" == "MINGW"* || "$OS" == "MSYS"* ]]; then + SHELL_PROFILE="$HOME/.bash_profile" + echo "export PATH=\$PATH:$INSTALL_PATH" >> "$SHELL_PROFILE" + source "$SHELL_PROFILE" +fi + +# Verify the installation +if command -v pre-commit &> /dev/null; then + echo "pre-commit installation successful" + pre-commit --version +else + echo "pre-commit installation failed" + exit 1 +fi diff --git a/scripts/lint.sh b/scripts/lint.sh index ac0b97e0a5..98261135bc 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -4,7 +4,7 @@ set -Eeuo pipefail base_dir=$(dirname $(dirname $0)) # Run the pre-commit checks -pre-commit run --all-files +SKIP=bandit pre-commit run --all-files ruff check --config "${base_dir}/pyproject.toml" openfl/ diff --git a/test-requirements.txt b/test-requirements.txt index c17ddf1364..b07ea268b9 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -2,8 +2,9 @@ docker lxml==5.3.0 paramiko pytest==8.3.4 -pytest-asyncio==0.25.1 +pytest-asyncio==0.25.2 pytest-mock==3.14.0 defusedxml==0.7.1 matplotlib==3.10.0 -fpdf==1.7.2 \ No newline at end of file +fpdf==1.7.2 +papermill==2.6.0 diff --git a/tests/end_to_end/pytest.ini b/tests/end_to_end/pytest.ini index ed865c99c6..e26b1337c6 100644 --- a/tests/end_to_end/pytest.ini +++ b/tests/end_to_end/pytest.ini @@ -8,5 +8,6 @@ markers = log_memory_usage: mark a test as a log memory usage test. task_runner_basic: mark a test as a task runner basic test. task_runner_dockerized_ws: mark a test as a task runner dockerized workspace test. + federated_runtime_301_watermarking: mark a test as a federated runtime 301 watermarking test. 
asyncio_mode=auto
asyncio_default_fixture_loop_scope="function"
diff --git a/tests/end_to_end/test_suites/memory_logs_tests.py b/tests/end_to_end/test_suites/memory_logs_tests.py
index 662099273f..e4b8c3f128 100644
--- a/tests/end_to_end/test_suites/memory_logs_tests.py
+++ b/tests/end_to_end/test_suites/memory_logs_tests.py
@@ -5,7 +5,7 @@
 import logging
 import os

-from tests.end_to_end.utils.common_fixtures import fx_federation_tr, fx_federation_tr_dws
+from tests.end_to_end.utils.tr_common_fixtures import fx_federation_tr, fx_federation_tr_dws
 import tests.end_to_end.utils.constants as constants
 from tests.end_to_end.utils import federation_helper as fed_helper, ssh_helper as ssh
 from tests.end_to_end.utils.generate_report import generate_memory_report, convert_to_json
diff --git a/tests/end_to_end/test_suites/sample_tests.py b/tests/end_to_end/test_suites/sample_tests.py
index 01d5fd9394..cea7add2ec 100644
--- a/tests/end_to_end/test_suites/sample_tests.py
+++ b/tests/end_to_end/test_suites/sample_tests.py
@@ -4,7 +4,7 @@
 import pytest
 import logging

-from tests.end_to_end.utils.common_fixtures import (
+from tests.end_to_end.utils.tr_common_fixtures import (
     fx_federation_tr,
     fx_federation_tr_dws,
 )
diff --git a/tests/end_to_end/test_suites/task_runner_tests.py b/tests/end_to_end/test_suites/task_runner_tests.py
index eb9c344da8..a6df29af3a 100644
--- a/tests/end_to_end/test_suites/task_runner_tests.py
+++ b/tests/end_to_end/test_suites/task_runner_tests.py
@@ -4,7 +4,7 @@
 import pytest
 import logging

-from tests.end_to_end.utils.common_fixtures import (
+from tests.end_to_end.utils.tr_common_fixtures import (
     fx_federation_tr,
     fx_federation_tr_dws,
 )
diff --git a/tests/end_to_end/test_suites/wf_federated_runtime_tests.py b/tests/end_to_end/test_suites/wf_federated_runtime_tests.py
new file mode 100644
index 0000000000..18d9f681b2
--- /dev/null
+++ b/tests/end_to_end/test_suites/wf_federated_runtime_tests.py
@@ -0,0 +1,94 @@
+# Copyright 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import logging
+import os
+import time
+import concurrent.futures
+
+import tests.end_to_end.utils.federation_helper as fh
+
+log = logging.getLogger(__name__)
+
+
+@pytest.mark.federated_runtime_301_watermarking
+def test_federated_runtime_301_watermarking(request):
+    """
+    Test federated runtime without TLS.
+    Args:
+        request (Fixture): Pytest fixture
+    """
+    envoys = ["Bangalore", "Chandler"]
+    workspace_path = os.path.join(
+        os.getcwd(),
+        "openfl-tutorials/experimental/workflow/FederatedRuntime/301_MNIST_Watermarking",
+    )
+    # Activate the experimental feature
+    cmd = "fx experimental activate"
+    error_msg = "Failed to activate the experimental feature"
+    return_code, output, error = fh.run_command(
+        cmd,
+        workspace_path=workspace_path,
+        error_msg=error_msg,
+        return_error=True,
+    )
+
+    if error:
+        # Check if the experimental feature is already activated
+        if any("No such command 'activate'" in err for err in error):
+            log.info("Experimental feature already activated.
Ignore the error.") + else: + log.error(f"{error_msg}: {error}") + raise Exception(error) + + log.info(f"Activated the experimental feature.") + + # Create result log files for the director and envoys + result_path, participant_res_files = fh.create_federated_runtime_participant_res_files( + request.config.results_dir, envoys + ) + + # Start the director + fh.start_director(workspace_path, participant_res_files["director"]) + + # Start envoys Bangalore and Chandler and connect them to the director + executor = concurrent.futures.ThreadPoolExecutor() + results = [ + executor.submit( + fh.start_envoy, + envoy_name=envoy, + workspace_path=workspace_path, + res_file=participant_res_files[envoy.lower()], + ) + for envoy in envoys + ] + assert all([f.result() for f in results]), "Failed to start one or more envoys" + + # Based on the pattern, the envoys take time to connect to the director + # Hence, adding a sleep of 10 seconds anyways. + time.sleep(10) + nb_workspace_path = os.path.join(workspace_path, "workspace") + notebook_path = nb_workspace_path + "/" + "MNIST_Watermarking.ipynb" + + assert fh.check_envoys_director_conn_federated_runtime( + notebook_path=notebook_path, expected_envoys=envoys + ), "Envoys are not connected to the director" + + # IMP - Notebook 301 Watermarking has hard coded notebook path set, hence changing the directory + # This might not be true for all notebooks, thus keeping it as a separate step + os.chdir(nb_workspace_path) + + assert fh.run_notebook( + notebook_path=notebook_path, + output_notebook_path=result_path + "/" + "MNIST_Watermarking_output.ipynb" + ), "Notebook run failed" + + # Change the directory back to the original directory + os.chdir(os.getcwd()) + + assert fh.verify_federated_runtime_experiment_completion( + participant_res_files + ), "Experiment failed" + + log.info("Experiment completed successfully") diff --git a/tests/end_to_end/test_suites/wf_local_func_tests.py b/tests/end_to_end/test_suites/wf_local_func_tests.py index 223ecbfdaa..5f00b7631f 100644 --- a/tests/end_to_end/test_suites/wf_local_func_tests.py +++ b/tests/end_to_end/test_suites/wf_local_func_tests.py @@ -8,7 +8,7 @@ import random from metaflow import Step -from tests.end_to_end.utils.common_fixtures import fx_local_federated_workflow, fx_local_federated_workflow_prvt_attr +from tests.end_to_end.utils.wf_common_fixtures import fx_local_federated_workflow, fx_local_federated_workflow_prvt_attr from tests.end_to_end.workflow.exclude_flow import TestFlowExclude from tests.end_to_end.workflow.include_exclude_flow import TestFlowIncludeExclude from tests.end_to_end.workflow.include_flow import TestFlowInclude diff --git a/tests/end_to_end/utils/exceptions.py b/tests/end_to_end/utils/exceptions.py index 4cccce0e5f..31fa596ac0 100644 --- a/tests/end_to_end/utils/exceptions.py +++ b/tests/end_to_end/utils/exceptions.py @@ -71,3 +71,18 @@ class WorkspaceLoadException(Exception): class ReferenceFlowException(Exception): """Exception for reference flow""" pass + + +class NotebookRunException(Exception): + """Exception for notebook run""" + pass + + +class EnvoyStartException(Exception): + """Exception for envoy start""" + pass + + +class DirectorStartException(Exception): + """Exception for director start""" + pass diff --git a/tests/end_to_end/utils/federation_helper.py b/tests/end_to_end/utils/federation_helper.py index 698e580179..50910c4f2e 100644 --- a/tests/end_to_end/utils/federation_helper.py +++ b/tests/end_to_end/utils/federation_helper.py @@ -7,6 +7,7 @@ import os import json 
import re
+import papermill as pm
 from pathlib import Path

 import tests.end_to_end.utils.constants as constants
@@ -16,6 +17,7 @@ from tests.end_to_end.models import collaborator as col_model

 log = logging.getLogger(__name__)
+home_dir = Path.home()


 def setup_pki_for_collaborators(collaborators, model_owner, local_bind_path):
@@ -542,6 +544,7 @@ def run_command(
     bg_file=None,
     print_output=False,
     with_docker=False,
+    return_error=False,
 ):
     """
     Run the command
@@ -553,6 +556,7 @@
         bg_file (str): Background file (with path)
         print_output (bool): Print the output
         with_docker (bool): Flag specific to dockerized workspace scenario. Default is False.
+        return_error (bool): If True, return the error to the caller instead of raising an exception
     Returns:
         tuple: Return code, output and error
     """
@@ -591,7 +595,7 @@
         )
     else:
         return_code, output, error = ssh.run_command(command)
-        if return_code != 0:
+        if return_code != 0 and not return_error:
             log.error(f"{error_msg}: {error}")
             raise Exception(f"{error_msg}: {error}")
@@ -752,3 +756,185 @@ def start_docker_containers_for_dws(
             raise ex.DockerException(
                 f"Failed to start {participant.name} docker environment: {e}"
             )
+
+
+def start_director(workspace_path, dir_res_file):
+    """
+    Start the director.
+    Args:
+        workspace_path (str): Workspace path
+        dir_res_file (str): Director result file
+    Returns:
+        bool: True if successful, else False
+    """
+    try:
+        error_msg = "Failed to start the director"
+        return_code, output, error = run_command(
+            "./start_director.sh",
+            error_msg=error_msg,
+            workspace_path=os.path.join(workspace_path, "director"),
+            run_in_background=True,
+            bg_file=dir_res_file,
+        )
+        log.debug(f"Director start: Return code: {return_code}, Output: {output}, Error: {error}")
+        log.info(
+            "Waiting for 30s for the director to start. With no retry mechanism in place, "
+            "envoys will fail immediately if the director is not ready."
+        )
+        time.sleep(30)
+    except ex.DirectorStartException as e:
+        raise e
+    return True
+
+
+def start_envoy(envoy_name, workspace_path, res_file):
+    """
+    Start given envoy.
+    Args:
+        envoy_name (str): Name of the envoy. For e.g. Bangalore, Chandler (case sensitive)
+        workspace_path (str): Workspace path
+        res_file (str): Result file to track the logs.
+    Returns:
+        bool: True if successful, else False
+    """
+    try:
+        error_msg = f"Failed to start {envoy_name} envoy"
+        return_code, output, error = run_command(
+            f"./start_envoy.sh {envoy_name} {envoy_name}_config.yaml",
+            error_msg=error_msg,
+            workspace_path=os.path.join(workspace_path, envoy_name),
+            run_in_background=True,
+            bg_file=res_file,
+        )
+        log.debug(f"{envoy_name} start: Return code: {return_code}, Output: {output}, Error: {error}")
+    except ex.EnvoyStartException as e:
+        raise e
+    return True
+
+
+def create_federated_runtime_participant_res_files(results_dir, envoys, model_name="301_mnist_watermarking"):
+    """
+    Create result log files for the director and envoys.
+    Args:
+        results_dir (str): Results directory
+        envoys (list): List of envoys
+        model_name (str): Model name
+    Returns:
+        tuple: Result path and participant result files (including director)
+    """
+    participant_res_files = {}
+    result_path = os.path.join(
+        home_dir, results_dir, model_name
+    )
+    os.makedirs(result_path, exist_ok=True)
+
+    for participant in envoys + ["director"]:
+        res_file = os.path.join(result_path, f"{participant.lower()}.log")
+        participant_res_files[participant.lower()] = res_file
+        # Create the file
+        open(res_file, 'w').close()
+
+    return result_path, participant_res_files
+
+
+def check_envoys_director_conn_federated_runtime(
+    notebook_path, expected_envoys, director_node_fqdn="localhost", director_port=50050
+):
+    """
+    Function to check if the envoys are connected to the director for Federated Runtime notebooks.
+    Args:
+        notebook_path (str): Path to the notebook
+        expected_envoys (list): List of expected envoys
+        director_node_fqdn (str): Director node FQDN
+        director_port (int): Director port
+    Returns:
+        bool: True if all the envoys are connected to the director, else False
+    """
+    from openfl.experimental.workflow.runtime import FederatedRuntime
+
+    # Number of retries and delay between retries in seconds
+    MAX_RETRIES = RETRY_DELAY = 5
+
+    federated_runtime = FederatedRuntime(
+        collaborators=expected_envoys,
+        director={
+            "director_node_fqdn": director_node_fqdn,
+            "director_port": director_port,
+        },
+        notebook_path=notebook_path,
+    )
+    # Retry logic
+    for attempt in range(MAX_RETRIES):
+        actual_envoys = federated_runtime.get_envoys()
+        if sorted(expected_envoys) == sorted(actual_envoys):
+            log.info("All the envoys are connected to the director")
+            return True
+        else:
+            log.warning(
+                f"Attempt {attempt + 1}/{MAX_RETRIES}: Not all envoys are connected. Retrying in {RETRY_DELAY} seconds..."
+            )
+            time.sleep(RETRY_DELAY)
+
+    return False
+
+
+def run_notebook(notebook_path, output_notebook_path):
+    """
+    Function to run the notebook.
+    Args:
+        notebook_path (str): Path to the notebook
+        output_notebook_path (str): Path to write the executed output notebook
+    Returns:
+        bool: True if successful, else False
+    """
+    try:
+        log.info(f"Running the notebook: {notebook_path} with output notebook path: {output_notebook_path}")
+        output = pm.execute_notebook(
+            input_path=notebook_path,
+            output_path=output_notebook_path,
+            request_save_on_cell_execute=True,
+            autosave_cell_every=5,  # autosave every 5 seconds
+            log_output=True,
+        )
+    except pm.exceptions.PapermillExecutionError as e:
+        log.error(f"PapermillExecutionError: {e}")
+        raise e
+
+    except ex.NotebookRunException as e:
+        log.error(f"Failed to run the notebook: {e}")
+        raise e
+    return True
+
+
+def verify_federated_runtime_experiment_completion(participant_res_files):
+    """
+    Verify the completion of the experiment using the participant logs.
+    Args:
+        participant_res_files (dict): Dictionary containing participant names and their result log files
+    Returns:
+        bool: True if successful, else False
+    """
+    # Check participant logs for successful completion
+    for name, result_file in participant_res_files.items():
+        # Read the participant log and check its tail for the completion message.
+        # The federation takes time to start and to write the logs, so the tail
+        # only contains the message once the run has actually finished.
+        with open(result_file, "r") as file:
+            lines = [line.strip() for line in file.readlines()]
+            last_7_lines = list(filter(str.rstrip, lines))[-7:]
+            if (
+                name == "director"
+                and any("Experiment FederatedFlow_MNIST_Watermarking was finished successfully" in content for content in last_7_lines)
+            ):
+                log.debug(f"Process completed for {name}")
+                continue
+            elif name != "director" and any("End of Federation reached." in content for content in last_7_lines):
+                log.debug(f"Process completed for {name}")
+                continue
+            else:
+                log.error(f"Process failed for {name}")
+                return False
+    return True
diff --git a/tests/end_to_end/utils/generate_report.py b/tests/end_to_end/utils/generate_report.py
index 1e014fa3d1..bba5a79052 100644
--- a/tests/end_to_end/utils/generate_report.py
+++ b/tests/end_to_end/utils/generate_report.py
@@ -24,10 +24,14 @@ def chapter_body(self, body):

 def generate_memory_report(memory_usage_dict, workspace_path):
     """
-    Generates a memory usage report from a CSV file.
+    Generates a memory usage report using input dictionary
+    and saves it to a PDF file.
+    Content of memory_usage_dict comes from reading the aggregator
+    and collaborator memory usage json files inside respective logs folder.

     Parameters:
-    file_path (str): The path to the CSV file containing memory usage data.
+    memory_usage_dict (dict): A dictionary containing memory usage data.
+    workspace_path (str): The path to the workspace where the report will be saved.
Returns: None @@ -37,22 +41,22 @@ def generate_memory_report(memory_usage_dict, workspace_path): # Plotting the chart plt.figure(figsize=(10, 5)) - plt.plot(data["round_number"], data["virtual_memory/used"], marker="o") + plt.plot(data["round_number"], data["process_memory"], marker="o") plt.title("Memory Usage per Round") plt.xlabel("round_number") - plt.ylabel("Virtual Memory Used (MB)") + plt.ylabel("Process Memory Used (MB)") plt.grid(True) output_path = f"{workspace_path}/mem_usage_plot.png" plt.savefig(output_path) plt.close() # Calculate statistics - min_mem = round(data["virtual_memory/used"].min(), 2) - max_mem = round(data["virtual_memory/used"].max(), 2) - mean_mem = round(data["virtual_memory/used"].mean(), 2) - variance_mem = round(data["virtual_memory/used"].var(), 2) - std_dev_mem = round(data["virtual_memory/used"].std(), 2) - slope, _, _, _, _ = linregress(data.index, data["virtual_memory/used"]) + min_mem = round(data["process_memory"].min(), 2) + max_mem = round(data["process_memory"].max(), 2) + mean_mem = round(data["process_memory"].mean(), 2) + variance_mem = round(data["process_memory"].var(), 2) + std_dev_mem = round(data["process_memory"].std(), 2) + slope, _, _, _, _ = linregress(data.index, data["process_memory"]) slope = round(slope, 2) stats_path = f"{workspace_path}/mem_stats.txt" with open(stats_path, "w") as file: @@ -87,7 +91,7 @@ def add_introduction(pdf): def add_chart_analysis(pdf, output_path, data): pdf.chapter_title("Chart Analysis") pdf.image(output_path, w=180) - diffs = data["virtual_memory/used"].diff().round(2) + diffs = data["process_memory"].diff().round(2) significant_changes = diffs[diffs.abs() > 500] for index, value in significant_changes.items(): pdf.chapter_body( diff --git a/tests/end_to_end/utils/summary_helper.py b/tests/end_to_end/utils/summary_helper.py index cfdbc17a9e..ce2e4bac0d 100644 --- a/tests/end_to_end/utils/summary_helper.py +++ b/tests/end_to_end/utils/summary_helper.py @@ -1,9 +1,12 @@ -# Copyright 2020-2023 Intel Corporation +# Copyright 2020-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + +import argparse from defusedxml.ElementTree import parse as defused_parse from lxml import etree import os +import re from pathlib import Path import tests.end_to_end.utils.constants as constants @@ -100,7 +103,7 @@ def get_testcase_result(): return database_list -def main(): +def print_task_runner_score(): """ Main function to get the test case results and aggregator logs And write the results to GitHub step summary @@ -166,5 +169,68 @@ def main(): ) +def print_federated_runtime_score(): + summary_file = os.getenv("GITHUB_STEP_SUMMARY") + + search_string = "Aggregated model validation score" + + last_occurrence = aggregated_model_score = None + + # Assumption - result directory is present in the home directory + dir_res_file = os.path.join( + result_path, + "301_mnist_watermarking", + "director.log", + ) + + # Open and read the log file + with open(dir_res_file, "r") as file: + for line in file: + if search_string in line: + last_occurrence = line + + # Extract the value from the last occurrence + if last_occurrence: + match = re.search( + r"Aggregated model validation score = (\d+\.\d+)", last_occurrence + ) + if match: + aggregated_model_score = match.group(1) + print(f"Last Aggregated model validation score: {aggregated_model_score}") + else: + print("No valid score found in the last occurrence.") + else: + print(f"No occurrences of '{search_string}' found in the log file.") + + # Write the results to GitHub step 
summary file
+    # This file is created at runtime by the GitHub action, thus we cannot verify its existence beforehand
+    with open(summary_file, "a") as fh:
+        # DO NOT change the print statements
+        print("| Aggregated model validation score |", file=fh)
+        print("| ------------- |", file=fh)
+        print(f"| {aggregated_model_score} |", file=fh)
+
+
+def fetch_args():
+    """
+    Function to fetch the command-line arguments.
+    Returns:
+        Parsed arguments
+    """
+    # Initialize the parser and add arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--func_name", required=True, default="", type=str, help="Name of function to be called"
+    )
+    args = parser.parse_args()
+    return args
+
+
 if __name__ == "__main__":
-    main()
+    # Fetch input arguments
+    args = fetch_args()
+    func_name = args.func_name
+    if func_name in ["print_task_runner_score", "print_local_runtime_score"]:
+        print_task_runner_score()
+    elif func_name == "print_federated_runtime_score":
+        print_federated_runtime_score()
+    else:
+        raise ValueError(f"Unknown function name: {func_name}")
diff --git a/tests/end_to_end/utils/common_fixtures.py b/tests/end_to_end/utils/tr_common_fixtures.py
similarity index 58%
rename from tests/end_to_end/utils/common_fixtures.py
rename to tests/end_to_end/utils/tr_common_fixtures.py
index 951670bc69..593e57e473 100644
--- a/tests/end_to_end/utils/common_fixtures.py
+++ b/tests/end_to_end/utils/tr_common_fixtures.py
@@ -5,7 +5,6 @@
 import collections
 import concurrent.futures
 import logging
-import numpy as np

 import tests.end_to_end.utils.constants as constants
 import tests.end_to_end.utils.federation_helper as fh
@@ -21,10 +20,6 @@
     "model_owner, aggregator, collaborators, workspace_path, local_bind_path",
 )

-workflow_local_fixture = collections.namedtuple(
-    "workflow_local_fixture",
-    "aggregator, collaborators, runtime",
-)

 @pytest.fixture(scope="function")
 def fx_federation_tr(request):
@@ -235,136 +230,3 @@ def fx_federation_tr_dws(request):
         workspace_path=workspace_path,
         local_bind_path=local_bind_path,
     )
-
-
-@pytest.fixture(scope="function")
-def fx_local_federated_workflow(request):
-    """
-    Fixture to set up a local federated workflow for testing.
-    This fixture initializes an `Aggregator` and sets up a list of collaborators
-    based on the number specified in the test configuration. It also configures
-    a `LocalRuntime` with the aggregator, collaborators, and an optional backend
-    if specified in the test configuration.
-    Args:
-        request (FixtureRequest): The pytest request object that provides access
-        to the test configuration.
-    Yields:
-        LocalRuntime: An instance of `LocalRuntime` configured with the aggregator,
-        collaborators, and backend.
- """ - # Import is done inline because Task Runner does not support importing below openfl packages - from openfl.experimental.workflow.interface import Aggregator, Collaborator - from openfl.experimental.workflow.runtime import LocalRuntime - from tests.end_to_end.utils.wf_helper import ( - init_collaborator_private_attr_index, - init_collaborator_private_attr_name, - init_collaborate_pvt_attr_np, - init_agg_pvt_attr_np - ) - collab_callback_func = request.param[0] if hasattr(request, 'param') and request.param else None - collab_value = request.param[1] if hasattr(request, 'param') and request.param else None - agg_callback_func = request.param[2] if hasattr(request, 'param') and request.param else None - - # Get the callback functions from the locals using string - collab_callback_func_name = locals()[collab_callback_func] if collab_callback_func else None - agg_callback_func_name = locals()[agg_callback_func] if agg_callback_func else None - collaborators_list = [] - - if agg_callback_func_name: - aggregator = Aggregator( name="agg", - private_attributes_callable=agg_callback_func_name) - else: - aggregator = Aggregator() - - # Setup collaborators - for i in range(request.config.num_collaborators): - func_var = i if collab_value == "int" else f"collaborator{i}" if collab_value == "str" else None - collaborators_list.append( - Collaborator( - name=f"collaborator{i}", - private_attributes_callable=collab_callback_func_name, - param = func_var - ) - ) - - backend = request.config.backend if hasattr(request.config, 'backend') else None - if backend: - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list, backend=backend) - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list) - - # Return the federation fixture - return workflow_local_fixture( - aggregator=aggregator, - collaborators=collaborators_list, - runtime=local_runtime, - ) - - -@pytest.fixture(scope="function") -def fx_local_federated_workflow_prvt_attr(request): - """ - Fixture to set up a local federated workflow for testing. - This fixture initializes an `Aggregator` and sets up a list of collaborators - based on the number specified in the test configuration. It also configures - a `LocalRuntime` with the aggregator, collaborators, and an optional backend - if specified in the test configuration. - Args: - request (FixtureRequest): The pytest request object that provides access - to the test configuration. - Yields: - LocalRuntime: An instance of `LocalRuntime` configured with the aggregator, - collaborators, and backend. 
- """ - # Import is done inline because Task Runner does not support importing below openfl packages - from openfl.experimental.workflow.interface import Aggregator, Collaborator - from openfl.experimental.workflow.runtime import LocalRuntime - from tests.end_to_end.utils.wf_helper import ( - init_collaborator_private_attr_index, - init_collaborator_private_attr_name, - init_collaborate_pvt_attr_np, - init_agg_pvt_attr_np - ) - collab_callback_func = request.param[0] if hasattr(request, 'param') and request.param else None - collab_value = request.param[1] if hasattr(request, 'param') and request.param else None - agg_callback_func = request.param[2] if hasattr(request, 'param') and request.param else None - - # Get the callback functions from the locals using string - collab_callback_func_name = locals()[collab_callback_func] if collab_callback_func else None - agg_callback_func_name = locals()[agg_callback_func] if agg_callback_func else None - collaborators_list = [] - - # Setup aggregator - if agg_callback_func_name: - aggregator = Aggregator(name="agg", - private_attributes_callable=agg_callback_func_name) - else: - aggregator = Aggregator() - - aggregator.private_attributes = { - "test_loader_pvt": np.random.rand(10, 28, 28) # Random data - } - # Setup collaborators - for i in range(request.config.num_collaborators): - func_var = i if collab_value == "int" else f"collaborator{i}" if collab_value == "str" else None - collab = Collaborator( - name=f"collaborator{i}", - private_attributes_callable=collab_callback_func_name, - param = func_var - ) - collab.private_attributes = { - "train_loader_pvt": np.random.rand(i * 50, 28, 28), - "test_loader_pvt": np.random.rand(i * 10, 28, 28), - } - collaborators_list.append(collab) - - backend = request.config.backend if hasattr(request.config, 'backend') else None - if backend: - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list, backend=backend) - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list) - - # Return the federation fixture - return workflow_local_fixture( - aggregator=aggregator, - collaborators=collaborators_list, - runtime=local_runtime, - ) diff --git a/tests/end_to_end/utils/wf_common_fixtures.py b/tests/end_to_end/utils/wf_common_fixtures.py new file mode 100644 index 0000000000..2243ec5ccd --- /dev/null +++ b/tests/end_to_end/utils/wf_common_fixtures.py @@ -0,0 +1,144 @@ +# Copyright 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import collections +import numpy as np + +from openfl.experimental.workflow.interface import Aggregator, Collaborator +from openfl.experimental.workflow.runtime import LocalRuntime + +# Define a named tuple to store the objects for model owner, aggregator, and collaborators +workflow_local_fixture = collections.namedtuple( + "workflow_local_fixture", + "aggregator, collaborators, runtime", +) + + +@pytest.fixture(scope="function") +def fx_local_federated_workflow(request): + """ + Fixture to set up a local federated workflow for testing. + This fixture initializes an `Aggregator` and sets up a list of collaborators + based on the number specified in the test configuration. It also configures + a `LocalRuntime` with the aggregator, collaborators, and an optional backend + if specified in the test configuration. + Args: + request (FixtureRequest): The pytest request object that provides access + to the test configuration. 
+    Returns:
+        workflow_local_fixture: Named tuple with the aggregator, collaborators,
+        and the configured `LocalRuntime`.
+    """
+    # Inline import
+    from tests.end_to_end.utils.wf_helper import (
+        init_collaborator_private_attr_index,
+        init_collaborator_private_attr_name,
+        init_collaborate_pvt_attr_np,
+        init_agg_pvt_attr_np
+    )
+    collab_callback_func = request.param[0] if hasattr(request, 'param') and request.param else None
+    collab_value = request.param[1] if hasattr(request, 'param') and request.param else None
+    agg_callback_func = request.param[2] if hasattr(request, 'param') and request.param else None
+
+    # Get the callback functions from the locals using string
+    collab_callback_func_name = locals()[collab_callback_func] if collab_callback_func else None
+    agg_callback_func_name = locals()[agg_callback_func] if agg_callback_func else None
+    collaborators_list = []
+
+    if agg_callback_func_name:
+        aggregator = Aggregator(name="agg",
+                                private_attributes_callable=agg_callback_func_name)
+    else:
+        aggregator = Aggregator()
+
+    # Setup collaborators
+    for i in range(request.config.num_collaborators):
+        func_var = i if collab_value == "int" else f"collaborator{i}" if collab_value == "str" else None
+        collaborators_list.append(
+            Collaborator(
+                name=f"collaborator{i}",
+                private_attributes_callable=collab_callback_func_name,
+                param=func_var
+            )
+        )
+
+    backend = request.config.backend if hasattr(request.config, 'backend') else None
+    if backend:
+        local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list, backend=backend)
+    else:
+        local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list)
+
+    # Return the federation fixture
+    return workflow_local_fixture(
+        aggregator=aggregator,
+        collaborators=collaborators_list,
+        runtime=local_runtime,
+    )
+
+
+@pytest.fixture(scope="function")
+def fx_local_federated_workflow_prvt_attr(request):
+    """
+    Fixture to set up a local federated workflow for testing.
+    This fixture initializes an `Aggregator` and sets up a list of collaborators
+    based on the number specified in the test configuration. It also configures
+    a `LocalRuntime` with the aggregator, collaborators, and an optional backend
+    if specified in the test configuration.
+    Args:
+        request (FixtureRequest): The pytest request object that provides access
+        to the test configuration.
+    Returns:
+        workflow_local_fixture: Named tuple with the aggregator, collaborators,
+        and the configured `LocalRuntime`.
+ """ + # Inline import + from tests.end_to_end.utils.wf_helper import ( + init_collaborator_private_attr_index, + init_collaborator_private_attr_name, + init_collaborate_pvt_attr_np, + init_agg_pvt_attr_np + ) + collab_callback_func = request.param[0] if hasattr(request, 'param') and request.param else None + collab_value = request.param[1] if hasattr(request, 'param') and request.param else None + agg_callback_func = request.param[2] if hasattr(request, 'param') and request.param else None + + # Get the callback functions from the locals using string + collab_callback_func_name = locals()[collab_callback_func] if collab_callback_func else None + agg_callback_func_name = locals()[agg_callback_func] if agg_callback_func else None + collaborators_list = [] + + # Setup aggregator + if agg_callback_func_name: + aggregator = Aggregator(name="agg", + private_attributes_callable=agg_callback_func_name) + else: + aggregator = Aggregator() + + aggregator.private_attributes = { + "test_loader_pvt": np.random.rand(10, 28, 28) # Random data + } + # Setup collaborators + for i in range(request.config.num_collaborators): + func_var = i if collab_value == "int" else f"collaborator{i}" if collab_value == "str" else None + collab = Collaborator( + name=f"collaborator{i}", + private_attributes_callable=collab_callback_func_name, + param = func_var + ) + collab.private_attributes = { + "train_loader_pvt": np.random.rand(i * 50, 28, 28), + "test_loader_pvt": np.random.rand(i * 10, 28, 28), + } + collaborators_list.append(collab) + + backend = request.config.backend if hasattr(request.config, 'backend') else None + if backend: + local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list, backend=backend) + local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators_list) + + # Return the federation fixture + return workflow_local_fixture( + aggregator=aggregator, + collaborators=collaborators_list, + runtime=local_runtime, + ) diff --git a/tests/end_to_end/workflow/exclude_flow.py b/tests/end_to_end/workflow/exclude_flow.py index f364dcbbaa..3f68033363 100644 --- a/tests/end_to_end/workflow/exclude_flow.py +++ b/tests/end_to_end/workflow/exclude_flow.py @@ -12,6 +12,7 @@ class TestFlowExclude(FLSpec): """ Testflow to validate exclude functionality in Federated Flow """ + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): diff --git a/tests/end_to_end/workflow/include_exclude_flow.py b/tests/end_to_end/workflow/include_exclude_flow.py index b30e00d8d1..4a78787ac4 100644 --- a/tests/end_to_end/workflow/include_exclude_flow.py +++ b/tests/end_to_end/workflow/include_exclude_flow.py @@ -11,6 +11,7 @@ class TestFlowIncludeExclude(FLSpec): """ Testflow to validate include and exclude functionality in Federated Flow. 
""" + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): diff --git a/tests/end_to_end/workflow/include_flow.py b/tests/end_to_end/workflow/include_flow.py index 7009e50a46..4cb83cf25a 100644 --- a/tests/end_to_end/workflow/include_flow.py +++ b/tests/end_to_end/workflow/include_flow.py @@ -12,6 +12,7 @@ class TestFlowInclude(FLSpec): """ Testflow to validate include functionality in Federated Flow """ + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): diff --git a/tests/end_to_end/workflow/internal_loop.py b/tests/end_to_end/workflow/internal_loop.py index 8c506018eb..709121876f 100644 --- a/tests/end_to_end/workflow/internal_loop.py +++ b/tests/end_to_end/workflow/internal_loop.py @@ -13,6 +13,8 @@ log = logging.getLogger(__name__) class TestFlowInternalLoop(FLSpec): + __test__ = False # to prevent pytest from trying to discover tests in the class + def __init__(self, model=None, optimizer=None, rounds=None, **kwargs): super().__init__(**kwargs) self.training_rounds = rounds diff --git a/tests/end_to_end/workflow/private_attr_both.py b/tests/end_to_end/workflow/private_attr_both.py index 44f171f723..a44f6332b5 100644 --- a/tests/end_to_end/workflow/private_attr_both.py +++ b/tests/end_to_end/workflow/private_attr_both.py @@ -15,6 +15,7 @@ class TestFlowPrivateAttributesBoth(FLSpec): Testflow to validate Aggregator private attributes are not accessible to collaborators and vice versa """ + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): diff --git a/tests/end_to_end/workflow/private_attr_wo_callable.py b/tests/end_to_end/workflow/private_attr_wo_callable.py index b32758178f..171594d914 100644 --- a/tests/end_to_end/workflow/private_attr_wo_callable.py +++ b/tests/end_to_end/workflow/private_attr_wo_callable.py @@ -14,6 +14,7 @@ class TestFlowPrivateAttributesWoCallable(FLSpec): Testflow to validate Aggregator private attributes are not accessible to collaborators and vice versa """ + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): diff --git a/tests/end_to_end/workflow/private_attributes_flow.py b/tests/end_to_end/workflow/private_attributes_flow.py index 92c9f90d2a..713aee6f04 100644 --- a/tests/end_to_end/workflow/private_attributes_flow.py +++ b/tests/end_to_end/workflow/private_attributes_flow.py @@ -16,6 +16,7 @@ class TestFlowPrivateAttributes(FLSpec): Testflow to validate Aggregator private attributes are not accessible to collaborators and vice versa """ + __test__ = False # to prevent pytest from trying to discover tests in the class @aggregator def start(self): @@ -98,6 +99,7 @@ def end(self): log.info("Testing FederatedFlow - Ending Test for accessibility of private attributes") log.info("...Test case passed...") + def validate_collab_private_attr(self, private_attr, step_name): """ Validates the private attributes of the aggregator and collaborators. diff --git a/tests/end_to_end/workflow/reference_exclude.py b/tests/end_to_end/workflow/reference_exclude.py index 6f8ee766d0..d650e1d6c3 100644 --- a/tests/end_to_end/workflow/reference_exclude.py +++ b/tests/end_to_end/workflow/reference_exclude.py @@ -33,7 +33,7 @@ class TestFlowReferenceWithExclude(FLSpec): """ Testflow to validate references of collaborator attributes in Federated Flow with exclude. 
""" - + __test__ = False # to prevent pytest from trying to discover tests in the class step_one_collab_attrs = [] step_two_collab_attrs = [] diff --git a/tests/end_to_end/workflow/reference_flow.py b/tests/end_to_end/workflow/reference_flow.py index 24f2cc9850..8e15c24a5c 100644 --- a/tests/end_to_end/workflow/reference_flow.py +++ b/tests/end_to_end/workflow/reference_flow.py @@ -33,6 +33,7 @@ class TestFlowReference(FLSpec): """ Testflow to validate references of collaborator attributes in Federated Flow. """ + __test__ = False # to prevent pytest from trying to discover tests in the class step_one_collab_attrs = [] step_two_collab_attrs = [] all_ref_error_dict = {} diff --git a/tests/end_to_end/workflow/reference_include_flow.py b/tests/end_to_end/workflow/reference_include_flow.py index 65acccc866..187beb3148 100644 --- a/tests/end_to_end/workflow/reference_include_flow.py +++ b/tests/end_to_end/workflow/reference_include_flow.py @@ -29,6 +29,7 @@ def forward(self, x): class TestFlowReferenceWithInclude(FLSpec): + __test__ = False # to prevent pytest from trying to discover tests in the class step_one_collab_attrs = [] step_two_collab_attrs = [] diff --git a/tests/end_to_end/workflow/subset_flow.py b/tests/end_to_end/workflow/subset_flow.py index c4fa8cae39..b84498b582 100644 --- a/tests/end_to_end/workflow/subset_flow.py +++ b/tests/end_to_end/workflow/subset_flow.py @@ -14,6 +14,7 @@ class TestFlowSubsetCollaborators(FLSpec): """ Testflow to validate working of Subset Collaborators in Federated Flow. """ + __test__ = False # to prevent pytest from trying to discover tests in the class def __init__(self, random_ints=[], **kwargs) -> None: """ diff --git a/tests/github/pki_wrong_cn.py b/tests/github/pki_wrong_cn.py deleted file mode 100644 index 33eaeb0a9d..0000000000 --- a/tests/github/pki_wrong_cn.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (C) 2020-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -import grpc -import subprocess -import os -import time -from multiprocessing import Process -import sys -import importlib - -import openfl -import openfl.native as fx -from openfl.utilities.utils import getfqdn_env - - -def prepare_workspace(): - subprocess.check_call(['fx', 'workspace', 'certify']) - subprocess.check_call(['fx', 'plan', 'initialize']) - - subprocess.check_call([ - 'fx', 'aggregator', 'generate-cert-request' - ]) - subprocess.check_call([ - 'fx', 'aggregator', 'certify', - '-s' - ]) - for col in ['one', 'two']: - subprocess.check_call([ - 'fx', 'collaborator', 'create', - '-n', col, - '-d', '1', - '-s' - ]) - subprocess.check_call([ - 'fx', 'collaborator', 'generate-cert-request', - '-n', col, - '-s', '-x' - ]) - subprocess.check_call([ - 'fx', 'collaborator', 'certify', - '-n', col, - '-s' - ]) - - sys.path.append(os.getcwd()) - - -def start_invalid_collaborator(): - ''' - We choose the gRPC client of another collaborator - to check if aggregator accepts certificate - that does not correspond to the collaborator's name. 
-    '''
-    importlib.reload(openfl.federated.task)  # fetch TF-based task runner
-    importlib.reload(openfl.federated.data)  # fetch TF-based data loader
-    importlib.reload(openfl.federated)  # allow imports from parent module
-    col_name = 'one'
-    plan = fx.setup_plan()
-    plan.resolve()
-    client = plan.get_client('two', plan.aggregator_uuid, plan.federation_uuid)
-    collaborator = plan.get_collaborator(col_name, client=client)
-    collaborator.run()
-
-
-def start_aggregator():
-    agg = Process(target=subprocess.check_call, args=[['fx', 'aggregator', 'start']])
-    agg.start()
-    time.sleep(3)  # wait for initialization
-    return agg
-
-
-if __name__ == '__main__':
-    origin_dir = os.getcwd()
-    prefix = 'fed_workspace'
-    subprocess.check_call([
-        'fx', 'workspace', 'create',
-        '--prefix', prefix,
-        '--template', 'torch_cnn_mnist'
-    ])
-    os.chdir(prefix)
-    fqdn = getfqdn_env()
-    prepare_workspace()
-    agg = start_aggregator()
-    try:
-        start_invalid_collaborator()
-        agg.join()
-    except grpc.RpcError as e:
-        if e.code() == grpc.StatusCode.UNAUTHENTICATED:
-            pass
-        else:
-            raise
-    else:
-        print('Aggregator accepted invalid collaborator certificate.')
-        sys.exit(1)
-    finally:
-        agg.kill()
diff --git a/tests/github/python_native_tf.py b/tests/github/python_native_tf.py
deleted file mode 100644
index c5e44f75af..0000000000
--- a/tests/github/python_native_tf.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (C) 2020-2023 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-"""Python native tests."""
-
-import numpy as np
-
-import openfl.native as fx
-
-
-def one_hot(labels, classes):
-    """
-    One Hot encode a vector.
-
-    Args:
-        labels (list): List of labels to onehot encode
-        classes (int): Total number of categorical classes
-
-    Returns:
-        np.array: Matrix of one-hot encoded labels
-    """
-    return np.eye(classes)[labels]
-
-
-def build_model(input_shape,
-                num_classes,
-                conv_kernel_size=(4, 4),
-                conv_strides=(2, 2),
-                conv1_channels_out=16,
-                conv2_channels_out=32,
-                final_dense_inputsize=100,
-                **kwargs):
-    """
-    Define the model architecture.
-
-    Args:
-        input_shape (numpy.ndarray): The shape of the data
-        num_classes (int): The number of classes of the dataset
-
-    Returns:
-        tensorflow.python.keras.engine.sequential.Sequential: The model defined in Keras
-
-    """
-    import tensorflow as tf  # NOQA
-    import tensorflow.keras as ke  # NOQA
-
-    from tensorflow.keras import Sequential  # NOQA
-    from tensorflow.keras.layers import Conv2D, Flatten, Dense  # NOQA
-    config = tf.compat.v1.ConfigProto()
-    config.gpu_options.allow_growth = True
-    config.intra_op_parallelism_threads = 112
-    config.inter_op_parallelism_threads = 1
-    sess = tf.compat.v1.Session(config=config)
-    model = Sequential()
-
-    model.add(Conv2D(conv1_channels_out,
-                     kernel_size=conv_kernel_size,
-                     strides=conv_strides,
-                     activation='relu',
-                     input_shape=input_shape))
-
-    model.add(Conv2D(conv2_channels_out,
-                     kernel_size=conv_kernel_size,
-                     strides=conv_strides,
-                     activation='relu'))
-
-    model.add(Flatten())
-
-    model.add(Dense(final_dense_inputsize, activation='relu'))
-
-    model.add(Dense(num_classes, activation='softmax'))
-
-    model.compile(loss=ke.losses.categorical_crossentropy,
-                  optimizer=ke.optimizers.Adam(),
-                  metrics=['accuracy'])
-
-    # initialize the optimizer variables
-    opt_vars = model.optimizer.variables()
-
-    for v in opt_vars:
-        v.initializer.run(session=sess)
-
-    return model
-
-
-if __name__ == '__main__':
-    fx.init('keras_cnn_mnist')
-    from openfl.federated import FederatedDataSet
-    from openfl.federated import FederatedModel
-    from tensorflow.python.keras.utils.data_utils import get_file
-
-    origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
-    path = get_file('mnist.npz',
-                    origin=origin_folder + 'mnist.npz',
-                    file_hash='731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1')
-
-    with np.load(path) as f:
-        # get all of mnist
-        X_train = f['x_train']
-        y_train = f['y_train']
-
-        X_valid = f['x_test']
-        y_valid = f['y_test']
-    img_rows, img_cols = 28, 28
-    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
-    X_valid = X_valid.reshape(X_valid.shape[0], img_rows, img_cols, 1)
-    X_train = X_train.astype('float32')
-    X_valid = X_valid.astype('float32')
-    X_train /= 255
-    X_valid /= 255
-
-    classes = 10
-    y_train = one_hot(y_train, classes)
-    y_valid = one_hot(y_valid, classes)
-
-    feature_shape = X_train.shape[1]
-
-    fl_data = FederatedDataSet(X_train, y_train, X_valid, y_valid,
-                               batch_size=32, num_classes=classes)
-    fl_model = FederatedModel(build_model=build_model, data_loader=fl_data)
-    collaborator_models = fl_model.setup(num_collaborators=2)
-    collaborators = {'one': collaborator_models[0], 'two': collaborator_models[1]}
-    print(f'Original training data size: {len(X_train)}')
-    print(f'Original validation data size: {len(X_valid)}\n')
-
-    # Collaborator one's data
-    print(f'Collaborator one\'s training data size: '
-          f'{len(collaborator_models[0].data_loader.X_train)}')
-    print(f'Collaborator one\'s validation data size: '
-          f'{len(collaborator_models[0].data_loader.X_valid)}\n')
-
-    # Collaborator two's data
-    print(f'Collaborator two\'s training data size: '
-          f'{len(collaborator_models[1].data_loader.X_train)}')
-    print(f'Collaborator two\'s validation data size: '
-          f'{len(collaborator_models[1].data_loader.X_valid)}\n')
-
-    print(fx.get_plan())
-    final_fl_model = fx.run_experiment(collaborators, {'aggregator.settings.rounds_to_train': 5})
-    final_fl_model.save_native('final_pytorch_model.h5')
diff --git a/tests/github/python_native_torch.py b/tests/github/python_native_torch.py
deleted file mode 100644
index 402110c9ea..0000000000
--- a/tests/github/python_native_torch.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (C) 2020-2023 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-"""Python native tests."""
-
-import numpy as np
-
-import openfl.native as fx
-
-
-def one_hot(labels, classes):
-    """One-hot encode `labels` using `classes` classes."""
-    return np.eye(classes)[labels]
-
-
-fx.init('torch_cnn_mnist')
-
-if __name__ == '__main__':
-    import torch
-    import torch.nn as nn
-    import torch.nn.functional as F
-    import torch.optim as optim
-    from torchvision import datasets
-    from torchvision import transforms
-
-    from openfl.federated import FederatedDataSet
-    from openfl.federated import FederatedModel
-
-    def cross_entropy(output, target):
-        """Binary cross-entropy metric."""
-        return F.cross_entropy(input=output, target=target)
-
-    class Net(nn.Module):
-        """PyTorch Neural Network."""
-
-        def __init__(self):
-            """Initialize."""
-            super(Net, self).__init__()
-            self.conv1 = nn.Conv2d(1, 16, 3)
-            self.pool = nn.MaxPool2d(2, 2)
-            self.conv2 = nn.Conv2d(16, 32, 3)
-            self.fc1 = nn.Linear(32 * 5 * 5, 32)
-            self.fc2 = nn.Linear(32, 84)
-            self.fc3 = nn.Linear(84, 10)
-
-        def forward(self, x):
-            """Forward pass of the network."""
-            x = self.pool(F.relu(self.conv1(x)))
-            x = self.pool(F.relu(self.conv2(x)))
-            x = x.view(x.size(0), -1)
-            x = F.relu(self.fc1(x))
-            x = F.relu(self.fc2(x))
-            x = self.fc3(x)
-            return x
-
-    transform = transforms.Compose([transforms.ToTensor(),
-                                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-
-    trainset = datasets.MNIST(root='./data', train=True,
-                              download=True, transform=transform)
-
-    train_images, train_labels = trainset.train_data, np.array(trainset.train_labels)
-    train_images = torch.from_numpy(np.expand_dims(train_images, axis=1)).float()
-
-    validset = datasets.MNIST(root='./data', train=False,
-                              download=True, transform=transform)
-
-    valid_images, valid_labels = validset.test_data, np.array(validset.test_labels)
-    valid_images = torch.from_numpy(np.expand_dims(valid_images, axis=1)).float()
-    valid_labels = one_hot(valid_labels, 10)
-    feature_shape = train_images.shape[1]
-    classes = 10
-
-    fl_data = FederatedDataSet(train_images, train_labels, valid_images, valid_labels,
-                               batch_size=32, num_classes=classes)
-    fl_model = FederatedModel(build_model=Net, optimizer=lambda x: optim.Adam(x, lr=1e-4),
-                              loss_fn=cross_entropy, data_loader=fl_data)
-    collaborator_models = fl_model.setup(num_collaborators=2)
-    collaborators = {'one': collaborator_models[0], 'two': collaborator_models[1]}
-    print(f'Original training data size: {len(train_images)}')
-    print(f'Original validation data size: {len(valid_images)}\n')
-
-    # Collaborator one's data
-    print(f'Collaborator one\'s training data size: '
-          f'{len(collaborator_models[0].data_loader.X_train)}')
-    print(f'Collaborator one\'s validation data size: '
-          f'{len(collaborator_models[0].data_loader.X_valid)}\n')
-
-    # Collaborator two's data
-    print(f'Collaborator two\'s training data size: '
-          f'{len(collaborator_models[1].data_loader.X_train)}')
-    print(f'Collaborator two\'s validation data size: '
-          f'{len(collaborator_models[1].data_loader.X_valid)}\n')
-
-    print(fx.get_plan())
-    final_fl_model = fx.run_experiment(collaborators, {'aggregator.settings.rounds_to_train': 5})
-    final_fl_model.save_native('final_pytorch_model')
diff --git a/tests/openfl/federated/plan/plan_example.yaml b/tests/openfl/federated/plan/plan_example.yaml
index af976f3f43..8afe672b11 100644
--- a/tests/openfl/federated/plan/plan_example.yaml
+++ b/tests/openfl/federated/plan/plan_example.yaml
@@ -9,6 +9,7 @@ aggregator :
     best_state_path : save/best.pbuf
     last_state_path : save/last.pbuf
     rounds_to_train : 10
+    persistent_db_path: tensor.db
 
 collaborator :
   defaults : plan/defaults/collaborator.yaml
diff --git a/tests/openfl/interface/test_aggregator_api.py b/tests/openfl/interface/test_aggregator_api.py
index 14572cf8ab..7986634368 100644
--- a/tests/openfl/interface/test_aggregator_api.py
+++ b/tests/openfl/interface/test_aggregator_api.py
@@ -17,7 +17,19 @@ def test_aggregator_start(mock_parse):
     plan_config = plan_path.joinpath('plan.yaml')
     cols_config = plan_path.joinpath('cols.yaml')
 
-    mock_parse.return_value = mock.Mock()
+    # Create a mock plan with the required fields
+    mock_plan = mock.MagicMock()
+    mock_plan.__getitem__.side_effect = {'task_group': 'learning'}.get
+    mock_plan.get = {'task_group': 'learning'}.get
+    # Add the config attribute with proper nesting
+    mock_plan.config = {
+        'aggregator': {
+            'settings': {
+                'task_group': 'learning'
+            }
+        }
+    }
+    mock_parse.return_value = mock_plan
 
     ret = start_(['-p', plan_config, '-c', cols_config], standalone_mode=False)
@@ -32,7 +44,20 @@ def test_aggregator_start_illegal_plan(mock_parse, mock_is_directory_traversal):
     plan_config = plan_path.joinpath('plan.yaml')
     cols_config = plan_path.joinpath('cols.yaml')
 
-    mock_parse.return_value = mock.Mock()
+    # Create a mock plan with the required fields
+    mock_plan = mock.MagicMock()
+    mock_plan.__getitem__.side_effect = {'task_group': 'learning'}.get
+    mock_plan.get = {'task_group': 'learning'}.get
+    # Add the config attribute with proper nesting
+    mock_plan.config = {
+        'aggregator': {
+            'settings': {
+                'task_group': 'learning'
+            }
+        }
+    }
+    mock_parse.return_value = mock_plan
+
     mock_is_directory_traversal.side_effect = [True, False]
 
     with TestCase.assertRaises(test_aggregator_start_illegal_plan, SystemExit):
@@ -48,7 +73,20 @@ def test_aggregator_start_illegal_cols(mock_parse, mock_is_directory_traversal):
     plan_config = plan_path.joinpath('plan.yaml')
     cols_config = plan_path.joinpath('cols.yaml')
 
-    mock_parse.return_value = mock.Mock()
+    # Create a mock plan with the required fields
+    mock_plan = mock.MagicMock()
+    mock_plan.__getitem__.side_effect = {'task_group': 'learning'}.get
+    mock_plan.get = {'task_group': 'learning'}.get
+    # Add the config attribute with proper nesting
+    mock_plan.config = {
+        'aggregator': {
+            'settings': {
+                'task_group': 'learning'
+            }
+        }
+    }
+    mock_parse.return_value = mock_plan
+
    mock_is_directory_traversal.side_effect = [False, True]
 
     with TestCase.assertRaises(test_aggregator_start_illegal_cols, SystemExit):
diff --git a/tests/openfl/native/__init__.py b/tests/openfl/native/__init__.py
deleted file mode 100644
index 319114d31f..0000000000
--- a/tests/openfl/native/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (C) 2020-2023 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-"""tests.openfl.native package."""
diff --git a/tests/openfl/native/base_example.yaml b/tests/openfl/native/base_example.yaml
deleted file mode 100644
index bd0d342898..0000000000
--- a/tests/openfl/native/base_example.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-Planet:
-  Earth:
-    Continent:
-      North-America:
-        USA:
-          Oregon: 'Portland'
-  Mars: ['Water', 'Ice']
-  Pluto: []
\ No newline at end of file
diff --git a/tests/openfl/native/test_update_plan.py b/tests/openfl/native/test_update_plan.py
deleted file mode 100644
index e92c2a5cc0..0000000000
--- a/tests/openfl/native/test_update_plan.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (C) 2020-2023 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-"""Update plan test module."""
-import pytest
-from pathlib import Path
-
-from openfl.federated import Plan
-from openfl.native import update_plan
-
-
-@pytest.mark.parametrize(
-    'override_config,expected_result', [
-        ({},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}}),
-        ({'Planet.Earth.Continent.Australia': 'Sydney'},
-         {'Planet': {'Earth': {'Continent': {'Australia': 'Sydney',
-                                             'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}})
-    ])
-def test_update_plan_new_key_value_addition(override_config, expected_result):
-    """Test update_plan for adding a new key value pair."""
-    plan = Plan()
-    plan.config = Plan.load(Path('./tests/openfl/native/base_example.yaml'))
-    result = update_plan(override_config, plan=plan, resolve=False)
-    assert result.config == expected_result
-
-
-@pytest.mark.parametrize(
-    'override_config,expected_result', [
-        ({'Planet.Jupiter': ['Sun', 'Rings']},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': [],
-                     'Jupiter': ['Sun', 'Rings']}}),
-        ({'Planet.Earth.Continent.Australia': ['Sydney', 'Melbourne']},
-         {'Planet': {'Earth': {'Continent': {'Australia': ['Sydney', 'Melbourne'],
-                                             'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}})
-    ])
-def test_update_plan_new_key_list_value_addition(override_config, expected_result):
-    """Test update_plan or adding a new key with value as a list."""
-    plan = Plan()
-    plan.config = Plan.load(Path('./tests/openfl/native/base_example.yaml'))
-    result = update_plan(override_config, plan=plan, resolve=False)
-    assert result.config == expected_result
-
-
-@pytest.mark.parametrize(
-    'override_config,expected_result', [
-        ({'Planet.Earth.Continent.North-America.USA.Oregon': 'Salem'},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Salem'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}}),
-        ({'Planet.Mars': 'Moon'},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': 'Moon',
-                     'Pluto': []}}),
-        ({'Planet.Pluto': 'Tiny'},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': 'Tiny'}})
-    ])
-def test_update_plan_existing_key_value_updation(override_config, expected_result):
-    """Test update_plan for adding a new key value pair."""
-    plan = Plan()
-    plan.config = Plan.load(Path('./tests/openfl/native/base_example.yaml'))
-    result = update_plan(override_config, plan=plan, resolve=False)
-    assert result.config == expected_result
-
-
-@pytest.mark.parametrize(
-    'override_config,expected_result', [
-        ({'Planet.Mars': ['Water', 'Moon', 'Ice']},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Moon', 'Ice'],
-                     'Pluto': []}}),
-        ({'Planet.Mars': ['Water']},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water'],
-                     'Pluto': []}}),
-        ({'Planet.Earth.Continent.North-America.USA.Oregon': ['Portland', 'Salem']},
-         {'Planet': {'Earth': {'Continent': {'North-America':
-                                             {'USA': {'Oregon': ['Portland', 'Salem']}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}}),
-        ({'Planet.Earth.Continent.North-America.USA.Oregon': ['Salem']},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': ['Salem']}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': []}}),
-        ({'Planet.Pluto': ['Tiny', 'Far']},
-         {'Planet': {'Earth': {'Continent': {'North-America': {'USA': {'Oregon': 'Portland'}}}},
-                     'Mars': ['Water', 'Ice'],
-                     'Pluto': ['Tiny', 'Far']}})
-    ])
-def test_update_plan_existing_key_list_value_updation(override_config, expected_result):
-    """Test update_plan or adding a new key with value as a list."""
-    plan = Plan()
-    plan.config = Plan.load(Path('./tests/openfl/native/base_example.yaml'))
-    result = update_plan(override_config, plan=plan, resolve=False)
-    assert result.config == expected_result