diff --git a/.github/workflows/task_runner_basic_e2e.yml b/.github/workflows/task_runner_basic_e2e.yml index b50eedd526..b9f7832d98 100644 --- a/.github/workflows/task_runner_basic_e2e.yml +++ b/.github/workflows/task_runner_basic_e2e.yml @@ -36,7 +36,7 @@ jobs: matrix: # There are open issues for some of the models, so excluding them for now: # model_name: [ "torch_cnn_mnist", "keras_cnn_mnist", "torch_cnn_histology" ] - model_name: ["torch_cnn_mnist", "keras_cnn_mnist"] + model_name: ["torch_cnn_mnist", "keras/cnn_mnist"] python_version: ["3.10", "3.11", "3.12"] fail-fast: false # do not immediately fail if one of the combinations fail diff --git a/.github/workflows/taskrunner.yml b/.github/workflows/taskrunner.yml index a9093be4c1..77a87a16d5 100644 --- a/.github/workflows/taskrunner.yml +++ b/.github/workflows/taskrunner.yml @@ -42,4 +42,4 @@ jobs: pip install . - name: Test TaskRunner API run: | - python -m tests.github.test_hello_federation --template keras_cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model + python -m tests.github.test_hello_federation --template keras/cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model diff --git a/.github/workflows/tr_docker_gramine_direct.yml b/.github/workflows/tr_docker_gramine_direct.yml index 309351f385..2ae6936495 100644 --- a/.github/workflows/tr_docker_gramine_direct.yml +++ b/.github/workflows/tr_docker_gramine_direct.yml @@ -27,7 +27,7 @@ jobs: - name: Create workspace image run: | - fx workspace create --prefix example_workspace --template keras_cnn_mnist + fx workspace create --prefix example_workspace --template keras/cnn_mnist cd example_workspace fx plan initialize -a localhost diff --git a/.github/workflows/tr_docker_native.yml b/.github/workflows/tr_docker_native.yml index f5af424a18..2b5f490a30 100644 --- a/.github/workflows/tr_docker_native.yml +++ b/.github/workflows/tr_docker_native.yml @@ -27,7 +27,7 @@ jobs: - name: Create workspace image run: | - fx workspace create --prefix example_workspace --template keras_cnn_mnist + fx workspace create --prefix example_workspace --template keras/cnn_mnist cd example_workspace fx plan initialize -a localhost fx workspace dockerize --save --revision https://github.com/${GITHUB_REPOSITORY}.git@${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index c968e85f11..80f3ea2678 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -50,4 +50,4 @@ jobs: pip install . - name: Test TaskRunner API run: | - python -m tests.github.test_hello_federation --template keras_cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model + python -m tests.github.test_hello_federation --template keras/cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 341b93b7f1..5486ea56fc 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -48,4 +48,4 @@ jobs: pip install . 
- name: Test TaskRunner API run: | - python -m tests.github.test_hello_federation --template keras_cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model \ No newline at end of file + python -m tests.github.test_hello_federation --template keras/cnn_mnist --fed_workspace aggregator --col1 col1 --col2 col2 --rounds-to-train 3 --save-model output_model \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 73f919c844..8b9155410c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -6,13 +6,12 @@ def snykData = [ // CN-14619 snyk test CLI does not support -f in requirements.txt file // 'openfl-workspace_torch_cnn_histology': 'openfl-workspace/torch_cnn_histology/requirements.txt', 'openfl-workspace_torch_cnn_histology_src': 'openfl-workspace/torch_cnn_histology/src/requirements.txt', - 'openfl-workspace_keras_nlp': 'openfl-workspace/keras_nlp/requirements.txt', + 'openfl-workspace_keras_nlp': 'openfl-workspace/keras/nlp/requirements.txt', 'openfl-workspace_torch_cnn_mnist': 'openfl-workspace/torch_cnn_mnist/requirements.txt', 'openfl-workspace_torch_unet_kvasir': 'openfl-workspace/torch_unet_kvasir/requirements.txt', 'openfl-workspace_tf_cnn_histology': 'openfl-workspace/tf_cnn_histology/requirements.txt', 'openfl-workspace_tf_3dunet_brats': 'openfl-workspace/tf_3dunet_brats/requirements.txt', - 'openfl-workspace_keras_cnn_with_compression': 'openfl-workspace/keras_cnn_with_compression/requirements.txt', - 'openfl-workspace_keras_cnn_mnist': 'openfl-workspace/keras_cnn_mnist/requirements.txt', + 'openfl-workspace_keras_cnn_mnist': 'openfl-workspace/keras/cnn_mnist/requirements.txt', 'openfl-tutorials_interactive_api_pytorch_medmnist_2d_envoy': 'openfl-tutorials/interactive_api/PyTorch_MedMNIST_2D/envoy/requirements.txt', 'openfl-tutorials_interactive_api_pytorch_dogscats_vit_workspace': 'openfl-tutorials/interactive_api/PyTorch_DogsCats_ViT/workspace/requirements.txt', 'openfl-tutorials_interactive_api_pytorch_histology_envoy': 'openfl-tutorials/interactive_api/PyTorch_Histology/envoy/requirements.txt', diff --git a/docs/tutorials/taskrunner.ipynb b/docs/tutorials/taskrunner.ipynb index d19fcdc6d0..a95236e17b 100644 --- a/docs/tutorials/taskrunner.ipynb +++ b/docs/tutorials/taskrunner.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "!fx workspace create --prefix ./mnist_example --template keras_cnn_mnist\n", + "!fx workspace create --prefix ./mnist_example --template keras/cnn_mnist\n", "%cd ./mnist_example" ] }, diff --git a/openfl-workspace/keras_2dunet/.workspace b/openfl-workspace/gandlf_seg_test/.workspace similarity index 100% rename from openfl-workspace/keras_2dunet/.workspace rename to openfl-workspace/gandlf_seg_test/.workspace diff --git a/openfl-workspace/keras_cnn_mnist/.workspace b/openfl-workspace/keras/2dunet/.workspace similarity index 100% rename from openfl-workspace/keras_cnn_mnist/.workspace rename to openfl-workspace/keras/2dunet/.workspace diff --git a/openfl-workspace/keras_2dunet/README.md b/openfl-workspace/keras/2dunet/README.md similarity index 100% rename from openfl-workspace/keras_2dunet/README.md rename to openfl-workspace/keras/2dunet/README.md diff --git a/openfl-workspace/keras_2dunet/plan/cols.yaml b/openfl-workspace/keras/2dunet/plan/cols.yaml similarity index 100% rename from openfl-workspace/keras_2dunet/plan/cols.yaml rename to openfl-workspace/keras/2dunet/plan/cols.yaml diff --git a/openfl-workspace/keras_2dunet/plan/data.yaml 
b/openfl-workspace/keras/2dunet/plan/data.yaml similarity index 100% rename from openfl-workspace/keras_2dunet/plan/data.yaml rename to openfl-workspace/keras/2dunet/plan/data.yaml diff --git a/openfl-workspace/keras_2dunet/plan/defaults b/openfl-workspace/keras/2dunet/plan/defaults similarity index 100% rename from openfl-workspace/keras_2dunet/plan/defaults rename to openfl-workspace/keras/2dunet/plan/defaults diff --git a/openfl-workspace/keras_2dunet/plan/plan.yaml b/openfl-workspace/keras/2dunet/plan/plan.yaml similarity index 100% rename from openfl-workspace/keras_2dunet/plan/plan.yaml rename to openfl-workspace/keras/2dunet/plan/plan.yaml diff --git a/openfl-workspace/keras_2dunet/requirements.txt b/openfl-workspace/keras/2dunet/requirements.txt similarity index 100% rename from openfl-workspace/keras_2dunet/requirements.txt rename to openfl-workspace/keras/2dunet/requirements.txt diff --git a/openfl-workspace/keras_2dunet/src/__init__.py b/openfl-workspace/keras/2dunet/src/__init__.py similarity index 100% rename from openfl-workspace/keras_2dunet/src/__init__.py rename to openfl-workspace/keras/2dunet/src/__init__.py diff --git a/openfl-workspace/keras_2dunet/src/brats_utils.py b/openfl-workspace/keras/2dunet/src/brats_utils.py similarity index 100% rename from openfl-workspace/keras_2dunet/src/brats_utils.py rename to openfl-workspace/keras/2dunet/src/brats_utils.py diff --git a/openfl-workspace/keras_2dunet/src/dataloader.py b/openfl-workspace/keras/2dunet/src/dataloader.py similarity index 100% rename from openfl-workspace/keras_2dunet/src/dataloader.py rename to openfl-workspace/keras/2dunet/src/dataloader.py diff --git a/openfl-workspace/keras_2dunet/src/nii_reader.py b/openfl-workspace/keras/2dunet/src/nii_reader.py similarity index 100% rename from openfl-workspace/keras_2dunet/src/nii_reader.py rename to openfl-workspace/keras/2dunet/src/nii_reader.py diff --git a/openfl-workspace/keras_2dunet/src/taskrunner.py b/openfl-workspace/keras/2dunet/src/taskrunner.py similarity index 100% rename from openfl-workspace/keras_2dunet/src/taskrunner.py rename to openfl-workspace/keras/2dunet/src/taskrunner.py diff --git a/openfl-workspace/keras_nlp/.workspace b/openfl-workspace/keras/cnn_mnist/.workspace similarity index 100% rename from openfl-workspace/keras_nlp/.workspace rename to openfl-workspace/keras/cnn_mnist/.workspace diff --git a/openfl-workspace/keras_cnn_mnist/plan/cols.yaml b/openfl-workspace/keras/cnn_mnist/plan/cols.yaml similarity index 100% rename from openfl-workspace/keras_cnn_mnist/plan/cols.yaml rename to openfl-workspace/keras/cnn_mnist/plan/cols.yaml diff --git a/openfl-workspace/keras_cnn_mnist/plan/data.yaml b/openfl-workspace/keras/cnn_mnist/plan/data.yaml similarity index 100% rename from openfl-workspace/keras_cnn_mnist/plan/data.yaml rename to openfl-workspace/keras/cnn_mnist/plan/data.yaml diff --git a/openfl-workspace/keras_cnn_mnist/plan/defaults b/openfl-workspace/keras/cnn_mnist/plan/defaults similarity index 100% rename from openfl-workspace/keras_cnn_mnist/plan/defaults rename to openfl-workspace/keras/cnn_mnist/plan/defaults diff --git a/openfl-workspace/keras_cnn_mnist/plan/plan.yaml b/openfl-workspace/keras/cnn_mnist/plan/plan.yaml similarity index 100% rename from openfl-workspace/keras_cnn_mnist/plan/plan.yaml rename to openfl-workspace/keras/cnn_mnist/plan/plan.yaml diff --git a/openfl-workspace/keras/cnn_mnist/requirements.txt b/openfl-workspace/keras/cnn_mnist/requirements.txt new file mode 100644 index 0000000000..34a7b94009 --- /dev/null 
+++ b/openfl-workspace/keras/cnn_mnist/requirements.txt @@ -0,0 +1,2 @@ +keras==3.6.0 +tensorflow==2.18.0 \ No newline at end of file diff --git a/openfl-workspace/keras_cnn_mnist/src/__init__.py b/openfl-workspace/keras/cnn_mnist/src/__init__.py similarity index 100% rename from openfl-workspace/keras_cnn_mnist/src/__init__.py rename to openfl-workspace/keras/cnn_mnist/src/__init__.py diff --git a/openfl-workspace/keras_cnn_mnist/src/dataloader.py b/openfl-workspace/keras/cnn_mnist/src/dataloader.py similarity index 100% rename from openfl-workspace/keras_cnn_mnist/src/dataloader.py rename to openfl-workspace/keras/cnn_mnist/src/dataloader.py diff --git a/openfl-workspace/keras_cnn_mnist/src/mnist_utils.py b/openfl-workspace/keras/cnn_mnist/src/mnist_utils.py similarity index 100% rename from openfl-workspace/keras_cnn_mnist/src/mnist_utils.py rename to openfl-workspace/keras/cnn_mnist/src/mnist_utils.py diff --git a/openfl-workspace/keras_cnn_mnist/src/taskrunner.py b/openfl-workspace/keras/cnn_mnist/src/taskrunner.py similarity index 100% rename from openfl-workspace/keras_cnn_mnist/src/taskrunner.py rename to openfl-workspace/keras/cnn_mnist/src/taskrunner.py diff --git a/openfl-workspace/keras/nlp/.workspace b/openfl-workspace/keras/nlp/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/keras/nlp/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/keras_nlp/plan/cols.yaml b/openfl-workspace/keras/nlp/plan/cols.yaml similarity index 100% rename from openfl-workspace/keras_nlp/plan/cols.yaml rename to openfl-workspace/keras/nlp/plan/cols.yaml diff --git a/openfl-workspace/keras_nlp/plan/data.yaml b/openfl-workspace/keras/nlp/plan/data.yaml similarity index 100% rename from openfl-workspace/keras_nlp/plan/data.yaml rename to openfl-workspace/keras/nlp/plan/data.yaml diff --git a/openfl-workspace/keras_nlp/plan/plan.yaml b/openfl-workspace/keras/nlp/plan/plan.yaml similarity index 100% rename from openfl-workspace/keras_nlp/plan/plan.yaml rename to openfl-workspace/keras/nlp/plan/plan.yaml diff --git a/openfl-workspace/keras_nlp/requirements.txt b/openfl-workspace/keras/nlp/requirements.txt similarity index 100% rename from openfl-workspace/keras_nlp/requirements.txt rename to openfl-workspace/keras/nlp/requirements.txt diff --git a/openfl-workspace/keras_nlp/src/__init__.py b/openfl-workspace/keras/nlp/src/__init__.py similarity index 100% rename from openfl-workspace/keras_nlp/src/__init__.py rename to openfl-workspace/keras/nlp/src/__init__.py diff --git a/openfl-workspace/keras_nlp/src/dataloader.py b/openfl-workspace/keras/nlp/src/dataloader.py similarity index 100% rename from openfl-workspace/keras_nlp/src/dataloader.py rename to openfl-workspace/keras/nlp/src/dataloader.py diff --git a/openfl-workspace/keras_nlp/src/dataloader_utils.py b/openfl-workspace/keras/nlp/src/dataloader_utils.py similarity index 100% rename from openfl-workspace/keras_nlp/src/dataloader_utils.py rename to openfl-workspace/keras/nlp/src/dataloader_utils.py diff --git a/openfl-workspace/keras_nlp/src/taskrunner.py b/openfl-workspace/keras/nlp/src/taskrunner.py similarity index 100% rename from openfl-workspace/keras_nlp/src/taskrunner.py rename to openfl-workspace/keras/nlp/src/taskrunner.py diff --git a/openfl-workspace/keras_cnn_mnist/requirements.txt b/openfl-workspace/keras_cnn_mnist/requirements.txt deleted file mode 100644 index 5fa9907811..0000000000 --- a/openfl-workspace/keras_cnn_mnist/requirements.txt +++ /dev/null 
@@ -1,3 +0,0 @@ -keras==3.6.0 -tensorflow==2.18.0 - diff --git a/openfl-workspace/keras_jax/nlp/.workspace b/openfl-workspace/keras_jax/nlp/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/keras_jax/nlp/plan/cols.yaml b/openfl-workspace/keras_jax/nlp/plan/cols.yaml new file mode 100644 index 0000000000..95307de3bc --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/plan/cols.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +collaborators: + \ No newline at end of file diff --git a/openfl-workspace/keras_jax/nlp/plan/data.yaml b/openfl-workspace/keras_jax/nlp/plan/data.yaml new file mode 100644 index 0000000000..257c7825fe --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/plan/data.yaml @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +# collaborator_name,data_directory_path +one,1 + + diff --git a/openfl-workspace/keras_jax/nlp/plan/plan.yaml b/openfl-workspace/keras_jax/nlp/plan/plan.yaml new file mode 100644 index 0000000000..ce7476ab85 --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/plan/plan.yaml @@ -0,0 +1,46 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +aggregator : + defaults : plan/defaults/aggregator.yaml + template : openfl.component.Aggregator + settings : + init_state_path : save/init.pbuf + best_state_path : save/best.pbuf + last_state_path : save/last.pbuf + rounds_to_train : 10 + +collaborator : + defaults : plan/defaults/collaborator.yaml + template : openfl.component.Collaborator + settings : + db_store_rounds: 2 + delta_updates : false + opt_treatment : RESET + +data_loader : + defaults : plan/defaults/data_loader.yaml + template : src.dataloader.NLPDataLoader + settings : + collaborator_count : 2 + batch_size : 64 + split_ratio: 0.2 + num_samples: 10000 + +task_runner : + defaults : plan/defaults/task_runner.yaml + template : src.taskrunner.KerasNLP + settings : + latent_dim : 256 + +network : + defaults : plan/defaults/network.yaml + +assigner : + defaults : plan/defaults/assigner.yaml + +tasks : + defaults : plan/defaults/tasks_keras.yaml + +compression_pipeline : + defaults : plan/defaults/compression_pipeline.yaml diff --git a/openfl-workspace/keras_jax/nlp/requirements.txt b/openfl-workspace/keras_jax/nlp/requirements.txt new file mode 100644 index 0000000000..38c991c679 --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/requirements.txt @@ -0,0 +1,2 @@ +keras==3.6.0 +jax==0.4.38 \ No newline at end of file diff --git a/openfl-workspace/keras_jax/nlp/src/__init__.py b/openfl-workspace/keras_jax/nlp/src/__init__.py new file mode 100644 index 0000000000..8201974015 --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/src/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2021-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""openfl nlp keras template.""" diff --git a/openfl-workspace/keras_jax/nlp/src/dataloader.py b/openfl-workspace/keras_jax/nlp/src/dataloader.py new file mode 100644 index 0000000000..d7180b4d3c --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/src/dataloader.py @@ -0,0 +1,142 @@ 
+"""Copyright (C) 2020-2021 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +from logging import getLogger +from typing import Optional +from typing import Iterator +from typing import Tuple +from typing import Union + +import numpy as np +import src.dataloader_utils as dlu + +from openfl.federated import KerasDataLoader + +logger = getLogger(__name__) + + +class NLPDataLoader(KerasDataLoader): + """NLP Dataloader template.""" + + def __init__(self, collaborator_count: int, split_ratio: float, + num_samples: int, data_path: str, batch_size: int, **kwargs) -> None: + """Instantiate the data object. + + Args: + data_path: The file path to the data Returns: + batch_size: The batch size of the data loader tuple: shape of an example feature array + **kwargs: Additional arguments, passed to super init and load_mnist_shard + + Returns: + none + """ + self.shard_num = data_path + self.data_path = dlu.download_data_() + + self.batch_size = batch_size + + train, valid, details = dlu.load_shard(collaborator_count, self.shard_num, + self.data_path, num_samples, split_ratio) + + self.num_samples = details['num_samples'] + self.num_encoder_tokens = details['num_encoder_tokens'] + self.num_decoder_tokens = details['num_decoder_tokens'] + self.max_encoder_seq_length = details['max_encoder_seq_length'] + self.max_decoder_seq_length = details['max_decoder_seq_length'] + + self.X_train = [train[0], train[1]] + self.y_train = train[2] + self.X_valid = [valid[0], valid[1]] + self.y_valid = valid[2] + + def get_feature_shape(self) -> Tuple[int, ...]: + """Get the shape of an example feature array.""" + return self.X_train[0].shape + + def get_train_loader(self, batch_size: Optional[int] = None) -> Iterator[Tuple[np.ndarray]]: + """ + Get training data loader. + + Returns + ------- + loader object + """ + return self._get_batch_generator(X1=self.X_train[0], X2=self.X_train[1], + y=self.y_train, batch_size=batch_size) + + def get_valid_loader(self, batch_size: Optional[int] = None) -> Iterator[Tuple[np.ndarray]]: + """ + Get validation data loader. + + Returns: + loader object + """ + return self._get_batch_generator(X1=self.X_valid[0], X2=self.X_valid[1], + y=self.y_valid, batch_size=batch_size) + + def get_train_data_size(self) -> int: + """ + Get total number of training samples. + + Returns: + int: number of training samples + """ + return self.X_train[0].shape[0] + + def get_valid_data_size(self) -> int: + """ + Get total number of validation samples. + + Returns: + int: number of validation samples + """ + return self.X_valid[0].shape[0] + + @staticmethod + def _batch_generator(X1: np.ndarray, X2: np.ndarray, + y: np.ndarray, idxs: np.ndarray, + batch_size: int, + num_batches: int) -> Iterator[Tuple[np.ndarray]]: + """ + Generate batch of data. + + Args: + X: input data + y: label data + idxs: The index of the dataset + batch_size: The batch size for the data loader + num_batches: The number of batches + Yields: + tuple: input data, label data + """ + for i in range(num_batches): + a = i * batch_size + b = a + batch_size + yield (X1[idxs[a:b]], X2[idxs[a:b]]), y[idxs[a:b]] + + def _get_batch_generator(self, X1: np.ndarray, X2: np.ndarray, + y: np.ndarray, + batch_size: Union[int, None]): + """ + Return the dataset generator. 
+ + Args: + X1: input data (encoder) + X2: input data (decoder) + y: label data + batch_size: The batch size for the data loader + """ + if batch_size is None: + batch_size = self.batch_size + # shuffle data indices + idxs = np.random.permutation(np.arange(X1.shape[0])) + # compute the number of batches + num_batches = int(np.ceil(X1.shape[0] / batch_size)) + # build the generator and return it + # TODO: due to _batch_generator(X1, ...) has first param X1, all params here will be moved, + # X1 -> X2, X2 -> y, y -> idxs, idxs -> batch_size, batch_size -> num_batches, + # and num_batches -> should be unexpected in this function + return self._batch_generator(X1, X2, y, idxs, batch_size, num_batches) diff --git a/openfl-workspace/keras_jax/nlp/src/dataloader_utils.py b/openfl-workspace/keras_jax/nlp/src/dataloader_utils.py new file mode 100644 index 0000000000..6e86ee5dcb --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/src/dataloader_utils.py @@ -0,0 +1,230 @@ +"""Copyright (C) 2020-2021 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +from logging import getLogger +from os import getcwd +from os import path +from os import remove +from typing import Dict +from typing import Tuple +from zipfile import ZipFile + +import numpy as np +import requests + +logger = getLogger(__name__) + + +def download_data_() -> str: + """Download data. + + Returns: + string: relative path to data file + """ + pkg = 'fra-eng.zip' # Language file: change this to change the language + data_dir = 'data' + url = 'https://www.manythings.org/anki/' + pkg + filename = pkg.split('-')[0] + '.txt' + + workspace_dir = getcwd() + default_path = path.join(workspace_dir, data_dir) + pkgpath = path.join(default_path, pkg) # path to downloaded zipfile + filepath = path.join(default_path, filename) # path to extracted file + + if path.isfile(filepath): + return path.join(data_dir, filename) + try: + response = requests.get(url, headers={'User-Agent': 'openfl'}) + if response.status_code == 200: + with open(pkgpath, 'wb') as f: + f.write(response.content) + else: + print(f'Error while downloading {pkg} from {url}: Aborting!') + exit() + except Exception: + print(f'Error while downloading {pkg} from {url}: Aborting!') + exit() + + try: + with ZipFile(pkgpath, 'r') as z: + z.extract(filename, default_path) + except Exception: + print(f'Error while extracting {pkgpath}: Aborting!') + exit() + + if path.isfile(filepath): + remove(pkgpath) + return path.join(data_dir, filename) + else: + return '' + + +def import_raw_data_( + data_path: str = '', + num_samples: int = 0 +) -> Tuple[Dict[str, int], np.ndarray, np.ndarray, np.ndarray]: + """Import data. + + Returns: + dict: variable details + numpy.ndarray: encoder input data + numpy.ndarray: decoder input data + numpy.ndarray: decoder labels + """ + # Vectorize the data. + input_texts = [] + target_texts = [] + input_characters = set() + target_characters = set() + with open(data_path, 'r', encoding='utf-8') as f: + lines = f.read().split('\n') + for line in lines[: min(num_samples, len(lines) - 1)]: + input_text, target_text, _ = line.split('\t') + # We use 'tab' as the 'start sequence' character + # for the targets, and '\n' as 'end sequence' character. 
+ target_text = '\t' + target_text + '\n' + input_texts.append(input_text) + target_texts.append(target_text) + for char in input_text: + if char not in input_characters: + input_characters.add(char) + for char in target_text: + if char not in target_characters: + target_characters.add(char) + + input_characters = sorted(input_characters) + target_characters = sorted(target_characters) + num_encoder_tokens = len(input_characters) + num_decoder_tokens = len(target_characters) + max_encoder_seq_length = max([len(txt) for txt in input_texts]) + max_decoder_seq_length = max([len(txt) for txt in target_texts]) + + details = {'num_samples': len(input_texts), + 'num_encoder_tokens': num_encoder_tokens, + 'num_decoder_tokens': num_decoder_tokens, + 'max_encoder_seq_length': max_encoder_seq_length, + 'max_decoder_seq_length': max_decoder_seq_length} + + input_token_index = {char: i for i, char in enumerate(input_characters)} + target_token_index = {char: i for i, char in enumerate(target_characters)} + + encoder_input_data = np.zeros( + (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32') + + decoder_input_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') + + decoder_target_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') + + for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)): + for t, char in enumerate(input_text): + encoder_input_data[i, t, input_token_index[char]] = 1.0 + encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.0 + for t, char in enumerate(target_text): + # decoder_target_data is ahead of decoder_input_data by one timestep + decoder_input_data[i, t, target_token_index[char]] = 1.0 + if t > 0: + # decoder_target_data will be ahead by one timestep + # and will not include the start character. + decoder_target_data[i, t - 1, target_token_index[char]] = 1.0 + decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.0 + decoder_target_data[i, t:, target_token_index[' ']] = 1.0 + + logger.info(f'[DL]-import_raw_data: Number of samples = {len(input_texts)}') + logger.info(f'[DL]-import_raw_data: Number of unique input tokens = {num_encoder_tokens}') + logger.info(f'[DL]-import_raw_data: ' + f'Number of unique decoder tokens = {num_decoder_tokens}') + + logger.info(f'[DL]-import_raw_data: ' + f'Max sequence length for inputs = {max_encoder_seq_length}') + + logger.info(f'[DL]-import_raw_data: ' + f'Max sequence length for outputs = {max_decoder_seq_length}') + + logger.info(f'[DL]-import_raw_data: encoder_input_data = {encoder_input_data.shape}') + logger.info(f'[DL]-import_raw_data: decoder_input_data = {decoder_input_data.shape}') + logger.info(f'[DL]-import_raw_data: decoder_target_data = {decoder_target_data.shape}') + + return details, encoder_input_data, decoder_input_data, decoder_target_data + + +def get_datasets_(encoder_input_data: np.ndarray, decoder_input_data: np.ndarray, + decoder_target_data: np.ndarray, + num_samples: int, split_ratio: float) -> Dict[str, np.ndarray]: + """Create train/val. 
+ + Returns: + dict: Results, containing the train-valid split of the dataset (split_ratio = 0.2) + """ + import random + + random.seed(42) + train_indexes = random.sample(range(num_samples), int(num_samples * (1 - split_ratio))) + valid_indexes = np.delete(range(num_samples), train_indexes) + + # Dataset creation (2 inputs , 1 output ) + encoder_train_input = encoder_input_data[train_indexes, :, :] + decoder_train_input = decoder_input_data[train_indexes, :, :] + decoder_train_labels = decoder_target_data[train_indexes, :, :] + + encoder_valid_input = encoder_input_data[valid_indexes, :, :] + decoder_valid_input = decoder_input_data[valid_indexes, :, :] + decoder_valid_labels = decoder_target_data[valid_indexes, :, :] + + results = {'encoder_train_input': encoder_train_input, + 'decoder_train_input': decoder_train_input, + 'decoder_train_labels': decoder_train_labels, + 'encoder_valid_input': encoder_valid_input, + 'decoder_valid_input': decoder_valid_input, + 'decoder_valid_labels': decoder_valid_labels} + + logger.info(f'[DL]get_datasets: encoder_train_input = {encoder_train_input.shape}') + logger.info(f'[DL]get_datasets: decoder_train_labels= {decoder_train_labels.shape}') + + return results + + +def load_shard( + collaborator_count: int, shard_num: str, data_path: str, + num_samples: int, split_ratio: float +) -> Tuple[Tuple[np.ndarray, ...], Tuple[np.ndarray, ...], Dict[str, int]]: + """Load data-shards. + + Returns: + Tuple: ( numpy.ndarray: X_train_encoder, + numpy.ndarray: X_train_decoder, + numpy.ndarray: y_train) + Tuple: ( numpy.ndarray: X_valid_encoder, + numpy.ndarray: X_valid_decoder, + numpy.ndarray: y_valid) + Dict: details, from DataLoader_utils.get_datasets + """ + details, encoder_input_data, decoder_input_data, decoder_target_data = import_raw_data_( + data_path, + num_samples + ) + + train_val_dataset = get_datasets_(encoder_input_data, decoder_input_data, + decoder_target_data, num_samples, split_ratio) + # Get the data shards + shard_num = int(shard_num) + X_train_encoder = train_val_dataset['encoder_train_input'][shard_num::collaborator_count] + X_train_decoder = train_val_dataset['decoder_train_input'][shard_num::collaborator_count] + y_train = train_val_dataset['decoder_train_labels'][shard_num::collaborator_count] + + X_valid_encoder = train_val_dataset['encoder_valid_input'][shard_num::collaborator_count] + X_valid_decoder = train_val_dataset['decoder_valid_input'][shard_num::collaborator_count] + y_valid = train_val_dataset['decoder_valid_labels'][shard_num::collaborator_count] + + logger.info(f'[DL]load_shard: X_train_encoder = {X_train_encoder.shape}') + logger.info(f'[DL]load_shard: y_train = {y_train.shape}') + + return ( + (X_train_encoder, X_train_decoder, y_train), + (X_valid_encoder, X_valid_decoder, y_valid), + details + ) diff --git a/openfl-workspace/keras_jax/nlp/src/taskrunner.py b/openfl-workspace/keras_jax/nlp/src/taskrunner.py new file mode 100644 index 0000000000..88563452f7 --- /dev/null +++ b/openfl-workspace/keras_jax/nlp/src/taskrunner.py @@ -0,0 +1,72 @@ +"""Copyright (C) 2020-2024 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +import keras + +from openfl.federated import KerasTaskRunner + + +class KerasNLP(KerasTaskRunner): + """A basic convolutional neural network model.""" + + def __init__(self, latent_dim, **kwargs): + """ + Init taskrunner. 
+ + Args: + **kwargs: Additional parameters to pass to the function + """ + super().__init__(**kwargs) + + self.model = self.build_model(latent_dim, + self.data_loader.num_encoder_tokens, + self.data_loader.num_decoder_tokens, + **kwargs) + + self.initialize_tensorkeys_for_functions() + + self.model.summary(print_fn=self.logger.info) + + self.logger.info(f'Train Set Size : {self.get_train_data_size()}') + + def build_model(self, latent_dim, num_encoder_tokens, num_decoder_tokens, **kwargs): + """ + Define the model architecture. + + Args: + input_shape (numpy.ndarray): The shape of the data + num_classes (int): The number of classes of the dataset + Returns: + tensorflow.python.keras.engine.sequential.Sequential: The model defined in Keras + """ + encoder_inputs = keras.Input(shape=(None, num_encoder_tokens)) + encoder = keras.layers.LSTM(latent_dim, return_state=True) + encoder_outputs, state_h, state_c = encoder(encoder_inputs) + + # We discard `encoder_outputs` and only keep the states. + encoder_states = [state_h, state_c] + + # Set up the decoder, using `encoder_states` as initial state. + decoder_inputs = keras.Input(shape=(None, num_decoder_tokens)) + + # We set up our decoder to return full output sequences, + # and to return internal states as well. We don't use the + # return states in the training model, but we will use them in inference. + decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) + decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) + decoder_dense = keras.layers.Dense(num_decoder_tokens, activation='softmax') + decoder_outputs = decoder_dense(decoder_outputs) + + # Define the model that will turn + # `encoder_input_data` & `decoder_input_data` into `decoder_target_data` + model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) + + model.compile( + optimizer="RMSprop", + loss='categorical_crossentropy', metrics=['accuracy'] + ) + + return model diff --git a/openfl-workspace/keras_torch/nlp/.workspace b/openfl-workspace/keras_torch/nlp/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/keras_torch/nlp/plan/cols.yaml b/openfl-workspace/keras_torch/nlp/plan/cols.yaml new file mode 100644 index 0000000000..95307de3bc --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/plan/cols.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +collaborators: + \ No newline at end of file diff --git a/openfl-workspace/keras_torch/nlp/plan/data.yaml b/openfl-workspace/keras_torch/nlp/plan/data.yaml new file mode 100644 index 0000000000..257c7825fe --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/plan/data.yaml @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. 
+ +# collaborator_name,data_directory_path +one,1 + + diff --git a/openfl-workspace/keras_torch/nlp/plan/plan.yaml b/openfl-workspace/keras_torch/nlp/plan/plan.yaml new file mode 100644 index 0000000000..ce7476ab85 --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/plan/plan.yaml @@ -0,0 +1,46 @@ +# Copyright (C) 2020-2021 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +aggregator : + defaults : plan/defaults/aggregator.yaml + template : openfl.component.Aggregator + settings : + init_state_path : save/init.pbuf + best_state_path : save/best.pbuf + last_state_path : save/last.pbuf + rounds_to_train : 10 + +collaborator : + defaults : plan/defaults/collaborator.yaml + template : openfl.component.Collaborator + settings : + db_store_rounds: 2 + delta_updates : false + opt_treatment : RESET + +data_loader : + defaults : plan/defaults/data_loader.yaml + template : src.dataloader.NLPDataLoader + settings : + collaborator_count : 2 + batch_size : 64 + split_ratio: 0.2 + num_samples: 10000 + +task_runner : + defaults : plan/defaults/task_runner.yaml + template : src.taskrunner.KerasNLP + settings : + latent_dim : 256 + +network : + defaults : plan/defaults/network.yaml + +assigner : + defaults : plan/defaults/assigner.yaml + +tasks : + defaults : plan/defaults/tasks_keras.yaml + +compression_pipeline : + defaults : plan/defaults/compression_pipeline.yaml diff --git a/openfl-workspace/keras_torch/nlp/requirements.txt b/openfl-workspace/keras_torch/nlp/requirements.txt new file mode 100644 index 0000000000..b35bba448a --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/requirements.txt @@ -0,0 +1,2 @@ +keras==3.6.0 +torch==2.5.1 \ No newline at end of file diff --git a/openfl-workspace/keras_torch/nlp/src/__init__.py b/openfl-workspace/keras_torch/nlp/src/__init__.py new file mode 100644 index 0000000000..8201974015 --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/src/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2021-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""openfl nlp keras template.""" diff --git a/openfl-workspace/keras_torch/nlp/src/dataloader.py b/openfl-workspace/keras_torch/nlp/src/dataloader.py new file mode 100644 index 0000000000..d7180b4d3c --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/src/dataloader.py @@ -0,0 +1,142 @@ +"""Copyright (C) 2020-2021 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +from logging import getLogger +from typing import Optional +from typing import Iterator +from typing import Tuple +from typing import Union + +import numpy as np +import src.dataloader_utils as dlu + +from openfl.federated import KerasDataLoader + +logger = getLogger(__name__) + + +class NLPDataLoader(KerasDataLoader): + """NLP Dataloader template.""" + + def __init__(self, collaborator_count: int, split_ratio: float, + num_samples: int, data_path: str, batch_size: int, **kwargs) -> None: + """Instantiate the data object. 
+ + Args: + data_path: The file path to the data Returns: + batch_size: The batch size of the data loader tuple: shape of an example feature array + **kwargs: Additional arguments, passed to super init and load_mnist_shard + + Returns: + none + """ + self.shard_num = data_path + self.data_path = dlu.download_data_() + + self.batch_size = batch_size + + train, valid, details = dlu.load_shard(collaborator_count, self.shard_num, + self.data_path, num_samples, split_ratio) + + self.num_samples = details['num_samples'] + self.num_encoder_tokens = details['num_encoder_tokens'] + self.num_decoder_tokens = details['num_decoder_tokens'] + self.max_encoder_seq_length = details['max_encoder_seq_length'] + self.max_decoder_seq_length = details['max_decoder_seq_length'] + + self.X_train = [train[0], train[1]] + self.y_train = train[2] + self.X_valid = [valid[0], valid[1]] + self.y_valid = valid[2] + + def get_feature_shape(self) -> Tuple[int, ...]: + """Get the shape of an example feature array.""" + return self.X_train[0].shape + + def get_train_loader(self, batch_size: Optional[int] = None) -> Iterator[Tuple[np.ndarray]]: + """ + Get training data loader. + + Returns + ------- + loader object + """ + return self._get_batch_generator(X1=self.X_train[0], X2=self.X_train[1], + y=self.y_train, batch_size=batch_size) + + def get_valid_loader(self, batch_size: Optional[int] = None) -> Iterator[Tuple[np.ndarray]]: + """ + Get validation data loader. + + Returns: + loader object + """ + return self._get_batch_generator(X1=self.X_valid[0], X2=self.X_valid[1], + y=self.y_valid, batch_size=batch_size) + + def get_train_data_size(self) -> int: + """ + Get total number of training samples. + + Returns: + int: number of training samples + """ + return self.X_train[0].shape[0] + + def get_valid_data_size(self) -> int: + """ + Get total number of validation samples. + + Returns: + int: number of validation samples + """ + return self.X_valid[0].shape[0] + + @staticmethod + def _batch_generator(X1: np.ndarray, X2: np.ndarray, + y: np.ndarray, idxs: np.ndarray, + batch_size: int, + num_batches: int) -> Iterator[Tuple[np.ndarray]]: + """ + Generate batch of data. + + Args: + X: input data + y: label data + idxs: The index of the dataset + batch_size: The batch size for the data loader + num_batches: The number of batches + Yields: + tuple: input data, label data + """ + for i in range(num_batches): + a = i * batch_size + b = a + batch_size + yield (X1[idxs[a:b]], X2[idxs[a:b]]), y[idxs[a:b]] + + def _get_batch_generator(self, X1: np.ndarray, X2: np.ndarray, + y: np.ndarray, + batch_size: Union[int, None]): + """ + Return the dataset generator. + + Args: + X1: input data (encoder) + X2: input data (decoder) + y: label data + batch_size: The batch size for the data loader + """ + if batch_size is None: + batch_size = self.batch_size + # shuffle data indices + idxs = np.random.permutation(np.arange(X1.shape[0])) + # compute the number of batches + num_batches = int(np.ceil(X1.shape[0] / batch_size)) + # build the generator and return it + # TODO: due to _batch_generator(X1, ...) 
has first param X1, all params here will be moved, + # X1 -> X2, X2 -> y, y -> idxs, idxs -> batch_size, batch_size -> num_batches, + # and num_batches -> should be unexpected in this function + return self._batch_generator(X1, X2, y, idxs, batch_size, num_batches) diff --git a/openfl-workspace/keras_torch/nlp/src/dataloader_utils.py b/openfl-workspace/keras_torch/nlp/src/dataloader_utils.py new file mode 100644 index 0000000000..6e86ee5dcb --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/src/dataloader_utils.py @@ -0,0 +1,230 @@ +"""Copyright (C) 2020-2021 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +from logging import getLogger +from os import getcwd +from os import path +from os import remove +from typing import Dict +from typing import Tuple +from zipfile import ZipFile + +import numpy as np +import requests + +logger = getLogger(__name__) + + +def download_data_() -> str: + """Download data. + + Returns: + string: relative path to data file + """ + pkg = 'fra-eng.zip' # Language file: change this to change the language + data_dir = 'data' + url = 'https://www.manythings.org/anki/' + pkg + filename = pkg.split('-')[0] + '.txt' + + workspace_dir = getcwd() + default_path = path.join(workspace_dir, data_dir) + pkgpath = path.join(default_path, pkg) # path to downloaded zipfile + filepath = path.join(default_path, filename) # path to extracted file + + if path.isfile(filepath): + return path.join(data_dir, filename) + try: + response = requests.get(url, headers={'User-Agent': 'openfl'}) + if response.status_code == 200: + with open(pkgpath, 'wb') as f: + f.write(response.content) + else: + print(f'Error while downloading {pkg} from {url}: Aborting!') + exit() + except Exception: + print(f'Error while downloading {pkg} from {url}: Aborting!') + exit() + + try: + with ZipFile(pkgpath, 'r') as z: + z.extract(filename, default_path) + except Exception: + print(f'Error while extracting {pkgpath}: Aborting!') + exit() + + if path.isfile(filepath): + remove(pkgpath) + return path.join(data_dir, filename) + else: + return '' + + +def import_raw_data_( + data_path: str = '', + num_samples: int = 0 +) -> Tuple[Dict[str, int], np.ndarray, np.ndarray, np.ndarray]: + """Import data. + + Returns: + dict: variable details + numpy.ndarray: encoder input data + numpy.ndarray: decoder input data + numpy.ndarray: decoder labels + """ + # Vectorize the data. + input_texts = [] + target_texts = [] + input_characters = set() + target_characters = set() + with open(data_path, 'r', encoding='utf-8') as f: + lines = f.read().split('\n') + for line in lines[: min(num_samples, len(lines) - 1)]: + input_text, target_text, _ = line.split('\t') + # We use 'tab' as the 'start sequence' character + # for the targets, and '\n' as 'end sequence' character. 
+ target_text = '\t' + target_text + '\n' + input_texts.append(input_text) + target_texts.append(target_text) + for char in input_text: + if char not in input_characters: + input_characters.add(char) + for char in target_text: + if char not in target_characters: + target_characters.add(char) + + input_characters = sorted(input_characters) + target_characters = sorted(target_characters) + num_encoder_tokens = len(input_characters) + num_decoder_tokens = len(target_characters) + max_encoder_seq_length = max([len(txt) for txt in input_texts]) + max_decoder_seq_length = max([len(txt) for txt in target_texts]) + + details = {'num_samples': len(input_texts), + 'num_encoder_tokens': num_encoder_tokens, + 'num_decoder_tokens': num_decoder_tokens, + 'max_encoder_seq_length': max_encoder_seq_length, + 'max_decoder_seq_length': max_decoder_seq_length} + + input_token_index = {char: i for i, char in enumerate(input_characters)} + target_token_index = {char: i for i, char in enumerate(target_characters)} + + encoder_input_data = np.zeros( + (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32') + + decoder_input_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') + + decoder_target_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') + + for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)): + for t, char in enumerate(input_text): + encoder_input_data[i, t, input_token_index[char]] = 1.0 + encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.0 + for t, char in enumerate(target_text): + # decoder_target_data is ahead of decoder_input_data by one timestep + decoder_input_data[i, t, target_token_index[char]] = 1.0 + if t > 0: + # decoder_target_data will be ahead by one timestep + # and will not include the start character. + decoder_target_data[i, t - 1, target_token_index[char]] = 1.0 + decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.0 + decoder_target_data[i, t:, target_token_index[' ']] = 1.0 + + logger.info(f'[DL]-import_raw_data: Number of samples = {len(input_texts)}') + logger.info(f'[DL]-import_raw_data: Number of unique input tokens = {num_encoder_tokens}') + logger.info(f'[DL]-import_raw_data: ' + f'Number of unique decoder tokens = {num_decoder_tokens}') + + logger.info(f'[DL]-import_raw_data: ' + f'Max sequence length for inputs = {max_encoder_seq_length}') + + logger.info(f'[DL]-import_raw_data: ' + f'Max sequence length for outputs = {max_decoder_seq_length}') + + logger.info(f'[DL]-import_raw_data: encoder_input_data = {encoder_input_data.shape}') + logger.info(f'[DL]-import_raw_data: decoder_input_data = {decoder_input_data.shape}') + logger.info(f'[DL]-import_raw_data: decoder_target_data = {decoder_target_data.shape}') + + return details, encoder_input_data, decoder_input_data, decoder_target_data + + +def get_datasets_(encoder_input_data: np.ndarray, decoder_input_data: np.ndarray, + decoder_target_data: np.ndarray, + num_samples: int, split_ratio: float) -> Dict[str, np.ndarray]: + """Create train/val. 
+ + Returns: + dict: Results, containing the train-valid split of the dataset (split_ratio = 0.2) + """ + import random + + random.seed(42) + train_indexes = random.sample(range(num_samples), int(num_samples * (1 - split_ratio))) + valid_indexes = np.delete(range(num_samples), train_indexes) + + # Dataset creation (2 inputs , 1 output ) + encoder_train_input = encoder_input_data[train_indexes, :, :] + decoder_train_input = decoder_input_data[train_indexes, :, :] + decoder_train_labels = decoder_target_data[train_indexes, :, :] + + encoder_valid_input = encoder_input_data[valid_indexes, :, :] + decoder_valid_input = decoder_input_data[valid_indexes, :, :] + decoder_valid_labels = decoder_target_data[valid_indexes, :, :] + + results = {'encoder_train_input': encoder_train_input, + 'decoder_train_input': decoder_train_input, + 'decoder_train_labels': decoder_train_labels, + 'encoder_valid_input': encoder_valid_input, + 'decoder_valid_input': decoder_valid_input, + 'decoder_valid_labels': decoder_valid_labels} + + logger.info(f'[DL]get_datasets: encoder_train_input = {encoder_train_input.shape}') + logger.info(f'[DL]get_datasets: decoder_train_labels= {decoder_train_labels.shape}') + + return results + + +def load_shard( + collaborator_count: int, shard_num: str, data_path: str, + num_samples: int, split_ratio: float +) -> Tuple[Tuple[np.ndarray, ...], Tuple[np.ndarray, ...], Dict[str, int]]: + """Load data-shards. + + Returns: + Tuple: ( numpy.ndarray: X_train_encoder, + numpy.ndarray: X_train_decoder, + numpy.ndarray: y_train) + Tuple: ( numpy.ndarray: X_valid_encoder, + numpy.ndarray: X_valid_decoder, + numpy.ndarray: y_valid) + Dict: details, from DataLoader_utils.get_datasets + """ + details, encoder_input_data, decoder_input_data, decoder_target_data = import_raw_data_( + data_path, + num_samples + ) + + train_val_dataset = get_datasets_(encoder_input_data, decoder_input_data, + decoder_target_data, num_samples, split_ratio) + # Get the data shards + shard_num = int(shard_num) + X_train_encoder = train_val_dataset['encoder_train_input'][shard_num::collaborator_count] + X_train_decoder = train_val_dataset['decoder_train_input'][shard_num::collaborator_count] + y_train = train_val_dataset['decoder_train_labels'][shard_num::collaborator_count] + + X_valid_encoder = train_val_dataset['encoder_valid_input'][shard_num::collaborator_count] + X_valid_decoder = train_val_dataset['decoder_valid_input'][shard_num::collaborator_count] + y_valid = train_val_dataset['decoder_valid_labels'][shard_num::collaborator_count] + + logger.info(f'[DL]load_shard: X_train_encoder = {X_train_encoder.shape}') + logger.info(f'[DL]load_shard: y_train = {y_train.shape}') + + return ( + (X_train_encoder, X_train_decoder, y_train), + (X_valid_encoder, X_valid_decoder, y_valid), + details + ) diff --git a/openfl-workspace/keras_torch/nlp/src/taskrunner.py b/openfl-workspace/keras_torch/nlp/src/taskrunner.py new file mode 100644 index 0000000000..88563452f7 --- /dev/null +++ b/openfl-workspace/keras_torch/nlp/src/taskrunner.py @@ -0,0 +1,72 @@ +"""Copyright (C) 2020-2024 Intel Corporation + SPDX-License-Identifier: Apache-2.0 + +Licensed subject to the terms of the separately executed evaluation +license agreement between Intel Corporation and you. +""" +import keras + +from openfl.federated import KerasTaskRunner + + +class KerasNLP(KerasTaskRunner): + """A basic convolutional neural network model.""" + + def __init__(self, latent_dim, **kwargs): + """ + Init taskrunner. 
+ + Args: + **kwargs: Additional parameters to pass to the function + """ + super().__init__(**kwargs) + + self.model = self.build_model(latent_dim, + self.data_loader.num_encoder_tokens, + self.data_loader.num_decoder_tokens, + **kwargs) + + self.initialize_tensorkeys_for_functions() + + self.model.summary(print_fn=self.logger.info) + + self.logger.info(f'Train Set Size : {self.get_train_data_size()}') + + def build_model(self, latent_dim, num_encoder_tokens, num_decoder_tokens, **kwargs): + """ + Define the model architecture. + + Args: + input_shape (numpy.ndarray): The shape of the data + num_classes (int): The number of classes of the dataset + Returns: + tensorflow.python.keras.engine.sequential.Sequential: The model defined in Keras + """ + encoder_inputs = keras.Input(shape=(None, num_encoder_tokens)) + encoder = keras.layers.LSTM(latent_dim, return_state=True) + encoder_outputs, state_h, state_c = encoder(encoder_inputs) + + # We discard `encoder_outputs` and only keep the states. + encoder_states = [state_h, state_c] + + # Set up the decoder, using `encoder_states` as initial state. + decoder_inputs = keras.Input(shape=(None, num_decoder_tokens)) + + # We set up our decoder to return full output sequences, + # and to return internal states as well. We don't use the + # return states in the training model, but we will use them in inference. + decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) + decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) + decoder_dense = keras.layers.Dense(num_decoder_tokens, activation='softmax') + decoder_outputs = decoder_dense(decoder_outputs) + + # Define the model that will turn + # `encoder_input_data` & `decoder_input_data` into `decoder_target_data` + model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) + + model.compile( + optimizer="RMSprop", + loss='categorical_crossentropy', metrics=['accuracy'] + ) + + return model diff --git a/openfl-workspace/torch_cnn_histology/.workspace b/openfl-workspace/torch_cnn_histology/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/torch_cnn_histology/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/torch_cnn_histology_fedcurv/.workspace b/openfl-workspace/torch_cnn_histology_fedcurv/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/torch_cnn_histology_fedcurv/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/torch_cnn_mnist/.workspace b/openfl-workspace/torch_cnn_mnist/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/torch_cnn_mnist/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl-workspace/xgb_higgs/.workspace b/openfl-workspace/xgb_higgs/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/xgb_higgs/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default + diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py index e2dd069f72..a9e5c8672e 100644 --- a/openfl/federated/task/runner_keras.py +++ b/openfl/federated/task/runner_keras.py @@ -9,6 +9,8 @@ """ import copy +import os +from importlib import util from warnings import catch_warnings, simplefilter import numpy as np @@ -17,6 +19,14 @@ from openfl.utilities import Metric, TensorKey, change_tags from openfl.utilities.split import 
split_tensor_dict_for_holdouts +# Set the KERAS_BACKEND environment variable based on the available deep learning framework +if util.find_spec("tensorflow") is not None: + os.environ["KERAS_BACKEND"] = "tensorflow" # Use TensorFlow as the backend +elif util.find_spec("torch") is not None: + os.environ["KERAS_BACKEND"] = "torch" # Use PyTorch as the backend +elif util.find_spec("jax") is not None: + os.environ["KERAS_BACKEND"] = "jax" # Use JAX as the backend + with catch_warnings(): simplefilter(action="ignore") import keras diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py index d3cb1713c5..238ab101d1 100644 --- a/openfl/interface/workspace.py +++ b/openfl/interface/workspace.py @@ -95,12 +95,15 @@ def get_templates(): Returns: list: A list of default templates. """ - - return [ - d.name - for d in WORKSPACE.glob("*") - if d.is_dir() and d.name not in ["__pycache__", "workspace", "experimental"] - ] + templates = [] + excluded_dirs = ["workspace", "experimental"] + for root, _, files in os.walk(WORKSPACE): + if any(file.endswith(".workspace") for file in files): + dir_path = os.path.relpath(root, WORKSPACE) + dir_path = dir_path.replace(os.sep, "/") + if dir_path and not any(dir_path.startswith(prefix) for prefix in excluded_dirs): + templates.append(dir_path) + return templates @workspace.command(name="create") diff --git a/tests/end_to_end/README.md b/tests/end_to_end/README.md index 191cfd0db4..b71910632d 100644 --- a/tests/end_to_end/README.md +++ b/tests/end_to_end/README.md @@ -55,7 +55,7 @@ For example, to run Task runner (bare metal approach) with - torch_cnn_mnist mod python -m pytest -s tests/end_to_end/test_suites/task_runner_tests.py -m task_runner_basic --num_rounds 5 --num_collaborators 3 --model_name torch_cnn_mnist --disable_tls ``` -And, to run Task runner (via dockerized workspace) with keras_cnn_mnist, 2 collaborators, 3 rounds: +And, to run Task runner (via dockerized workspace) with keras/cnn_mnist, 2 collaborators, 3 rounds: ```sh python -m pytest -s tests/end_to_end/test_suites/task_runner_tests.py -m task_runner_dockerized_ws --num_rounds 3 --num_collaborators 2 --model_name keras_cnn_mnist diff --git a/tests/github/test_double_ws_export.py b/tests/github/test_double_ws_export.py index 95c9440b31..7e9b42bec3 100644 --- a/tests/github/test_double_ws_export.py +++ b/tests/github/test_double_ws_export.py @@ -22,7 +22,7 @@ def main(): for entry in iterator: if entry.name not in ['__init__.py', 'workspace', 'default']: workspace_choice.append(entry.name) - parser.add_argument('--template', default='keras_cnn_mnist', choices=workspace_choice) + parser.add_argument('--template', default='keras/cnn_mnist', choices=workspace_choice) parser.add_argument('--fed_workspace', default='fed_work12345alpha81671') parser.add_argument('--col1', default='one123dragons') parser.add_argument('--col2', default='beta34unicorns') diff --git a/tests/github/test_gandlf.py b/tests/github/test_gandlf.py index a57f9f53a0..08e80e2118 100644 --- a/tests/github/test_gandlf.py +++ b/tests/github/test_gandlf.py @@ -21,7 +21,7 @@ def exec(command, directory): def main(): parser = argparse.ArgumentParser() - parser.add_argument('--template', default='keras_cnn_mnist') + parser.add_argument('--template', default='keras/cnn_mnist') parser.add_argument('--fed_workspace', default='fed_work12345alpha81671') parser.add_argument('--col1', default='one') parser.add_argument('--col2', default='two') diff --git a/tests/github/test_hello_federation.py 
b/tests/github/test_hello_federation.py index e6b84b8de2..a3ef7296a8 100644 --- a/tests/github/test_hello_federation.py +++ b/tests/github/test_hello_federation.py @@ -17,11 +17,14 @@ def main(): # Test the pipeline parser = argparse.ArgumentParser() workspace_choice = [] - with os.scandir('openfl-workspace') as iterator: - for entry in iterator: - if entry.name not in ['__init__.py', 'workspace', 'default']: - workspace_choice.append(entry.name) - parser.add_argument('--template', default='keras_cnn_mnist', choices=workspace_choice) + excluded_dirs = ['workspace', 'default', 'experimental'] + for root, _, files in os.walk('openfl-workspace'): + if any(file.endswith('.workspace') for file in files): + dir_path = os.path.relpath(root, 'openfl-workspace') + dir_path = dir_path.replace(os.sep, '/') + if dir_path and not any(dir_path.startswith(prefix) for prefix in excluded_dirs): + workspace_choice.append(dir_path) + parser.add_argument('--template', default='keras/cnn_mnist', choices=workspace_choice) parser.add_argument('--fed_workspace', default='fed_work12345alpha81671') parser.add_argument('--col1', default='one123dragons') parser.add_argument('--col2', default='beta34unicorns')
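
Note on the template rename: the keras_cnn_mnist -> keras/cnn_mnist paths used throughout the workflows and tests above are resolved by the updated get_templates() in openfl/interface/workspace.py (and the matching walk in tests/github/test_hello_federation.py), which treats any directory under openfl-workspace containing a .workspace marker file as a template. A minimal standalone sketch of that discovery logic, assuming it is run from the repository root with openfl-workspace/ present:

# Standalone sketch of the nested-template discovery introduced in
# openfl/interface/workspace.py and tests/github/test_hello_federation.py.
# Assumes a local "openfl-workspace" directory (run from the repo root).
import os

WORKSPACE = "openfl-workspace"
EXCLUDED = ["workspace", "experimental"]

templates = []
for root, _, files in os.walk(WORKSPACE):
    # A directory counts as a template if it ships a .workspace marker file.
    if any(f.endswith(".workspace") for f in files):
        rel = os.path.relpath(root, WORKSPACE).replace(os.sep, "/")
        if rel and not any(rel.startswith(prefix) for prefix in EXCLUDED):
            templates.append(rel)

print(sorted(templates))  # e.g. ['keras/2dunet', 'keras/cnn_mnist', 'keras/nlp', ...]

With the directories renamed in this change, nested names such as keras/cnn_mnist, keras/2dunet and keras/nlp become valid values for fx workspace create --template and for the --template option of the hello-federation test.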
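The runner_keras.py change selects the Keras 3 backend at import time from whichever framework is installed, which is what lets the keras/*, keras_torch/* and keras_jax/* workspaces pin keras==3.6.0 alongside tensorflow, torch or jax respectively. A standalone sketch of the same selection order (the final print is only an illustrative check, not part of the change):

# Sketch of the backend auto-selection added to openfl/federated/task/runner_keras.py.
# KERAS_BACKEND must be set before keras is imported for it to take effect.
import os
from importlib import util

if util.find_spec("tensorflow") is not None:
    os.environ["KERAS_BACKEND"] = "tensorflow"  # TensorFlow wins when several frameworks are installed
elif util.find_spec("torch") is not None:
    os.environ["KERAS_BACKEND"] = "torch"
elif util.find_spec("jax") is not None:
    os.environ["KERAS_BACKEND"] = "jax"

import keras  # noqa: E402

print(keras.backend.backend())  # reports the backend Keras actually picked up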
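The NLPDataLoader added for the two new NLP workspaces feeds Keras a pair of encoder/decoder input batches plus one target batch per step. A small self-contained illustration of that generator contract, using random arrays in place of the real fra-eng shards (shapes are illustrative only):

# Illustrative only: mimics NLPDataLoader._batch_generator from the new
# keras_jax/nlp and keras_torch/nlp workspaces, with random stand-in data.
# Each step yields ((encoder_batch, decoder_batch), target_batch).
import numpy as np

def batch_generator(x1, x2, y, idxs, batch_size, num_batches):
    for i in range(num_batches):
        a = i * batch_size
        b = a + batch_size
        yield (x1[idxs[a:b]], x2[idxs[a:b]]), y[idxs[a:b]]

n, batch_size = 256, 64
x1 = np.random.rand(n, 20, 70).astype("float32")  # encoder inputs
x2 = np.random.rand(n, 30, 90).astype("float32")  # decoder inputs
y = np.random.rand(n, 30, 90).astype("float32")   # decoder targets
idxs = np.random.permutation(n)                    # shuffled sample indices
num_batches = int(np.ceil(n / batch_size))

for (enc, dec), tgt in batch_generator(x1, x2, y, idxs, batch_size, num_batches):
    print(enc.shape, dec.shape, tgt.shape)  # (64, 20, 70) (64, 30, 90) (64, 30, 90)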