Added example with Workflow interface for finetuning Llama2 LLM #905

Open · wants to merge 7 commits into base: develop
Changes from 1 commit
@@ -69,8 +69,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install git+https://github.com/intel/openfl.git\n",
"!pip install -r requirements_workflow_interface.txt"
"!pip install git+https://github.com/intel/openfl.git"
]
},
{
@@ -120,7 +119,7 @@
"id": "8a2f407c-58f1-4f6e-8ee2-7ab81b6582e5",
"metadata": {},
"source": [
"We begin with a basic example of a PyTorch model using the transformers library. The model employs AutoTokenizer and AutoModel for Large Language Models (LLMs), which are trained on a specific dataset. The initial steps involve defining our imports, importing the pre-trained model, and loading and splitting the dataset."
"The model employs AutoTokenizer and AutoModel for Large Language Models (LLMs), which are trained on a specific dataset. The initial steps involve defining our imports, importing the pre-trained model, and loading and splitting the dataset."
]
},
{
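The tokenizer and base-model loading that this cell describes is collapsed out of the hunk above; below is a minimal sketch of what that step typically looks like with the transformers API, assuming the model_id defined later in the notebook and using AutoModelForCausalLM (the usual choice for causal-LM fine-tuning) where the text says AutoModel:

# Hypothetical sketch, not part of this PR's diff.
# Assumes access to the gated meta-llama/Llama-2-7b-hf weights on the Hugging Face Hub.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"

# Load the tokenizer and the pre-trained weights to be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)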
@@ -146,31 +145,57 @@
"from trl import SFTTrainer\n",
"from datasets import load_dataset, DatasetDict\n",
"\n",
"# Other \n",
"# Importing the datasets library and setting the Huggingface paths for downloaded datasets and cache\n",
"import datasets\n",
"from pathlib import Path\n",
"datasets.config.DOWNLOADED_DATASETS_PATH = Path(\"./files/DOWNLOADED_DATASETS_PATH\")\n",
"datasets.config.HF_DATASETS_CACHE = Path(\"./files/HF_DATASETS_LOCAL\")\n",
"\n",
"# Importing other necessary modules\n",
"from random import randrange\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "markdown",
"id": "29f6202c-79c6-4f4e-84f6-8f7c4655f664",
"metadata": {},
"source": [
"Load the specified dataset and perform hash verification."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e85e030",
"id": "31c7e8cb-c24c-4957-af0d-4189d15f79c9",
"metadata": {},
"outputs": [],
"source": [
"from openfl.utilities import validate_folder_hash\n",
"\n",
"# Llama2 model ID\n",
"model_id = \"meta-llama/Llama-2-7b-hf\"\n",
"# Dataset name\n",
"dataset_name = \"OpenAssistant/oasst1\""
"dataset_name = \"OpenAssistant/oasst1\"\n",
"\n",
"# Load the dataset using the specified dataset name\n",
"dataset = load_dataset(dataset_name) \n",
"\n",
"def verify_data():\n",
" datapath = ('./files/DOWNLOADED_DATASETS_PATH')\n",
" validate_folder_hash(datapath, '0d63ab13b316da3a8480234dfa1747fc58300ee4c83a0f954a049e212307c5dee82d4d0c26e68a22728533b0b1eccdb1')\n",
" print('Verification passed')\n",
"\n",
"verify_data()"
]
},
{
"cell_type": "markdown",
"id": "fb858ab6-9e09-41a3-acf3-ea12bcf2f2af",
"metadata": {},
"source": [
"Load the specified dataset and split it into training and testing datasets using an 80-20 split. We further split the testing dataset into testing and validation datasets using a 50-50 split. Finally, we combine these split datasets into a single DatasetDict for easy access and management."
"Split the dataset loaded into training and testing datasets using an 80-20 split. We further split the testing dataset into testing and validation datasets using a 50-50 split. Finally, we combine these split datasets into a single DatasetDict for easy access and management."
]
},
{
@@ -180,9 +205,6 @@
"metadata": {},
"outputs": [],
"source": [
"# Load the dataset using the specified dataset name\n",
"dataset = load_dataset(dataset_name) \n",
"\n",
"# Split the loaded dataset into training and testing datasets with a 80-20 split\n",
"train_testvalid = dataset['train'].train_test_split(test_size=0.2)\n",
"\n",
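The remainder of this cell is collapsed in the diff; below is a minimal sketch of the 80-20 / 50-50 split and the combined DatasetDict described in the markdown cell above, assuming the Hugging Face datasets API and the dataset object loaded earlier (the split names are illustrative):

# Hypothetical sketch, not part of this PR's diff.
from datasets import DatasetDict, load_dataset

dataset = load_dataset("OpenAssistant/oasst1")

# 80-20 split of the original training data.
train_testvalid = dataset['train'].train_test_split(test_size=0.2)

# Split the held-out 20% in half: 10% test, 10% validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Combine the splits into a single DatasetDict for easy access and management.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['train'],
    'valid': test_valid['test'],
})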
38 changes: 38 additions & 0 deletions openfl/utilities/utils.py
@@ -115,6 +115,44 @@ def validate_file_hash(file_path, expected_hash, chunk_size=8192):
if h.hexdigest() != expected_hash:
raise SystemError('ZIP File hash doesn\'t match expected file hash.')

def validate_folder_hash(folder_path, expected_hash, chunk_size=8192):
"""Validate SHA384 hash for all files in the specified folder.

This function validates the SHA384 hash of all files in a folder against an expected hash.

Args:
        folder_path (str): The path (absolute or relative to the current
            working directory) of the folder to validate.
expected_hash (str): The expected SHA384 hash of all files in the folder.
chunk_size (int, optional): The size of the chunks to read from the file. Defaults to 8192.

Raises:
SystemError: If the hash of the files does not match the expected hash.
        FileNotFoundError: If the folder does not exist.
NotADirectoryError: If the path is not a directory.
"""
# Check if the folder exists
if not os.path.exists(folder_path):
raise FileNotFoundError(f"The folder {folder_path} does not exist.")

# Check if the path is a folder
if not os.path.isdir(folder_path):
raise NotADirectoryError(f"The path {folder_path} is not a directory.")

h = hashlib.sha384()
for root, _, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # Use a distinct handle name so the loop variable isn't shadowed.
            with open(file_path, 'rb') as f:
                # Reading is buffered, so we can read smaller chunks.
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break
                    h.update(chunk)

if h.hexdigest() != expected_hash:
raise SystemError('Folder hash doesn\'t match expected hash.')

def tqdm_report_hook():
"""Visualize downloading."""
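For reference, a minimal sketch of how the expected folder hash pinned in the notebook could be produced and then checked with the new helper; the traversal mirrors validate_folder_hash, the cache path is the one configured in the notebook, and the printed digest is simply whatever your local copy of the dataset hashes to. Note that the combined digest depends on os.walk traversal order, so it should be generated and verified on comparable directory layouts.

# Hypothetical sketch, not part of this PR's diff.
import hashlib
import os

from openfl.utilities import validate_folder_hash

def folder_sha384(folder_path, chunk_size=8192):
    """Compute a combined SHA384 digest over every file under folder_path."""
    h = hashlib.sha384()
    for root, _, files in os.walk(folder_path):
        for name in files:
            with open(os.path.join(root, name), 'rb') as f:
                while chunk := f.read(chunk_size):
                    h.update(chunk)
    return h.hexdigest()

datapath = './files/DOWNLOADED_DATASETS_PATH'
expected = folder_sha384(datapath)        # record this value once, then pin it in the notebook
validate_folder_hash(datapath, expected)  # raises SystemError on mismatch
print('Verification passed')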