Commit
Merge branch 'main' into kdestin/add-30-minute-timeout
Showing 58 changed files with 6,289 additions and 83 deletions.
cli/endpoints/online/managed/sample/blue-deployment-azureml.yml (11 additions & 0 deletions)
```yaml
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: blue
endpoint_name: my-endpoint
model:
  path: ../../model-1/model/
code_configuration:
  code: ../../model-1/onlinescoring/
  scoring_script: score.py
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/17
instance_type: Standard_DS3_v2
instance_count: 1
```
cli/endpoints/online/managed/sample/green-deployment-azureml.yml (11 additions & 0 deletions)
```yaml
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green
endpoint_name: my-endpoint
model:
  path: ../../model-2/model/
code_configuration:
  code: ../../model-2/onlinescoring/
  scoring_script: score.py
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/17
instance_type: Standard_DS3_v2
instance_count: 1
```
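The two specs differ only in the deployment name and the model paths. For readers who prefer the Python SDK over the CLI, the following is a minimal sketch (not part of this commit) of creating the `blue` deployment with the `azure-ai-ml` SDK v2; it assumes the endpoint `my-endpoint` already exists and that the relative paths resolve the same way they do for the YAML file:

```python
# Minimal sketch, not part of this commit: create the "blue" deployment from
# Python with the azure-ai-ml SDK v2 instead of the YAML + CLI route.
# Assumes the endpoint "my-endpoint" already exists and that the relative
# paths below resolve exactly as they do for the YAML file.
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import CodeConfiguration, ManagedOnlineDeployment, Model

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<Azure subscription ID>",
    resource_group_name="<Resource group>",
    workspace_name="<Workspace name>",
)

blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name="my-endpoint",
    model=Model(path="../../model-1/model/"),
    code_configuration=CodeConfiguration(
        code="../../model-1/onlinescoring/", scoring_script="score.py"
    ),
    environment="azureml://registries/azureml/environments/sklearn-1.1/versions/17",
    instance_type="Standard_DS3_v2",
    instance_count=1,
)

# begin_create_or_update returns a poller; .result() blocks until the
# deployment is provisioned.
ml_client.online_deployments.begin_create_or_update(blue_deployment).result()
```

The `green` deployment would be identical apart from its name and the `model-2` paths.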
3 files renamed without changes.
sdk/python/foundation-models/benchmarking/evaluating_claude_models.ipynb (216 additions & 0 deletions)
```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c90af50b-a3a3-42ea-b855-943777df2bee",
   "metadata": {},
   "source": [
    "# Evaluating Claude models with AzureML\n",
    "\n",
    "In this notebook, you will learn how to run evaluations of Anthropic's Claude models using the AzureML SDK. Along with this notebook, we've included a preconfigured set of 12 evaluations using well-known, public datasets (e.g., MMLU, HellaSwag, Winogrande).\n",
    "\n",
    "Please see the [Azure AI Leaderboard](https://ai.azure.com/explore/leaderboard) for other supported model benchmarks and for more details on the eval datasets.\n",
    "\n",
    "*Disclaimer: This notebook has been tested against AWS Bedrock endpoints for Claude 2.1. Other deployments or model versions are not guaranteed to work with the evaluation pipelines distributed with this notebook.*\n",
    "\n",
    "## Prerequisites\n",
    "- An Azure account with an active subscription - [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F)\n",
    "- An Azure ML workspace - [Configure workspace](../../configuration.ipynb)\n",
    "- Azure Machine Learning Python SDK v2 installed - [install instructions](../../../README.md) - check the getting started section\n",
    "- A Python environment with [mlflow](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-configure-tracking?view=azureml-api-2&tabs=python%2Cmlflow) for retrieving eval metrics\n",
    "- Access keys for Claude endpoints on the [Amazon Web Services Bedrock platform](https://aws.amazon.com/bedrock/claude/)\n",
    "\n",
    "## Configuring a Workspace connection for Bedrock access\n",
    "You will use a Workspace connection to securely store Bedrock access keys. Follow the steps below to create a custom-type connection:\n",
    "- Follow the directions for [creating a custom connection in the AzureML studio UI](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/tools-reference/python-tool?view=azureml-api-2#create-a-custom-connection)\n",
    "- Add the following two key-value pairs to the custom connection:\n",
    "  1. A key named `AccessKey` with a value containing your AWS access key\n",
    "  2. A key named `SecretKey` with a value containing your AWS secret access key\n",
    "\n",
    "## Configuring and running an evaluation pipeline\n",
    "Please set global values in the following cell for your AzureML Workspace, the Bedrock endpoint you want to call, the name of the connection you created in the previous step, and the name of the eval you want to run.\n",
    "\n",
    "The supported evals are: `boolq`, `gsm8k`, `hellaswag`, `human_eval`, `mmlu_humanities`, `mmlu_other`, `mmlu_social_sciences`, `mmlu_stem`, `openbookqa`, `piqa`, `social_iqa`, `winogrande`.\n",
    "\n",
    "Note that evaluation pipelines automatically download relevant datasets from public sources. For `human_eval`, models are prompted to generate Python code that is executed in the pipeline to measure the model's coding capabilities.\n",
    "\n",
    "You can also set the sample ratio, the fraction of the selected dataset to run for the eval.\n",
    "\n",
    "**Warning**: Many datasets contain thousands of examples, which can lead to high endpoint usage costs. We advise starting with a small sample ratio (e.g., 1%) to verify the pipeline and then increasing the ratio if desired. Note that benchmark metrics obtained with small sample ratios may not be comparable between different models. Please use `sample_ratio=1` for model comparisons."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ce3f8a4-ce03-462b-89fa-19cac827c30c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# AzureML settings - please fill in your values\n",
    "subscription_id = \"<Azure subscription ID>\"\n",
    "resource_group = \"<Resource group>\"\n",
    "workspace_name = \"<Workspace name>\"\n",
    "experiment_name = \"<Experiment name>\"\n",
    "\n",
    "# Eval to run - you can change this to any of the 12 supported eval names\n",
    "# Supported evals: boolq, gsm8k, hellaswag, human_eval, mmlu_humanities, mmlu_other, mmlu_social_sciences, mmlu_stem, openbookqa, piqa, social_iqa, winogrande\n",
    "eval_name = \"mmlu_humanities\"\n",
    "\n",
    "# Bedrock URL - defaults to Claude 2.1 in us-east-1\n",
    "bedrock_endpoint_url = (\n",
    "    \"https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2:1/invoke\"\n",
    ")\n",
    "\n",
    "# Name of the connection in your Workspace storing AWS access keys\n",
    "connection_name = \"<Connection name>\"\n",
    "\n",
    "# Sample ratio - what fraction of the dataset to run for the eval?\n",
    "# **WARNING** be aware of endpoint costs!\n",
    "sample_ratio = 0.01"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cdbe9be-df54-4035-b7a6-f6f8677f16dc",
   "metadata": {},
   "source": [
    "Run the following cell to get an `MLClient` for communicating with your Workspace:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f47da9-5c31-4729-a233-cece290415f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from azure.identity import DefaultAzureCredential\n",
    "from azure.ai.ml import MLClient\n",
    "\n",
    "# Client for AzureML Workspace actions\n",
    "ml_client = MLClient(\n",
    "    credential=DefaultAzureCredential(),\n",
    "    subscription_id=subscription_id,\n",
    "    resource_group_name=resource_group,\n",
    "    workspace_name=workspace_name,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e4bfaa2-fd99-4850-a9c4-8838c4ef9313",
   "metadata": {},
   "source": [
    "The code in the next cell launches the evaluation pipeline job using [serverless compute](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-serverless-compute) by default. You can optionally [create your own compute cluster](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster) and use it to execute the job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e4c5468-7585-481e-bb20-ad91373cac6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from azure.ai.ml import load_job\n",
    "\n",
    "# Load the pipeline from the YAML definition\n",
    "pipeline_job = load_job(f\"./evaluation_pipelines/claude-2_1/{eval_name}.yaml\")\n",
    "\n",
    "# Set pipeline job inputs\n",
    "pipeline_job.inputs.endpoint_url = bedrock_endpoint_url\n",
    "pipeline_job.inputs.ws_connection_name = connection_name\n",
    "pipeline_job.inputs.sample_ratio = sample_ratio\n",
    "\n",
    "# Optionally use your own compute cluster\n",
    "# pipeline_job.settings.default_compute = \"<Your compute cluster name>\"\n",
    "\n",
    "# Start the job in the Workspace\n",
    "returned_job = ml_client.jobs.create_or_update(\n",
    "    pipeline_job, experiment_name=experiment_name\n",
    ")\n",
    "returned_job"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "095005d9-1deb-4097-a718-4059a335ec09",
   "metadata": {},
   "source": [
    "Run the next cell to stream the job logs. Notebook execution will pause until the job finishes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1376aa46-725d-4d6c-8725-3a2d4d52bdfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wait until the job completes\n",
    "ml_client.jobs.stream(returned_job.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07fa2c79-f875-4c4f-8f09-37158b56bdcd",
   "metadata": {},
   "source": [
    "## Retrieve accuracy scores from the run\n",
    "When the pipeline finishes, you can retrieve evaluation metrics from the run via mlflow. The primary measure of accuracy for the evals is `mean_exact_match`, with the exception of `human_eval`, which uses `pass@1`.\n",
    "\n",
    "Mean exact match is the proportion of model predictions that exactly match the corresponding correct answers. Thus, it is applicable to question-answering evaluations that are multiple choice or have a single correct answer. The `pass@1` metric is used for evaluating code generation and is the proportion of model-generated code solutions that pass a set of unit tests given in the eval dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "299bf8ea-49ab-44c3-96e6-51902fa03df4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import mlflow\n",
    "\n",
    "accuracy_metric_name = \"mean_exact_match\" if eval_name != \"human_eval\" else \"pass@1\"\n",
    "\n",
    "mlflow_tracking_uri = ml_client.workspaces.get(\n",
    "    ml_client.workspace_name\n",
    ").mlflow_tracking_uri\n",
    "mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
    "\n",
    "run = mlflow.get_run(run_id=returned_job.name)\n",
    "metric_val = run.data.metrics[accuracy_metric_name]\n",
    "\n",
    "if sample_ratio < 1.0:\n",
    "    print(\n",
    "        f\"**Warning** sample_ratio is {sample_ratio}. Use sample_ratio=1.0 when comparing metrics between models.\"\n",
    "    )\n",
    "\n",
    "print(f\"Eval: {eval_name}\")\n",
    "print(f\"Sample ratio: {sample_ratio}\")\n",
    "print(f\"Accuracy metric name: {accuracy_metric_name}\")\n",
    "print(f\"Accuracy metric value: {metric_val}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
```
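A side note for anyone adapting the notebook: `run.data.metrics` in the final cell is a plain dictionary, so the same MLflow run exposes every metric the pipeline logged, not just the headline accuracy. The following is a minimal sketch (not part of this commit) under the notebook's own assumptions, i.e., `ml_client` and `returned_job` exist and the job has finished:

```python
# Minimal sketch, not part of this commit: print every metric MLflow logged
# for the finished pipeline job, not just the headline accuracy metric.
# Assumes `ml_client` and `returned_job` exist as defined in the notebook.
import mlflow

mlflow.set_tracking_uri(
    ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri
)

run = mlflow.get_run(run_id=returned_job.name)
for metric_name, metric_value in sorted(run.data.metrics.items()):
    print(f"{metric_name}: {metric_value}")
```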