
Commit

Merge branch 'main' into kdestin/add-30-minute-timeout
diondrapeck authored Jan 18, 2024
2 parents 7ef4154 + 37a4dd2 commit 171f22d
Showing 58 changed files with 6,289 additions and 83 deletions.
6 changes: 5 additions & 1 deletion .github/CODEOWNERS
@@ -218,6 +218,7 @@
/cli/setup.sh @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/cli/train.sh @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/deploy-arm-templates-az-cli.sh @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/assets/environment/environment.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/assets/model/model.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/endpoints/batch/deploy-models/custom-outputs-parquet/custom-output-batch.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/endpoints/batch/deploy-models/heart-classifier-mlflow/mlflow-for-batch-tabular.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
@@ -243,7 +244,10 @@
/sdk/python/featurestore_sample/notebooks/sdk_only/2.\ Experiment\ and\ train\ models\ using\ features.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/featurestore_sample/notebooks/sdk_only/3.\ Enable\ recurrent\ materialization\ and\ run\ batch\ inference.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/featurestore_sample/notebooks/sdk_only/4.\ Enable\ online\ store\ and\ run\ online\ inference.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/featurestore_sample/notebooks/sdk_only/5.\ Develop\ a\ feature\ set\ with\ custom\ source.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/foundation-models/benchmarking/evaluating_claude_models.ipynb @arun-rajora
/sdk/python/foundation-models/benchmarking/evaluating_oai_models.ipynb @arun-rajora
/sdk/python/foundation-models/benchmarking/evaluation_pipelines/* @arun-rajora
/sdk/python/jobs/automl-standalone-jobs/automl-classification-task-bankmarketing/automl-classification-task-bankmarketing-serverless.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/jobs/automl-standalone-jobs/automl-image-object-detection-task-fridge-items/automl-image-object-detection-task-fridge-items.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
/sdk/python/jobs/configuration.ipynb @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1
11 changes: 11 additions & 0 deletions cli/endpoints/online/managed/sample/blue-deployment-azureml.yml
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: blue
endpoint_name: my-endpoint
model:
  path: ../../model-1/model/
code_configuration:
  code: ../../model-1/onlinescoring/
  scoring_script: score.py
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/17
instance_type: Standard_DS3_v2
instance_count: 1
11 changes: 11 additions & 0 deletions cli/endpoints/online/managed/sample/green-deployment-azureml.yml
@@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green
endpoint_name: my-endpoint
model:
  path: ../../model-2/model/
code_configuration:
  code: ../../model-2/onlinescoring/
  scoring_script: score.py
environment: azureml://registries/azureml/environments/sklearn-1.1/versions/17
instance_type: Standard_DS3_v2
instance_count: 1
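These two sample YAML files define "blue" and "green" managed online deployments for the same endpoint; they differ only in the model and scoring-code folders they point at, which is the usual pattern for blue/green rollouts. As a rough, untested sketch (the subscription, resource group, and workspace values are placeholders, not part of this commit), the blue deployment could equivalently be created with the Python SDK v2:

from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    CodeConfiguration,
    Model,
)

# Placeholder workspace details -- replace with your own values.
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace>",
)

# Endpoint shared by the blue and green deployments.
endpoint = ManagedOnlineEndpoint(name="my-endpoint", auth_mode="key")
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# SDK counterpart of blue-deployment-azureml.yml.
blue = ManagedOnlineDeployment(
    name="blue",
    endpoint_name="my-endpoint",
    model=Model(path="../../model-1/model/"),
    code_configuration=CodeConfiguration(
        code="../../model-1/onlinescoring/", scoring_script="score.py"
    ),
    environment="azureml://registries/azureml/environments/sklearn-1.1/versions/17",
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
ml_client.online_deployments.begin_create_or_update(blue).result()

# Send all traffic to blue; a deployment built from green-deployment-azureml.yml
# could later take over a share of it.
endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

With the CLI, the same files would typically be passed to az ml online-deployment create --file <deployment yml> once the endpoint exists.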
@@ -16,7 +16,7 @@ inputs:
jobs:
  pipeline_finetune:
    type: pipeline
    component: azureml://registries/azureml/components/openai_completions_finetune_pipeline/versions/0.1.1
    component: azureml://registries/azureml/components/openai_completions_finetune_pipeline/versions/0.1.2
    inputs:
      train_dataset:
        type: uri_folder
@@ -36,6 +36,7 @@ def init():
global g_logger
global g_file_loader_dictionary
global aacs_client
global aacs_enabled

g_logger = logging.getLogger("azureml")
g_logger.setLevel(logging.INFO)
@@ -60,10 +61,15 @@ def init():
".pqt": load_parquet,
}

endpoint = os.environ.get("CONTENT_SAFETY_ENDPOINT")
key = os.environ.get("CONTENT_SAFETY_KEY")
endpoint = os.environ.get("CONTENT_SAFETY_ENDPOINT", None)
key = os.environ.get("CONTENT_SAFETY_KEY", None)
# Create a Content Safety client
aacs_client = ContentSafetyClient(endpoint, AzureKeyCredential(key))
if endpoint is not None and key is not None:
    aacs_client = ContentSafetyClient(endpoint, AzureKeyCredential(key))
    aacs_enabled = True
else:
    aacs_enabled = False
    g_logger.warn("Azure AI Content Safety (aacs) is disabled.")


def get_input_schema(model_path):
@@ -372,13 +378,18 @@ def run(batch_input):
predict_result = []
try:
    aacs_threshold = int(os.environ.get("CONTENT_SAFETY_THRESHOLD", default=1))
    blocked_input = analyze_data(
        batch_input, aacs_threshold, blocked_input=None, is_input=True
    )
    if aacs_enabled:
        blocked_input = analyze_data(
            batch_input, aacs_threshold, blocked_input=None, is_input=True
        )
    predict_result = g_model.predict(batch_input)
    _ = analyze_data(
        predict_result, aacs_threshold, blocked_input=blocked_input, is_input=False
    )
    if aacs_enabled:
        _ = analyze_data(
            predict_result,
            aacs_threshold,
            blocked_input=blocked_input,
            is_input=False,
        )
except Exception as e:
    g_logger.error("Processing mini batch failed with exception: " + str(e))
    g_logger.error(traceback.format_exc())
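The net effect of this change is that Azure AI Content Safety (AACS) screening becomes opt-in: init() only builds a ContentSafetyClient when both CONTENT_SAFETY_ENDPOINT and CONTENT_SAFETY_KEY are set, and run() skips the analyze_data calls when aacs_enabled is False. A minimal sketch for exercising the script locally (the endpoint URL format and threshold value here are illustrative assumptions, not taken from this commit):

import os

# With both variables set, init() creates the ContentSafetyClient and run()
# screens inputs and predictions against the configured severity threshold.
os.environ["CONTENT_SAFETY_ENDPOINT"] = "https://<your-content-safety-resource>.cognitiveservices.azure.com/"
os.environ["CONTENT_SAFETY_KEY"] = "<content-safety-key>"
os.environ["CONTENT_SAFETY_THRESHOLD"] = "2"  # optional; the script defaults to 1

# Leaving CONTENT_SAFETY_ENDPOINT and CONTENT_SAFETY_KEY unset makes init()
# set aacs_enabled = False, so run() calls g_model.predict() without screening.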
@@ -52,7 +52,7 @@ else
fi

# 4. Submit a sample request to endpoint
data_path="./text_to_image_batch_data/batch_data"
data_path="./text_to_image_batch_data"
python utils/prepare_data.py --payload-path $data_path --mode "batch"
# Path where the processed csvs are dumped. This is the input to the endpoint
processed_data_path="./text_to_image_batch_data/processed_batch_data"
@@ -73,11 +73,32 @@ az ml batch-endpoint create --name $endpoint_name $workspace_info || {
echo "endpoint create failed"; exit 1;
}

# create an environment for batch deployment

environment_name="text-to-image-model-env"
environment_label="latest"

if ! az ml environment show --name $environment_name --label $environment_label $workspace_info
then
    echo "Environment $environment_name:$environment_label does not exist in Workspace."
    echo "---Creating environment---"
    az ml environment create --name $environment_name --build-context "./scoring-files/docker_env" \
        $workspace_info || {
        echo "environment create failed"; exit 1;
    }
    exit 1
fi

environment_version=$(az ml environment show --name $environment_name --label $environment_label $workspace_info --query version --output tsv)

# deploy model from registry to endpoint in workspace
az ml batch-deployment create --file batch-deploy.yml $workspace_info --set \
  endpoint_name=$endpoint_name \
  name=$deployment_name \
  compute=$deployment_compute \
  environment=azureml:$environment_name:$environment_version \
  code_configuration.code="scoring-files/score" \
  code_configuration.scoring_script="score_batch.py" \
  model=azureml://registries/$registry_name/models/$model_name/versions/$model_version || {
  echo "deployment create failed"; exit 1;
}
@@ -94,7 +94,7 @@
"source": [
"# Get a handle to registry\n",
"ml_client = MLClient(\n",
" credential=credential, registry_name=\"azureml\", registry_location=\"northcentralus\"\n",
" credential=credential, registry_name=\"azureml\", region=\"northcentralus\"\n",
")"
]
},
@@ -115,7 +115,7 @@
"outputs": [],
"source": [
"finetune_pipeline = load_component(\n",
" client=ml_client, name=\"openai_completions_finetune_pipeline\", version=\"0.1.1\"\n",
" client=ml_client, name=\"openai_completions_finetune_pipeline\", version=\"0.1.2\"\n",
")"
]
},
@@ -161,7 +161,7 @@
"\n",
"\n",
"# Construct pipeline\n",
"@pipeline()\n",
"@pipeline(name=\"test-run\", compute=\"serverless\")\n",
"def pipeline_with_registered_component(\n",
" train_dataset,\n",
" validation_dataset,\n",
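Taken together, the YAML pipeline and the notebook excerpt above move the registered openai_completions_finetune_pipeline component from version 0.1.1 to 0.1.2 and give the notebook pipeline an explicit name and serverless compute. Pieced together from the fragments shown in this diff, a hedged sketch of the notebook flow might look as follows; only the two dataset inputs visible above are wired up, and any remaining component inputs (not shown in this excerpt) would still need values:

from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient, load_component
from azure.ai.ml.dsl import pipeline

credential = DefaultAzureCredential()

# Handle to the azureml registry, as in the notebook cell above.
ml_client = MLClient(
    credential=credential, registry_name="azureml", region="northcentralus"
)

# Load version 0.1.2 of the registered fine-tuning pipeline component.
finetune_pipeline = load_component(
    client=ml_client, name="openai_completions_finetune_pipeline", version="0.1.2"
)

# Construct the pipeline with an explicit name and serverless compute.
@pipeline(name="test-run", compute="serverless")
def pipeline_with_registered_component(train_dataset, validation_dataset):
    finetune_pipeline(
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
    )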
@@ -0,0 +1,216 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c90af50b-a3a3-42ea-b855-943777df2bee",
"metadata": {},
"source": [
"# Evaluating Claude models with AzureML\n",
"\n",
"In this notebook, you will learn how to run evaluations of Anthropic's Claude model using the AzureML SDK. Along with this notebook, we've included a preconfigured set of 12 evaluations using well-known, public datasets (e.g., MMLU, HellaSwag, Winogrande).\n",
"\n",
"Please see the [Azure AI Leaderboard](https://ai.azure.com/explore/leaderboard) for other supported model benchmarks and for more details on the eval datasets.\n",
"\n",
"*Disclaimer: This notebook has been tested against AWS Bedrock endpoints for Claude 2.1. Other deployments or model versions are not guaranteed to work with the evaluation pipelines distributed with this notebook.* \n",
"\n",
"## Prerequistes\n",
"- An Azure account with an active subscription - [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F)\n",
"- An Azure ML workspace - [Configure workspace](../../configuration.ipynb)\n",
"- Installed Azure Machine Learning Python SDK v2 - [install instructions](../../../README.md) - check the getting started section\n",
"- A python environment with [mlflow](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-configure-tracking?view=azureml-api-2&tabs=python%2Cmlflow) for retrieving eval metrics\n",
"- Access keys for Claude endpoints on [Amazon Web Services Bedrock platform](https://aws.amazon.com/bedrock/claude/) \n",
"\n",
"## Configuring a Workspace connection for Bedrock access\n",
"You will use a Workspace connection to securely store Bedrock access keys. Follow the steps below to create a custom-type connection:\n",
"- Follow directions for [creating a custom connection in the AzureML studio UI](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/tools-reference/python-tool?view=azureml-api-2#create-a-custom-connection)\n",
"- Add the following two key-value pairs to the custom connection:\n",
" 1. A key named `AccessKey` with a value containing your AWS access key\n",
" 2. A key named `SecretKey` with a value containing your AWS secret access key \n",
"\n",
"## Configuring and running an evaluation pipeline\n",
"Please set global values in the following cell for your AzureML Workspace, the Bedrock endpoint you want to call, the name of connection you created in the previous step, and the name of the eval you want to run.\n",
"\n",
"Supported evals are the following: `boolq`, `gsm8k`, `hellaswag`, `human_eval`, `mmlu_humanities`, `mmlu_other`, `mmlu_social_sciences`, `mmlu_stem`, `openbookqa`, `piqa`, `social_iqa`, `winogrande`.\n",
"\n",
"Note that evaluation pipelines automatically download relevant datasets from public sources. For `human_eval`, models are prompted to generate Python code that is exectued in the pipeline to measure coding capabilities of the model.\n",
"\n",
"You can also set the sample ratio, the fraction of the selected dataset to run for the eval.\n",
"\n",
"**Warning**: Many datasets contain thousands of examples which can lead to high endpoint usage costs. We advise starting with a small sample ratio (e.g., 1%) to verify the pipeline and then increasing the ratio if desired. Note that benchmark metrics obtained with small sample ratios may not be comparable between different models. Please use sample_ratio=1 for model comparisons."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce3f8a4-ce03-462b-89fa-19cac827c30c",
"metadata": {},
"outputs": [],
"source": [
"# AzureML settings - please fill in your values\n",
"subscription_id = \"<Azure subscription ID>\"\n",
"resource_group = \"<Resource group>\"\n",
"workspace_name = \"<Workspace name\"\n",
"experiment_name = \"<Experiment name>\"\n",
"\n",
"# Eval to run - you can change this to any of the 12 supported eval names\n",
"# Supported evals: boolq, gsm8k, hellaswag, human_eval, mmlu_humanities, mmlu_other, mmlu_social_sciences, mmlu_stem, openbookqa, piqa, social_iqa, winogrande\n",
"eval_name = \"mmlu_humanities\"\n",
"\n",
"# Bedrock URL - defaults to Claude 2.1 in us-east-1\n",
"bedrock_endpoint_url = (\n",
" \"https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2:1/invoke\"\n",
")\n",
"\n",
"# Name of the connection in your Workspace storing AWS access keys\n",
"connection_name = \"<Connection name>\"\n",
"\n",
"# Sample ratio - what fraction of the dataset to run for the eval?\n",
"# **WARNING** be aware of endpoint costs!\n",
"sample_ratio = 0.01"
]
},
{
"cell_type": "markdown",
"id": "2cdbe9be-df54-4035-b7a6-f6f8677f16dc",
"metadata": {},
"source": [
"Run the following cell to get an `MLClient` for communicating with your Workspace:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25f47da9-5c31-4729-a233-cece290415f9",
"metadata": {},
"outputs": [],
"source": [
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.ml import MLClient\n",
"\n",
"# client for AzureML Workspace actions\n",
"ml_client = MLClient(\n",
" credential=DefaultAzureCredential(),\n",
" subscription_id=subscription_id,\n",
" resource_group_name=resource_group,\n",
" workspace_name=workspace_name,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "2e4bfaa2-fd99-4850-a9c4-8838c4ef9313",
"metadata": {},
"source": [
"The code in the next cell launches the evaluation pipeline job using [serverless compute](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-serverless-compute) by default. You can optionally [create your own compute cluster](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster) and use it to execute the job."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e4c5468-7585-481e-bb20-ad91373cac6c",
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml import load_job\n",
"\n",
"# load the pipeline from the yaml def\n",
"pipeline_job = load_job(f\"./evaluation_pipelines/claude-2_1/{eval_name}.yaml\")\n",
"\n",
"# Set pipeline job inputs\n",
"pipeline_job.inputs.endpoint_url = bedrock_endpoint_url\n",
"pipeline_job.inputs.ws_connection_name = connection_name\n",
"pipeline_job.inputs.sample_ratio = sample_ratio\n",
"\n",
"# Optionally use your own compute cluster\n",
"# pipeline_job.settings.default_compute = \"<Your compute cluster name>\"\n",
"\n",
"# Start the job in the Workspace\n",
"returned_job = ml_client.jobs.create_or_update(\n",
" pipeline_job, experiment_name=experiment_name\n",
")\n",
"returned_job"
]
},
{
"cell_type": "markdown",
"id": "095005d9-1deb-4097-a718-4059a335ec09",
"metadata": {},
"source": [
"Run the next cell to stream the job. Notebook execution will be paused until the job finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1376aa46-725d-4d6c-8725-3a2d4d52bdfc",
"metadata": {},
"outputs": [],
"source": [
"# Wait until the job completes\n",
"ml_client.jobs.stream(returned_job.name)"
]
},
{
"cell_type": "markdown",
"id": "07fa2c79-f875-4c4f-8f09-37158b56bdcd",
"metadata": {},
"source": [
"## Retrieve accuracy scores from the run\n",
"When the pipeline finishes, you can retrieve evaluation metrics from the run via mlflow. The primary measure of accuracy for the evals is `mean_exact_match`, with the exception of human_eval which uses `pass@1`. \n",
"\n",
"Mean exact match is the proportion of model predictions that exactly match the corresponding correct answers. Thus, it is applicable to question answering evaluations that are multiple choice or have a single, correct answer. The pass@1 metric is used for evaluating code generation and is the proportion of model generated code solutions that pass a set of unit tests given in the eval dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "299bf8ea-49ab-44c3-96e6-51902fa03df4",
"metadata": {},
"outputs": [],
"source": [
"import mlflow\n",
"\n",
"accuracy_metric_name = \"mean_exact_match\" if eval_name != \"human_eval\" else \"pass@1\"\n",
"\n",
"mlflow_tracking_uri = ml_client.workspaces.get(\n",
" ml_client.workspace_name\n",
").mlflow_tracking_uri\n",
"mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
"\n",
"run = mlflow.get_run(run_id=returned_job.name)\n",
"metric_val = run.data.metrics[accuracy_metric_name]\n",
"\n",
"if sample_ratio < 1.0:\n",
" print(\n",
" f\"**Warning** sample_ratio is {sample_ratio}. Use sample_ratio=1.0 when comparing metrics between models.\"\n",
" )\n",
"\n",
"print(f\"Eval: {eval_name}\")\n",
"print(f\"Sample ratio: {sample_ratio}\")\n",
"print(f\"Accuracy metric name: {accuracy_metric_name}\")\n",
"print(f\"Accuracy metric value: {metric_val}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
