From 6c51ba8389cb63d1af097c51dfd6f9417f28ab9b Mon Sep 17 00:00:00 2001 From: Arpit Jasapara <87999496+arpitjasa-db@users.noreply.github.com> Date: Mon, 5 Feb 2024 22:35:58 -0800 Subject: [PATCH] Rename assets to resources (#145) * Rename assets to resources * Add hyphen check to schema name --- README.md | 28 +++--- databricks_template_schema.json | 8 +- stack-customization.md | 14 +-- template/update_layout.tmpl | 4 +- .../.azure/devops-pipelines/README.md.tmpl | 2 +- ....input_project_name}}-bundle-cicd.yml.tmpl | 12 +-- .../{{.input_project_name}}-tests-ci.yml.tmpl | 2 +- .../.github/workflows/README.md.tmpl | 2 +- ...put_project_name}}-bundle-cd-prod.yml.tmpl | 4 +- ..._project_name}}-bundle-cd-staging.yml.tmpl | 4 +- ...{{.input_project_name}}-bundle-ci.yml.tmpl | 4 +- template/{{.input_root_dir}}/README.md.tmpl | 50 +++++------ .../docs/ml-pull-request.md.tmpl | 2 +- .../docs/mlops-setup.md.tmpl | 44 ++++----- .../README.md.tmpl | 48 +++++----- .../databricks.yml.tmpl | 6 +- .../deployment/batch_inference/README.md.tmpl | 2 +- .../notebooks/BatchInference.py.tmpl | 2 +- .../feature_engineering/README.md.tmpl | 2 +- .../GenerateAndWriteFeatures.py.tmpl | 2 +- .../{assets => resources}/README.md.tmpl | 90 +++++++++---------- ...atch-inference-workflow-resource.yml.tmpl} | 2 +- ...re-engineering-workflow-resource.yml.tmpl} | 4 +- .../ml-artifacts-resource.yml.tmpl} | 0 .../model-workflow-resource.yml.tmpl} | 10 +-- .../monitoring-workflow-resource.yml.tmpl} | 0 .../training/README.md.tmpl | 8 +- .../training/notebooks/Train.py.tmpl | 2 +- .../notebooks/TrainWithFeatureStore.py.tmpl | 2 +- .../notebooks/TrainWithMLflowRecipes.py.tmpl | 2 +- .../validation/README.md.tmpl | 2 +- .../notebooks/ModelValidation.py.tmpl | 2 +- tests/utils.py | 4 +- 33 files changed, 185 insertions(+), 185 deletions(-) rename template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets => resources}/README.md.tmpl (74%) rename 
template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets/batch-inference-workflow-asset.yml.tmpl => resources/batch-inference-workflow-resource.yml.tmpl} (94%) rename template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets/feature-engineering-workflow-asset.yml.tmpl => resources/feature-engineering-workflow-resource.yml.tmpl} (93%) rename template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets/ml-artifacts-asset.yml.tmpl => resources/ml-artifacts-resource.yml.tmpl} (100%) rename template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets/model-workflow-asset.yml.tmpl => resources/model-workflow-resource.yml.tmpl} (92%) rename template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/{assets/monitoring-workflow-asset.yml.tmpl => resources/monitoring-workflow-resource.yml.tmpl} (100%) diff --git a/README.md b/README.md index e4140dee..7530eff1 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This repo provides a customizable stack for starting new ML projects on Databricks that follow production best-practices out of the box. -Using Databricks MLOps Stacks, data scientists can quickly get started iterating on ML code for new projects while ops engineers set up CI/CD and ML assets +Using Databricks MLOps Stacks, data scientists can quickly get started iterating on ML code for new projects while ops engineers set up CI/CD and ML resources management, with an easy transition to production. You can also use MLOps Stacks as a building block in automation for creating new data science projects with production-grade CI/CD pre-configured. 
The default stack in this repo includes three modular components: @@ -13,14 +13,14 @@ The default stack in this repo includes three modular components: | Component | Description | Why it's useful | |-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [ML Code](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/) | Example ML project structure ([training](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/training) and [batch inference](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/deployment/batch_inference), etc), with unit tested Python modules and notebooks | Quickly iterate on ML problems, without worrying about refactoring your code into tested modules for productionization later on. | -| [ML Assets as Code](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/assets) | ML pipeline assets ([training](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/assets/model-workflow-asset.yml.tmpl) and [batch inference](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/assets/batch-inference-workflow-asset.yml.tmpl) jobs, etc) defined through [databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-cli.html) | Govern, audit, and deploy changes to your ML assets (e.g. "use a larger instance type for automated model retraining") through pull requests, rather than adhoc changes made via UI. 
| -| CI/CD([GitHub Actions](template/{{.input_root_dir}}/.github/) or [Azure DevOps](template/{{.input_root_dir}}/.azure/)) | [GitHub Actions](https://docs.github.com/en/actions) or [Azure DevOps](https://azure.microsoft.com/en-us/products/devops) workflows to test and deploy ML code and assets | Ship ML code faster and with confidence: ensure all production changes are performed through automation and that only tested code is deployed to prod | +| [ML Resources as Code](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/resources) | ML pipeline resources ([training](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/resources/model-workflow-resource.yml.tmpl) and [batch inference](template/{{.input_root_dir}}/{{template%20`project_name_alphanumeric_underscore`%20.}}/resources/batch-inference-workflow-resource.yml.tmpl) jobs, etc) defined through [databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-cli.html) | Govern, audit, and deploy changes to your ML resources (e.g. "use a larger instance type for automated model retraining") through pull requests, rather than adhoc changes made via UI. | +| CI/CD([GitHub Actions](template/{{.input_root_dir}}/.github/) or [Azure DevOps](template/{{.input_root_dir}}/.azure/)) | [GitHub Actions](https://docs.github.com/en/actions) or [Azure DevOps](https://azure.microsoft.com/en-us/products/devops) workflows to test and deploy ML code and resources | Ship ML code faster and with confidence: ensure all production changes are performed through automation and that only tested code is deployed to prod | See the [FAQ](#FAQ) for questions on common use cases. ## ML pipeline structure and development loops -An ML solution comprises data, code, and models. These assets need to be developed, validated (staging), and deployed (production). 
In this repository, we use the notion of dev, staging, and prod to represent the execution +An ML solution comprises data, code, and models. These resources need to be developed, validated (staging), and deployed (production). In this repository, we use the notion of dev, staging, and prod to represent the execution environments of each stage. An instantiated project from MLOps Stacks contains an ML pipeline with CI/CD workflows to test and deploy automated model training and batch inference jobs across your dev, staging, and prod Databricks workspaces. @@ -85,17 +85,17 @@ Others must be correctly specified for CI/CD to work: to enable them to view and debug CI test results * ``input_databricks_prod_workspace_host``: URL of production Databricks workspace. We encourage granting data scientists working on the current ML project non-admin (read) access to this workspace, to enable them to view production job status and see job logs to debug failures. - * ``input_default_branch``: Name of the default branch, where the prod and staging ML assets are deployed from and the latest ML code is staged. + * ``input_default_branch``: Name of the default branch, where the prod and staging ML resources are deployed from and the latest ML code is staged. * ``input_release_branch``: Name of the release branch. The production jobs (model training, batch inference) defined in this repo pull ML code from this branch. Or used for project initialization: * ``input_project_name``: name of the current project - * ``input_read_user_group``: User group name to give READ permissions to for project assets (ML jobs, integration test job runs, and machine learning assets). A group with this name must exist in both the staging and prod workspaces. Defaults to "users", which grants read permission to all users in the staging/prod workspaces. You can specify a custom group name e.g. to restrict read permissions to members of the team working on the current ML project. 
+ * ``input_read_user_group``: User group name to give READ permissions to for project resources (ML jobs, integration test job runs, and machine learning resources). A group with this name must exist in both the staging and prod workspaces. Defaults to "users", which grants read permission to all users in the staging/prod workspaces. You can specify a custom group name e.g. to restrict read permissions to members of the team working on the current ML project. * ``input_include_models_in_unity_catalog``: If selected, models will be registered to [Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog). Models will be registered under a three-level namespace of `<catalog>.<schema_name>.<model_name>`, according to the target environment in which the model registration code is executed. Thus, if model registration code runs in the `prod` environment, the model will be registered to the `prod` catalog under the namespace `prod.<schema_name>.<model_name>`. This assumes that the respective catalogs exist in Unity Catalog (e.g. `dev`, `staging` and `prod` catalogs). Target environment names, and catalogs to be used are defined in the Databricks bundles files, and can be updated as needed. * ``input_schema_name``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), specify the name of the schema under which the models should be registered, but we recommend keeping the name the same as the project name. We default to using the same `schema_name` across catalogs, thus this schema must exist in each catalog used. For example, the training pipeline when executed in the staging environment will register the model to `staging.<schema_name>.<model_name>`, whereas the same pipeline executed in the prod environment will register the model to `prod.<schema_name>.<model_name>`. Also, be sure that the service principals in each respective environment have the right permissions to access this schema, which would be `USE_CATALOG`, `USE_SCHEMA`, `MODIFY`, `CREATE_MODEL`, and `CREATE_TABLE`. 
* ``input_unity_catalog_read_user_group``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), define the name of the user group to grant `EXECUTE` (read & use model) privileges for the registered model. Defaults to "account users". - * ``input_include_feature_store``: If selected, will provide [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) stack components including: project structure and sample feature Python modules, feature engineering notebooks, ML asset configs to provision and manage Feature Store jobs, and automated integration tests covering feature engineering and training. + * ``input_include_feature_store``: If selected, will provide [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) stack components including: project structure and sample feature Python modules, feature engineering notebooks, ML resource configs to provision and manage Feature Store jobs, and automated integration tests covering feature engineering and training. * ``input_include_mlflow_recipes``: If selected, will provide [MLflow Recipes](https://mlflow.org/docs/latest/recipes.html) stack components, dividing the training pipeline into configurable steps and profiles. See the generated ``README.md`` for next steps! @@ -116,20 +116,20 @@ production model serving endpoints. However, you can create a single workspace stack, by supplying the same workspace URL for `input_databricks_staging_workspace_host` and `input_databricks_prod_workspace_host`. If you go this route, we -recommend using different service principals to manage staging vs prod assets, -to ensure that CI workloads run in staging cannot interfere with production assets. +recommend using different service principals to manage staging vs prod resources, +to ensure that CI workloads run in staging cannot interfere with production resources. 
### I have an existing ML project. Can I productionize it using MLOps Stacks? Yes. Currently, you can instantiate a new project and copy relevant components into your existing project to productionize it. MLOps Stacks is modularized, so -you can e.g. copy just the GitHub Actions workflows under `.github` or ML asset configs - under ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets`` +you can e.g. copy just the GitHub Actions workflows under `.github` or ML resource configs + under ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources`` and ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml`` into your existing project. ### Can I adopt individual components of MLOps Stacks? For this use case, we recommend instantiating via [Databricks asset bundle templates](https://docs.databricks.com/en/dev-tools/bundles/templates.html) -and copying the relevant subdirectories. For example, all ML asset configs -are defined under ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets`` +and copying the relevant subdirectories. For example, all ML resource configs +are defined under ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources`` and ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml``, while CI/CD is defined e.g. under `.github` if using GitHub Actions, or under `.azure` if using Azure DevOps. @@ -142,7 +142,7 @@ for details on how to do this. ### Does the MLOps Stacks cover data (ETL) pipelines? Since MLOps Stacks is based on [databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-commands.html), -it's not limited only to ML workflows and assets - it works for assets across the Databricks Lakehouse. For instance, while the existing ML +it's not limited only to ML workflows and resources - it works for resources across the Databricks Lakehouse. 
For instance, while the existing ML code samples contain feature engineering, training, model validation, deployment and batch inference workflows, you can use it for Delta Live Tables pipelines as well. diff --git a/databricks_template_schema.json b/databricks_template_schema.json index 3e6c34d9..03e3fa2c 100644 --- a/databricks_template_schema.json +++ b/databricks_template_schema.json @@ -92,7 +92,7 @@ "order": 8, "type": "string", "default": "main", - "description": "\nName of the default branch, where the prod and staging ML assets are deployed from and the latest ML code is staged. Default", + "description": "\nName of the default branch, where the prod and staging ML resources are deployed from and the latest ML code is staged. Default", "skip_prompt_if": { "properties": { "input_setup_cicd_and_project": { @@ -118,7 +118,7 @@ "order": 10, "type": "string", "default": "users", - "description": "\nUser group name to give READ permissions to for project assets (ML jobs, integration test job runs, and machine learning assets). A group with this name must exist in both the staging and prod workspaces. Default", + "description": "\nUser group name to give READ permissions to for project resources (ML jobs, integration test job runs, and machine learning resources). A group with this name must exist in both the staging and prod workspaces. Default", "skip_prompt_if": { "properties": { "input_setup_cicd_and_project": { @@ -146,8 +146,8 @@ "type": "string", "description": "\nName of schema to use when registering a model in Unity Catalog. \nNote that this schema must already exist, and we recommend keeping the name the same as the project name as well as giving the service principals the right access. 
Default", "default": "{{ .input_project_name }}", - "pattern": "^[^ .\\/]*$", - "pattern_match_failure_message": "Valid schema names cannot contain any of the following characters: \" \", \".\", \"\\\", \"/\"", + "pattern": "^[^ .\\-\\/]*$", + "pattern_match_failure_message": "Valid schema names cannot contain any of the following characters: \" \", \".\", \"-\", \"\\\", \"/\"", "skip_prompt_if": { "anyOf":[ { diff --git a/stack-customization.md b/stack-customization.md index 20a20987..ab3c3c31 100644 --- a/stack-customization.md +++ b/stack-customization.md @@ -51,7 +51,7 @@ MLOps Stacks provides example ML code. You may want to customize the example code, e.g. further prune it down into a skeleton for data scientists to fill out. -If you customize this component, you can still use the CI/CD and ML asset components to build production ML pipelines, as long as you provide ML +If you customize this component, you can still use the CI/CD and ML resource components to build production ML pipelines, as long as you provide ML notebooks with the expected interface. For example, model training under ``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/`` and inference under ``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/``. See code comments in the notebook files for the expected interface & behavior of these notebooks. @@ -63,18 +63,18 @@ MLOps Stacks currently has the following sub-components for CI/CD: * Logic to trigger model deployment through REST API calls to your CD system, when model training completes. This logic is currently captured in ``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py`` -### ML asset configs -Root ML asset config file can be found as ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml``. 
-It defines the ML config assets to be included and workspace host for each deployment target. +### ML resource configs +Root ML resource config file can be found as ``{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml``. +It defines the ML config resources to be included and workspace host for each deployment target. -ML asset configs (databricks CLI bundles code definitions of ML jobs, experiments, models etc) can be found under -``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets``, along with docs. +ML resource configs (databricks CLI bundles code definitions of ML jobs, experiments, models etc) can be found under +``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources``, along with docs. You can update this component to customize the default ML pipeline structure for new ML projects in your organization, e.g. add additional model inference jobs or modify the default instance type used in ML jobs. When updating this component, you may want to update developer-facing docs in -``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/README.md``. +``template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md``. 
### Docs After making customizations, make any changes needed to diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index 9b62b724..cb6bf698 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -35,7 +35,7 @@ {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `assets/feature-engineering-workflow-asset.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} # Remove Delta and MLflow Recipes code in cases of Feature Store. {{ else if (eq .input_include_feature_store `yes`) }} # delta_paths @@ -72,7 +72,7 @@ {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `assets/feature-engineering-workflow-asset.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} {{ end }} # Remove utils if using Models in Unity Catalog diff --git a/template/{{.input_root_dir}}/.azure/devops-pipelines/README.md.tmpl b/template/{{.input_root_dir}}/.azure/devops-pipelines/README.md.tmpl index a04a3bf6..711e0254 100644 --- a/template/{{.input_root_dir}}/.azure/devops-pipelines/README.md.tmpl +++ b/template/{{.input_root_dir}}/.azure/devops-pipelines/README.md.tmpl @@ -1,7 
+1,7 @@ # CI/CD Workflow Definitions This directory contains CI/CD workflow definitions using [Azure DevOps Pipelines](https://azure.microsoft.com/en-gb/products/devops/pipelines/), under ``devops-pipelines``. These workflows cover testing and deployment of both ML code (for model training, batch inference, etc) and -Databricks ML asset definitions. +Databricks ML resource definitions. To set up CI/CD for a new project, please refer to [Setting up CI/CD](<../../README.md#Setting up CI/CD>) and following the [MLOps Setup Guide](../../docs/mlops-setup.md#Steps). diff --git a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-bundle-cicd.yml.tmpl b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-bundle-cicd.yml.tmpl index 93936cd1..f5e09964 100644 --- a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-bundle-cicd.yml.tmpl +++ b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-bundle-cicd.yml.tmpl @@ -1,9 +1,9 @@ -# This Azure Pipeline validates and deploys bundle config (ML asset config and more) -# defined under {{template `project_name_alphanumeric_underscore` .}}/assets/* +# This Azure Pipeline validates and deploys bundle config (ML resource config and more) +# defined under {{template `project_name_alphanumeric_underscore` .}}/resources/* # and {{template `project_name_alphanumeric_underscore` .}}/databricks.yml. # The bundle is validated (CI) upon making a PR against the {{ .input_default_branch }} branch. -# Bundle assets defined for staging are deployed when a PR is merged into the {{ .input_default_branch }} branch. -# Bundle assets defined for prod are deployed when a PR is merged into the {{ .input_release_branch }} branch. +# Bundle resources defined for staging are deployed when a PR is merged into the {{ .input_default_branch }} branch. 
+# Bundle resources defined for prod are deployed when a PR is merged into the {{ .input_release_branch }} branch. trigger: branches: @@ -102,7 +102,7 @@ stages: # Run StagingBundleCD stage after successfully merging into the {{ .input_default_branch }} branch - stage: StagingBundleCD displayName: 'Staging bundle deployment for {{ .input_project_name }}' - # Trigger deployment of bundle assets when PRs are merged into the {{ .input_default_branch }} branch + # Trigger deployment of bundle resources when PRs are merged into the {{ .input_default_branch }} branch condition: | and( eq(variables['Build.SourceBranch'], 'refs/heads/{{ .input_default_branch }}'), @@ -161,7 +161,7 @@ stages: # Run prod bundle CD stage after successfully merging into the {{ .input_release_branch }} branch - stage: prodBundleCD displayName: 'Prod bundle deployment for {{ .input_project_name }}' - # Trigger deployment of Bundle assets when PRs are merged into the {{ .input_release_branch }} branch + # Trigger deployment of Bundle resources when PRs are merged into the {{ .input_release_branch }} branch condition: | and( eq(variables['Build.SourceBranch'], 'refs/heads/{{ .input_release_branch }}'), diff --git a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl index 337ef636..364ae74b 100644 --- a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl +++ b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl @@ -2,7 +2,7 @@ # This pipeline is triggered upon making a PR against the {{ .input_default_branch }} branch. # Unit tests are defined under {{template `project_name_alphanumeric_underscore` .}}/tests # and are executed on the Azure Pipelines agent. 
-# The integration test deploys and runs the model_training_job defined in {{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml +# The integration test deploys and runs the model_training_job defined in {{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml # This integration test is run in the staging workspace, as defined under {{template `project_name_alphanumeric_underscore` .}}/databricks.yml trigger: diff --git a/template/{{.input_root_dir}}/.github/workflows/README.md.tmpl b/template/{{.input_root_dir}}/.github/workflows/README.md.tmpl index f910d72b..d31eda00 100644 --- a/template/{{.input_root_dir}}/.github/workflows/README.md.tmpl +++ b/template/{{.input_root_dir}}/.github/workflows/README.md.tmpl @@ -1,7 +1,7 @@ # CI/CD Workflow Definitions This directory contains CI/CD workflow definitions using [GitHub Actions](https://docs.github.com/en/actions), under ``workflows``. These workflows cover testing and deployment of both ML code (for model training, batch inference, etc) and -Databricks ML asset definitions. +Databricks ML resource definitions. 
To set up CI/CD for a new project, please refer to [Setting up CI/CD](<../../README.md#Setting up CI/CD>) and following the [MLOps Setup Guide](../../docs/mlops-setup.md#Steps) diff --git a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-prod.yml.tmpl b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-prod.yml.tmpl index da8b737d..a5072d40 100644 --- a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-prod.yml.tmpl +++ b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-prod.yml.tmpl @@ -1,5 +1,5 @@ -# This GitHub workflow deploys Bundle assets (ML asset config and more) -# defined under {{template `project_name_alphanumeric_underscore` .}}/assets/* +# This GitHub workflow deploys Bundle resources (ML resource config and more) +# defined under {{template `project_name_alphanumeric_underscore` .}}/resources/* # and {{template `project_name_alphanumeric_underscore` .}}/databricks.yml with prod deployment target configs, # when PRs are merged into the release branch name: Bundle Deployment for {{ .input_project_name }} Prod diff --git a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-staging.yml.tmpl b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-staging.yml.tmpl index 9e239c22..e84a9384 100644 --- a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-staging.yml.tmpl +++ b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-cd-staging.yml.tmpl @@ -1,5 +1,5 @@ -# This GitHub workflow deploys Bundle assets (ML asset config and more) -# defined under {{template `project_name_alphanumeric_underscore` .}}/assets/* +# This GitHub workflow deploys Bundle resources (ML resource config and more) +# defined under {{template `project_name_alphanumeric_underscore` .}}/resources/* # and {{template 
`project_name_alphanumeric_underscore` .}}/databricks.yml with staging deployment target configs, # when PRs are merged into the default branch name: Bundle Deployment for {{ .input_project_name }} Staging diff --git a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-ci.yml.tmpl b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-ci.yml.tmpl index 55c1fbe0..67b78399 100644 --- a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-ci.yml.tmpl +++ b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-bundle-ci.yml.tmpl @@ -1,5 +1,5 @@ -# This GitHub workflow validates Bundle config (ML asset config and more) -# defined under {{template `project_name_alphanumeric_underscore` .}}/assets/* +# This GitHub workflow validates Bundle config (ML resource config and more) +# defined under {{template `project_name_alphanumeric_underscore` .}}/resources/* # and {{template `project_name_alphanumeric_underscore` .}}/databricks.yml, when PRs are merged into the main branch name: Bundle validation for {{ .input_project_name }} diff --git a/template/{{.input_root_dir}}/README.md.tmpl b/template/{{.input_root_dir}}/README.md.tmpl index 60bee0f1..9630c1a7 100644 --- a/template/{{.input_root_dir}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/README.md.tmpl @@ -17,9 +17,9 @@ This project contains the following components: |----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} | ML Code | Example ML project code, with unit tested Python modules and notebooks | -| ML Assets as Code | ML pipeline assets 
(training and batch inference jobs with schedules, etc) configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}) | +| ML Resources as Code | ML pipeline resources (training and batch inference jobs with schedules, etc) configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}) | {{ end }} -| CI/CD | {{ if (eq .input_cicd_platform `github_actions`) }}[GitHub Actions](https://github.com/actions) workflows to test and deploy ML code and assets {{ else if (eq .input_cicd_platform `azure_devops`) }}[Azure DevOps Pipelines](https://azure.microsoft.com/en-gb/products/devops/pipelines/) to test and deploy ML code and assets{{ end }} | +| CI/CD | {{ if (eq .input_cicd_platform `github_actions`) }}[GitHub Actions](https://github.com/actions) workflows to test and deploy ML code and resources {{ else if (eq .input_cicd_platform `azure_devops`) }}[Azure DevOps Pipelines](https://azure.microsoft.com/en-gb/products/devops/pipelines/) to test and deploy ML code and resources{{ end }} | contained in the following files: @@ -27,11 +27,11 @@ contained in the following files: {{ .input_root_dir }} <- Root directory. Both monorepo and polyrepo are supported. │ {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} -├── {{template `project_name_alphanumeric_underscore` .}} <- Contains python code, notebooks and ML assets related to one ML project. +├── {{template `project_name_alphanumeric_underscore` .}} <- Contains python code, notebooks and ML resources related to one ML project. │ │ │ ├── requirements.txt <- Specifies Python dependencies for ML code (for example: model training, batch inference). │ │ -│ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. 
It defines the bundle name, workspace URL and asset config component to be included. +│ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. │ │ {{ if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) }} │ ├── training <- Training folder contains Notebook that trains and registers the model. @@ -49,15 +49,15 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including the modules under `features`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. │ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ else if (eq .input_include_feature_store `yes`) }} │ ├── training <- Training folder contains Notebook that trains and registers the model with feature store support. 
│ │ @@ -78,17 +78,17 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including the modules under `features`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. │ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── feature-engineering-workflow-asset.yml <- ML asset config definition for feature engineering workflow +│ ├── feature-engineering-workflow-resource.yml <- ML resource config definition for feature engineering workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ else }} │ ├── training <- Folder for model development via MLflow recipes. │ │ │ @@ -118,22 +118,22 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including modules under `steps`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. 
│ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ end }} {{ end }} │ {{ if or (eq .input_cicd_platform `github_actions`) (eq .input_cicd_platform `github_actions_for_github_enterprise_servers`) }} -├── .github <- Configuration folder for CI/CD using GitHub Actions. {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} The CI/CD workflows deploy ML assets defined in the `./assets/*` folder with databricks CLI bundles.{{ end }} +├── .github <- Configuration folder for CI/CD using GitHub Actions. {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} The CI/CD workflows deploy ML resources defined in the `./resources/*` folder with databricks CLI bundles.{{ end }} {{ else if (eq .input_cicd_platform `azure_devops`) }} -├── .azure <- Configuration folder for CI/CD using Azure DevOps Pipelines. {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} The CI/CD workflows deploy ML assets defined in the `./assets/*` folder with databricks CLI bundles.{{ end }} +├── .azure <- Configuration folder for CI/CD using Azure DevOps Pipelines. 
{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} The CI/CD workflows deploy ML resources defined in the `./resources/*` folder with databricks CLI bundles.{{ end }} {{ end }} │ ├── docs <- Contains documentation for the repo. @@ -158,15 +158,15 @@ pipelines, ask your ops team to follow the [MLOps setup guide](docs/mlops-setup. production ML pipelines. After that, follow the [ML pull request guide](docs/ml-pull-request.md) -{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} and [ML asset config guide]({{template `project_name_alphanumeric_underscore` .}}/assets/README.md) {{ end }} to propose, test, and deploy changes to production ML code (e.g. update model parameters) -or pipeline assets (e.g. use a larger instance type for model training) via pull request. +{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} and [ML resource config guide]({{template `project_name_alphanumeric_underscore` .}}/resources/README.md) {{ end }} to propose, test, and deploy changes to production ML code (e.g. update model parameters) +or pipeline resources (e.g. use a larger instance type for model training) via pull request. | Role | Goal | Docs | |-------------------------------|------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Data Scientist | Get started writing ML code for a brand new project | {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[project README](./{{template `project_name_alphanumeric_underscore` .}}/README.md) {{ else }} README when project is initialized {{ end }}| | MLOps / DevOps | Set up CI/CD for the current ML project | [MLOps setup guide](docs/mlops-setup.md) | | Data Scientist | Update production ML code (e.g. 
model training logic) for an existing project | [ML pull request guide](docs/ml-pull-request.md) | -| Data Scientist | Modify production model ML assets, e.g. model training or inference jobs | {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[ML asset config guide]({{template `project_name_alphanumeric_underscore` .}}/assets/README.md) {{ else }} ML assets README when project is initialized {{ end }} | +| Data Scientist | Modify production model ML resources, e.g. model training or inference jobs | {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[ML resource config guide]({{template `project_name_alphanumeric_underscore` .}}/resources/README.md) {{ else }} ML resources README when project is initialized {{ end }} | ## Setting up CI/CD This stack comes with a workflow to set up CI/CD for projects that can be found in diff --git a/template/{{.input_root_dir}}/docs/ml-pull-request.md.tmpl b/template/{{.input_root_dir}}/docs/ml-pull-request.md.tmpl index 20f294c8..3345a25d 100644 --- a/template/{{.input_root_dir}}/docs/ml-pull-request.md.tmpl +++ b/template/{{.input_root_dir}}/docs/ml-pull-request.md.tmpl @@ -3,7 +3,7 @@ [(back to main README)](../README.md) **NOTE**: This page assumes that your MLOps team has already configured CI/CD and deployed initial -ML assets, per the [MLOps setup guide](mlops-setup.md). +ML resources, per the [MLOps setup guide](mlops-setup.md). 
## Table of contents * [Opening a pull request](#opening-a-pull-request) diff --git a/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl b/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl index 2fbcd72c..a7f56c52 100644 --- a/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl +++ b/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl @@ -14,12 +14,12 @@ * [Create release branch](#create-release-branch) {{ end -}} {{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} -* [Deploy ML assets and enable production jobs](#deploy-ml-assets-and-enable-production-jobs){{ end }} +* [Deploy ML resources and enable production jobs](#deploy-ml-resources-and-enable-production-jobs){{ end }} * [Next steps](#next-steps) ## Intro This page explains how to productionize the current project, setting up CI/CD and -ML asset deployment, and deploying ML training and inference jobs. +ML resource deployment, and deploying ML training and inference jobs. After following this guide, data scientists can follow the [ML Pull Request](ml-pull-request.md) guide to make changes to ML code or deployed jobs. @@ -37,7 +37,7 @@ git remote add upstream Commit the current `README.md` file and other docs to the `{{ .input_default_branch }}` branch of the repo, to enable forking the repo: ``` {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} -git add README.md docs .gitignore {{template `project_name_alphanumeric_underscore` .}}/assets/README.md +git add README.md docs .gitignore {{template `project_name_alphanumeric_underscore` .}}/resources/README.md git commit -m "Adding project README" {{ else }} git add . 
@@ -59,7 +59,7 @@ git push upstream {{ .input_default_branch }} ### Set up authentication for CI/CD #### Set up Service Principal {{ if eq .input_cloud `azure` }} -To authenticate and manage ML assets created by CI/CD, +To authenticate and manage ML resources created by CI/CD, [service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for the project should be created and added to both staging and prod workspaces. Follow [Add a service principal to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#--add-a-service-principal-to-your-azure-databricks-account")) }}) @@ -69,7 +69,7 @@ for details. For your convenience, we also have Terraform modules that can be used to [create](https://registry.terraform.io/modules/databricks/mlops-azure-project-with-sp-creation/databricks/latest) or [link](https://registry.terraform.io/modules/databricks/mlops-azure-project-with-sp-linking/databricks/latest) service principals. {{ else if eq .input_cloud `aws` }} -To authenticate and manage ML assets created by CI/CD, +To authenticate and manage ML resources created by CI/CD, [service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for the project should be created and added to both staging and prod workspaces. 
Follow [Add a service principal to your Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account")) }}) @@ -83,7 +83,7 @@ For your convenience, we also have a [Terraform module](https://registry.terrafo If the created project uses **Unity Catalog**, we expect a catalog to exist with the name of the deployment target by default. For example, if the deployment target is dev, we expect a catalog named dev to exist in the workspace. If you want to use different catalog names, please update the targets declared in the {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[{{ .input_project_name }}/databricks.yml](../{{template `project_name_alphanumeric_underscore` .}}/databricks.yml) -and [{{ .input_project_name }}/assets/ml-artifacts-asset.yml](../{{template `project_name_alphanumeric_underscore` .}}/assets/ml-artifacts-asset.yml) {{ else }} `databricks.yml` and `assets/ml-artifacts-asset.yml` {{ end }} files. +and [{{ .input_project_name }}/resources/ml-artifacts-resource.yml](../{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml) {{ else }} `databricks.yml` and `resources/ml-artifacts-resource.yml` {{ end }} files. If changing the staging, prod, or test deployment targets, you'll need to update the workflows located in the .github/workflows directory. The SP must have proper permission in each respective environment and the catalog for the environments. @@ -134,7 +134,7 @@ Be sure to update the [Workflow Permissions](https://docs.github.com/en/actions/ ### Setting up CI/CD workflows After setting up authentication for CI/CD, you can now set up CI/CD workflows. We provide a [Deploy CICD workflow](../.github/workflows/deploy-cicd.yml) that can be used to generate the other CICD workflows mentioned below for projects. 
-This workflow is manually triggered with `project_name` as parameter. This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy ML assets and run ML jobs in the staging and prod workspaces. +This workflow is manually triggered with `project_name` as a parameter. This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy ML resources and run ML jobs in the staging and prod workspaces. These workflows will be defined under `.github/workflows`. {{ else if (eq .input_cicd_platform `azure_devops`) -}} @@ -153,15 +153,15 @@ Project-Specific pipelines: - **[CI]** Performs unit and integration tests
- Triggered on PR to main - **`{{ .input_project_name }}-bundle-cicd.yml`**:
- - **[CI]** Performs validation of Databricks assets defined under `{{template `project_name_alphanumeric_underscore` .}}/assets`
+ - **[CI]** Performs validation of Databricks resources defined under `{{template `project_name_alphanumeric_underscore` .}}/resources`
- Triggered on PR to main
- - **[CD]** Deploys Databricks assets to the staging workspace
+ - **[CD]** Deploys Databricks resources to the staging workspace
- Triggered on merging into main
- - **[CD]** Deploys Databricks assets to the prod workspace
+ - **[CD]** Deploys Databricks resources to the prod workspace
- Triggered on merging into release > Note that these workflows are provided as example CI/CD workflows, and can be easily modified to match your preferred CI/CD order of operations. -Within the CI/CD pipelines defined under `.azure/devops-pipelines`, we will be deploying Databricks assets to the defined staging and prod workspaces using the `databricks` CLI. This requires setting up authentication between the `databricks` CLI and Databricks. By default we show how to authenticate with service principals by passing [secret variables from a variable group](https://learn.microsoft.com/en-us/azure/devops/pipelines/scripts/cli/pipeline-variable-group-secret-nonsecret-variables?view=azure-devops). In a production setting it is recommended to either use an [Azure Key Vault](https://learn.microsoft.com/en-us/azure/devops/pipelines/release/azure-key-vault?view=azure-devops&tabs=yaml) to store these secrets, or alternatively use [Azure service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml). We describe below how you can adapt the project Pipelines to leverage service connections. Let's add these. +Within the CI/CD pipelines defined under `.azure/devops-pipelines`, we will be deploying Databricks resources to the defined staging and prod workspaces using the `databricks` CLI. This requires setting up authentication between the `databricks` CLI and Databricks. By default we show how to authenticate with service principals by passing [secret variables from a variable group](https://learn.microsoft.com/en-us/azure/devops/pipelines/scripts/cli/pipeline-variable-group-secret-nonsecret-variables?view=azure-devops). 
In a production setting it is recommended to either use an [Azure Key Vault](https://learn.microsoft.com/en-us/azure/devops/pipelines/release/azure-key-vault?view=azure-devops&tabs=yaml) to store these secrets, or alternatively use [Azure service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml). We describe below how you can adapt the project Pipelines to leverage service connections. Let's add these. ``` git add .azure @@ -184,12 +184,12 @@ By default, we provide Azure Pipelines where authentication is done using servic #### Steps: {{ if (eq .input_cloud `azure`) }} -1. Create two service principals - one to be used for deploying and running staging assets, and one to be used for deploying and running production assets. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. +1. Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. 1. [Add the staging and production service principals to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace, and production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details. 1. 
Follow ['Get Azure AD tokens for the service principals']({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/api/latest/aad/service-prin-aad-token")) }}) to get your service principal credentials (tenant id, application id, and client secret) for both the staging and prod service principals. You will use these credentials as variables in the project Azure Pipelines. {{ else }} -1. Create two service principals - one to be used for deploying and running staging assets, and one to be used for deploying and running production assets. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details on how to create a service principal. +1. Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details on how to create a service principal. 1. [Add the staging and production service principals to your Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace, and production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details. 1. 
Follow ['Get tokens for the service principals']({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#manage-personal-access-tokens-for-a-service-principal")) }}) to get your service principal token for both the staging and prod service principals. You will use the token as variables in the project Azure Pipelines. @@ -225,12 +225,12 @@ In the case of a monorepo, where there are multiple projects under a single repo - Permissions to create Azure DevOps Pipelines in your Azure DevOps project. See the following [Azure DevOps prerequisites](https://learn.microsoft.com/en-us/azure/devops/organizations/security/about-permissions). - Permissions to create Azure DevOps build policies. See the following [prerequisites](https://learn.microsoft.com/azure/devops/repos/git/branch-policies). -The ultimate aim of the service connection approach is to use two separate service connections, authenticated with a staging service principal and a production service principal, to deploy and run assets in the respective Azure Databricks workspaces. Taking this approach then negates the need to read client secrets or client IDs from the CI/CD pipelines. +The ultimate aim of the service connection approach is to use two separate service connections, authenticated with a staging service principal and a production service principal, to deploy and run resources in the respective Azure Databricks workspaces. Taking this approach then negates the need to read client secrets or client IDs from the CI/CD pipelines. #### Steps: -1. Create two service principals - one to be used for deploying and running staging assets, and one to be used for deploying and running production assets. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. +1. 
Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. 1. [Add the staging and production service principals to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace, and production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details. -1. [Create two Azure Resource Manager service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection) - one to be used to deploy to staging Databricks assets, the other for production assets. Each of these service connections should be authenticated with the respective staging and production service principals created in the prior step. +1. [Create two Azure Resource Manager service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection) - one to be used to deploy to staging Databricks resources, the other for production resources. Each of these service connections should be authenticated with the respective staging and production service principals created in the prior step. 1. 
Update pipeline YAML files to use service connections rather than pipeline variables: - First, remove any lines where the environment variables are set in tasks in `{{ .input_project_name }}-tests-ci.yml` or `{{ .input_project_name }}-bundle-cicd.yml` files. Specifically, any lines where the following env vars are used: `PROD_AZURE_SP_TENANT_ID`, `PROD_AZURE_SP_APPLICATION_ID`, `PROD_AZURE_SP_CLIENT_SECRET`, `STAGING_AZURE_SP_TENANT_ID`, `STAGING_AZURE_SP_APPLICATION_ID`, `STAGING_AZURE_SP_CLIENT_SECRET` - Then, add the following AzureCLI task prior to installing the `databricks` cli in any of the pipeline jobs: @@ -251,7 +251,7 @@ The ultimate aim of the service connection approach is to use two separate servi echo "##vso[task.setvariable variable=ARM_TENANT_ID]${tenantId}" echo "##vso[task.setvariable variable=ARM_SUBSCRIPTION_ID]${subscription_id}" ``` - > Note that you will have to update this code snippet with the respective service connection names, depending on which Databricks workspace you are deploying assets to. + > Note that you will have to update this code snippet with the respective service connection names, depending on which Databricks workspace you are deploying resources to. 1. Create separate Azure Pipelines under your Azure DevOps project using the ‘Existing Azure Pipelines YAML file’ option. Create one pipeline for each script. See [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/create-first-pipeline) for more details on creating Azure Pipelines. 6. Define [build validation branch policies](https://learn.microsoft.com/en-us/azure/devops/repos/git/branch-policies?view=azure-devops&tabs=browser#build-validation) for the `{{ .input_default_branch }}` branch using the Azure build pipelines created in step 1. This is required so that any PR changes to the `{{ .input_default_branch }}` must build successfully before PRs can complete. 
@@ -260,7 +260,7 @@ In the case of a monorepo, where there are multiple projects under a single repo ### Setting up CI/CD workflows After setting up authentication for CI/CD, you can now set up CI/CD workflows. We provide a [Deploy CICD workflow](../.azure/devops-pipelines/deploy-cicd.yml) that can be used to generate the other CICD workflows mentioned below for projects. -This workflow is manually triggered with `project_name` as parameter. This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy ML assets and run ML jobs in the staging and prod workspaces. +This workflow is manually triggered with `project_name` as parameter. This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy ML resources and run ML jobs in the staging and prod workspaces. These workflows will be defined under `.azure/devops-pipelines`. After generating these workflows, be sure to go through the above workflow-specific steps again to add the appropriate build branch policies and filters. {{ end }} @@ -306,14 +306,14 @@ git push upstream {{ .input_release_branch }} git checkout {{ .input_default_branch }} ``` -Your production jobs (model training, batch inference) will pull ML code against this branch, while your staging jobs will pull ML code against the `{{ .input_default_branch }}` branch. Note that the `{{ .input_default_branch }}` branch will be the source of truth for ML asset configs and CI/CD workflows. +Your production jobs (model training, batch inference) will pull ML code against this branch, while your staging jobs will pull ML code against the `{{ .input_default_branch }}` branch. Note that the `{{ .input_default_branch }}` branch will be the source of truth for ML resource configs and CI/CD workflows. 
For future ML code changes, iterate against the `{{ .input_default_branch }}` branch and regularly deploy your ML code from staging to production by merging code changes from the `{{ .input_default_branch }}` branch into the `{{ .input_release_branch }}` branch. {{ end -}} {{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} -## Deploy ML assets and enable production jobs -Follow the instructions in [{{ .input_project_name }}/assets/README.md](../{{template `project_name_alphanumeric_underscore` .}}/assets/README.md) to deploy ML assets +## Deploy ML resources and enable production jobs +Follow the instructions in [{{ .input_project_name }}/resources/README.md](../{{template `project_name_alphanumeric_underscore` .}}/resources/README.md) to deploy ML resources and production jobs. {{- end }} @@ -321,5 +321,5 @@ and production jobs. After you configure CI/CD and deploy training & inference pipelines, notify data scientists working on the current project. They should now be able to follow the [ML pull request guide](ml-pull-request.md) and -{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[ML asset config guide](../{{template `project_name_alphanumeric_underscore` .}}/assets/README.md){{- end }} to propose, test, and deploy +{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[ML resource config guide](../{{template `project_name_alphanumeric_underscore` .}}/resources/README.md){{- end }} to propose, test, and deploy ML code and pipeline changes to production. 
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl index 7eecae89..15ae96d6 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl @@ -22,18 +22,18 @@ This project contains the following components: | Component | Description | |----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ML Code | Example ML project code, with unit tested Python modules and notebooks | -| ML Assets as Code | ML pipeline assets (training and batch inference jobs with schedules, etc) configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}) | +| ML Resources as Code | ML pipeline resources (training and batch inference jobs with schedules, etc) configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}) | contained in the following files: ``` {{ .input_root_dir }} <- Root directory. Both monorepo and polyrepo are supported. │ -├── {{template `project_name_alphanumeric_underscore` .}} <- Contains python code, notebooks and ML assets related to one ML project. +├── {{template `project_name_alphanumeric_underscore` .}} <- Contains python code, notebooks and ML resources related to one ML project. 
│ │ │ ├── requirements.txt <- Specifies Python dependencies for ML code (for example: model training, batch inference). │ │ -│ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and asset config component to be included. +│ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. │ │ {{ if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) -}} │ ├── training <- Training folder contains Notebook that trains and registers the model. @@ -51,15 +51,15 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including the modules under `features`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. 
│ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ else if (eq .input_include_feature_store `yes`) -}} │ ├── training <- Training folder contains Notebook that trains and registers the model with feature store support. │ │ @@ -80,17 +80,17 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including the modules under `features`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. 
│ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── feature-engineering-workflow-asset.yml <- ML asset config definition for feature engineering workflow +│ ├── feature-engineering-workflow-resource.yml <- ML resource config definition for feature engineering workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ else -}} │ ├── training <- Folder for model development via MLflow recipes. │ │ │ @@ -120,15 +120,15 @@ contained in the following files: │ │ │ ├── tests <- Unit tests for the ML project, including modules under `steps`. │ │ -│ ├── assets <- ML asset (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. 
│ │ -│ ├── model-workflow-asset.yml <- ML asset config definition for model training, validation, deployment workflow +│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow │ │ -│ ├── batch-inference-workflow-asset.yml <- ML asset config definition for batch inference workflow +│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow │ │ -│ ├── ml-artifacts-asset.yml <- ML asset config definition for model and experiment +│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ -│ ├── monitoring-workflow-asset.yml <- ML asset config definition for data monitoring workflow +│ ├── monitoring-workflow-resource.yml <- ML resource config definition for data monitoring workflow {{ end -}} ``` @@ -152,18 +152,18 @@ See the example modules' documentation for more information. To adapt this sample code for your use case, implement your own feature module, specifying configs such as input Delta tables/dataset path(s) to use when developing the feature engineering pipelines. 1. Implement your feature module, address TODOs in `feature_engineering/features` and create unit test in `tests/feature_engineering` -2. Update `assets/feature-engineering-workflow-asset.yml`. Fill in notebook parameters for `write_feature_table_job`. -3. Update training data path in `assets/model-workflow-asset.yml`. +2. Update `resources/feature-engineering-workflow-resource.yml`. Fill in notebook parameters for `write_feature_table_job`. +3. Update training data path in `resources/model-workflow-resource.yml`. We expect most of the development to take place in the `feature_engineering` folder. 
{{ end }} ## Iterating on ML code -### Deploy ML code and assets to dev workspace using Bundles +### Deploy ML code and resources to dev workspace using Bundles -Refer to [Local development and dev workspace](./assets/README.md#local-development-and-dev-workspace) -to use databricks CLI bundles to deploy ML code together with ML asset configs to dev workspace. +Refer to [Local development and dev workspace](./resources/README.md#local-development-and-dev-workspace) +to use databricks CLI bundles to deploy ML code together with ML resource configs to dev workspace. This will allow you to develop locally and use databricks CLI bundles to deploy to your dev workspace to test out code and config changes. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl index 418d7f28..833893b5 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl @@ -12,10 +12,10 @@ variables: {{- else -}}default: {{template `model_name` .}}{{end}} include: - # Assets folder contains ML artifact assets for the ml project that defines model and experiment - # And workflows assets for the ml project including model training -> validation -> deployment, + # Resources folder contains ML artifact resources for the ML project that defines model and experiment + # And workflows resources for the ML project including model training -> validation -> deployment, # {{- if (eq .input_include_feature_store `yes`) }} feature engineering, {{ end }} batch inference, data monitoring, metric refresh, alerts and triggering retraining - - ./assets/*.yml + - ./resources/*.yml # Deployment Target specific values for workspace targets: diff --git a/template/{{.input_root_dir}}/{{template 
`project_name_alphanumeric_underscore` .}}/deployment/batch_inference/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/README.md.tmpl index 38de3fec..61ba6ab5 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/README.md.tmpl @@ -1,5 +1,5 @@ # Batch Inference -To set up batch inference job via scheduled Databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/assets/README.md](../../assets/README.md) +To set up batch inference job via scheduled Databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../../resources/README.md) ## Prepare the batch inference input table for the example Project Please run the following code in a notebook to generate the example batch inference input table. 
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl index b83015fa..647a4f97 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl @@ -4,7 +4,7 @@ # # This notebook is an example of applying a model for batch inference against an input delta table, # It is configured and can be executed as the batch_inference_job in the batch_inference_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml`` +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml`` # # Parameters: # diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/README.md.tmpl index 5db6b0e2..395591a8 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/README.md.tmpl @@ -1,4 +1,4 @@ # Feature Engineering -To set up the feature engineering job via scheduled Databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/assets/README.md](../assets/README.md) +To set up the feature engineering job via scheduled Databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` 
.}}/resources/README.md](../resources/README.md) For additional details on using the feature store, please refer to [the project-level README](../README.md). diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl index 70f9826a..90b79c95 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl @@ -4,7 +4,7 @@ # # This notebook can be used to generate and write features to a Databricks Feature Store table. # It is configured and can be executed as the tasks in the write_feature_table_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/assets/feature-engineering-workflow-asset.yml`` +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml`` # # Parameters: # diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl similarity index 74% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/README.md.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl index a25a1c9c..be303306 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl 
@@ -7,42 +7,42 @@ * [Develop and test config changes](#develop-and-test-config-changes) {{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} * [CI/CD](#set-up-cicd) -* [Deploy initial ML assets](#deploy-initial-ml-assets) +* [Deploy initial ML resources](#deploy-initial-ml-resources) * [Deploy config changes](#deploy-config-changes) {{- end }} ## Intro ### databricks CLI bundles -MLOps Stacks ML assets are configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}). +MLOps Stacks ML resources are configured and deployed through [databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}). The bundle setting file must be expressed in YAML format and must contain at minimum the top-level bundle mapping. The databricks CLI bundles top level is defined by file `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml`. -During databricks CLI bundles deployment, the root config file will be loaded, validated and deployed to workspace provided by the environment together with all the included assets. +During databricks CLI bundles deployment, the root config file will be loaded, validated and deployed to workspace provided by the environment together with all the included resources. 
ML Resource Configurations in this directory: - - model workflow (`{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml`) - - batch inference workflow (`{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml`) - - monitoring workflow (`{{template `project_name_alphanumeric_underscore` .}}/assets/monitoring-workflow-asset.yml`) - - feature engineering workflow (`{{template `project_name_alphanumeric_underscore` .}}/assets/feature-engineering-workflow-asset.yml`) - - model definition and experiment definition (`{{template `project_name_alphanumeric_underscore` .}}/assets/ml-artifacts-asset.yml`) + - model workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`) + - batch inference workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml`) + - monitoring workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-workflow-resource.yml`) + - feature engineering workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml`) + - model definition and experiment definition (`{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml`) ### Deployment Config & CI/CD integration -The ML assets can be deployed to databricks workspace based on the databricks CLI bundles deployment config. -Deployment configs of different deployment targets share the general ML asset configurations with added ability to specify deployment target specific values (workspace URI, model name, jobs notebook parameters, etc). +The ML resources can be deployed to databricks workspace based on the databricks CLI bundles deployment config. 
+Deployment configs of different deployment targets share the general ML resource configurations with added ability to specify deployment target specific values (workspace URI, model name, jobs notebook parameters, etc). {{- if (eq .input_setup_cicd_and_project `Project_Only`) }} NOTE: This project was not setup with CI/CD workflows. You can setup CI/CD with a new initialization of MLOps Stacks. The rest of this section only applies if you are using a monorepo setup with CI/CD previously or have setup CI/CD otherwise. {{- else }} -This project ships with CI/CD workflows for developing and deploying ML asset configurations based on deployment config. +This project ships with CI/CD workflows for developing and deploying ML resource configurations based on deployment config. {{- end }} {{- if (eq .input_include_models_in_unity_catalog "yes") }} For Model Registry in Unity Catalog, we expect a catalog to exist with the name of the deployment target by default. For example, if the deployment target is `dev`, we expect a catalog named `dev` to exist in the workspace. -If you want to use different catalog names, please update the `targets` declared in the `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml` and `{{template `project_name_alphanumeric_underscore` .}}/assets/ml-artifacts-asset.yml` files. +If you want to use different catalog names, please update the `targets` declared in the `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml` and `{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml` files. If changing the `staging`, `prod`, or `test` deployment targets, you'll need to update the {{- if (eq .input_setup_cicd_and_project `Project_Only`) }} workflows located in the root directory if in a monorepo setup with CI/CD. Otherwise you can setup CI/CD with a new initialization of MLOps Stacks. 
{{- else if or (eq .input_cicd_platform `github_actions`) (eq .input_cicd_platform `github_actions_for_github_enterprise_servers`) }} workflows located in the `.github/workflows` directory. @@ -52,23 +52,23 @@ If changing the `staging`, `prod`, or `test` deployment targets, you'll need to | Deployment Target | Description | Databricks Workspace | Model Name | Experiment Name | |-------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------|-------------------------------------|------------------------------------------------| -| dev | The `dev` deployment target is used by ML engineers to deploy ML assets to development workspace with `dev` configs. The config is for ML project development purposes. | dev workspace | dev-{{template `model_name` .}} | /dev-{{template `experiment_base_name` .}} | +| dev | The `dev` deployment target is used by ML engineers to deploy ML resources to development workspace with `dev` configs. The config is for ML project development purposes. | dev workspace | dev-{{template `model_name` .}} | /dev-{{template `experiment_base_name` .}} | | staging | The `staging` deployment target is part of the CD pipeline. Latest {{ .input_default_branch }} content will be deployed to staging workspace with `staging` config. | staging workspace | staging-{{template `model_name` .}} | /staging-{{template `experiment_base_name` .}} | | prod | The `prod` deployment target is part of the CD pipeline. Latest {{ .input_release_branch }} content will be deployed to prod workspace with `prod` config. | prod workspace | prod-{{template `model_name` .}} | /prod-{{template `experiment_base_name` .}} | -| test | The `test` deployment target is part of the CI pipeline. 
For changes targeting the {{ .input_default_branch }} branch, upon making a PR, an integration test will be triggered and ML assets deployed to the staging workspace defined under `test` deployment target. | staging workspace | test-{{template `model_name` .}} | /test-{{template `experiment_base_name` .}} | +| test | The `test` deployment target is part of the CI pipeline. For changes targeting the {{ .input_default_branch }} branch, upon making a PR, an integration test will be triggered and ML resources deployed to the staging workspace defined under `test` deployment target. | staging workspace | test-{{template `model_name` .}} | /test-{{template `experiment_base_name` .}} | -During ML code development, you can deploy local ML asset configurations together with ML code to the a Databricks workspace to run the training, model validation or batch inference pipelines. The deployment will use `dev` config by default. +During ML code development, you can deploy local ML resource configurations together with ML code to a Databricks workspace to run the training, model validation or batch inference pipelines. The deployment will use `dev` config by default. -You can open a PR (pull request) to modify ML code or the asset config against {{ .input_default_branch }} branch. -The PR will trigger Python unit tests, followed by an integration test executed on the staging workspace, as defined under the `test` environment asset. +You can open a PR (pull request) to modify ML code or the resource config against {{ .input_default_branch }} branch. +The PR will trigger Python unit tests, followed by an integration test executed on the staging workspace, as defined under the `test` environment resource. -Upon merging a PR to the {{ .input_default_branch }} branch, the {{ .input_default_branch }} branch content will be deployed to the staging workspace with `staging` environment asset configurations.
+Upon merging a PR to the {{ .input_default_branch }} branch, the {{ .input_default_branch }} branch content will be deployed to the staging workspace with `staging` environment resource configurations. -Upon merging code into the release branch, the release branch content will be deployed to prod workspace with `prod` environment asset configurations. +Upon merging code into the release branch, the release branch content will be deployed to prod workspace with `prod` environment resource configurations. {{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} -![ML asset config diagram](../../docs/images/mlops-stack-deploy.png) +![ML resource config diagram](../../docs/images/mlops-stack-deploy.png) {{- end }} ## Local development and dev workspace @@ -82,37 +82,37 @@ To set up the databricks CLI using a Databricks personal access token, take the 3. [Create a personal access token]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/auth.html#personal-access-tokens-for-users")) }}) in your dev workspace and copy it. 4. Set an env variable `DATABRICKS_TOKEN` with your Databricks personal access token in your terminal. For example, run `export DATABRICKS_TOKEN=dapi12345` if the access token is dapi12345. -5. You can now use the databricks CLI to validate and deploy ML asset configurations to the dev workspace. +5. You can now use the databricks CLI to validate and deploy ML resource configurations to the dev workspace. Alternatively, you can use the other approaches described in the [databricks CLI]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/databricks-cli.html")) }}) documentation to set up authentication. For example, using your Databricks username/password, or seting up a local profile. -### Validate and provision ML asset configurations +### Validate and provision ML resource configurations 1. 
After installing the databricks CLI and creating the `DATABRICKS_TOKEN` env variable, change to the `{{template `project_name_alphanumeric_underscore` .}}` directory. -2. Run `databricks bundle validate` to validate the Databricks asset configurations. -3. Run `databricks bundle deploy` to provision the Databricks asset configurations to the dev workspace. The asset configurations and your ML code will be copied together to the dev workspace. The defined assets such as Databricks Workflows, MLflow Model and MLflow Experiment will be provisioned according to the config files under `{{template `project_name_alphanumeric_underscore` .}}/assets`. +2. Run `databricks bundle validate` to validate the Databricks resource configurations. +3. Run `databricks bundle deploy` to provision the Databricks resource configurations to the dev workspace. The resource configurations and your ML code will be copied together to the dev workspace. The defined resources such as Databricks Workflows, MLflow Model and MLflow Experiment will be provisioned according to the config files under `{{template `project_name_alphanumeric_underscore` .}}/resources`. 4. Go to the Databricks dev workspace, check the defined model, experiment and workflows status, and interact with the created workflows. -### Destroy ML asset configurations -After development is done, you can run `databricks bundle destroy` to destroy (remove) the defined Databricks assets in the dev workspace. Any model version with `Production` or `Staging` stage will prevent the model from being deleted. Please update the version stage to `None` or `Archived` before destroying the ML assets. +### Destroy ML resource configurations +After development is done, you can run `databricks bundle destroy` to destroy (remove) the defined Databricks resources in the dev workspace. Any model version with `Production` or `Staging` stage will prevent the model from being deleted. 
Please update the version stage to `None` or `Archived` before destroying the ML resources. {{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} ## Set up CI/CD Please refer to [mlops-setup](../../docs/mlops-setup.md#configure-cicd) for instructions to set up CI/CD. -## Deploy initial ML assets +## Deploy initial ML resources After completing the prerequisites, create and push a PR branch adding all files to the Git repo: ``` -git checkout -b add-ml-asset-config-and-code +git checkout -b add-ml-resource-config-and-code git add . -git commit -m "Add ML asset config and ML code" -git push upstream add-ml-asset-config-and-code +git commit -m "Add ML resource config and ML code" +git push upstream add-ml-resource-config-and-code ``` Open a pull request to merge the pushed branch into the `{{ .input_default_branch }}` branch. Upon creating this PR, the CI workflows will be triggered. These CI workflow will run unit and integration tests of the ML code, -in addition to validating the Databricks assets to be deployed to both staging and prod workspaces. -Once CI passes, merge the PR into the `{{ .input_default_branch }}` branch. This will deploy an initial set of Databricks assets to the staging workspace. -assets will be deployed to the prod workspace on pushing code to the `{{ .input_release_branch }}` branch. +in addition to validating the Databricks resources to be deployed to both staging and prod workspaces. +Once CI passes, merge the PR into the `{{ .input_default_branch }}` branch. This will deploy an initial set of Databricks resources to the staging workspace. +Resources will be deployed to the prod workspace on pushing code to the `{{ .input_release_branch }}` branch. Follow the next section to configure the input and output data tables for the batch inference job.
{{- end }} @@ -120,7 +120,7 @@ Follow the next section to configure the input and output data tables for the ba ### Setting up the batch inference job The batch inference job expects an input Delta table with a schema that your registered model accepts. To use the batch inference job, set up such a Delta table in both your staging and prod workspaces. -Following this, update the batch_inference_job base parameters in `{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml` to pass +Following this, update the batch_inference_job base parameters in `{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml` to pass the name of the input Delta table and the name of the output Delta table to which to write batch predictions. As the batch job will be run with the credentials of the service principal that provisioned it, make sure that the service @@ -136,17 +136,17 @@ in staging and prod. Its central purpose is to evaluate a registered model and validate its quality before deploying the model to Production/Staging. Model validation contains three components: -* [model-workflow-asset.yml](./model-workflow-asset.yml) contains the asset config and input parameters for model validation. -* [validation.py](../validation/validation.py) defines custom metrics and validation thresholds that are referenced by the above asset config files. +* [model-workflow-resource.yml](./model-workflow-resource.yml) contains the resource config and input parameters for model validation. +* [validation.py](../validation/validation.py) defines custom metrics and validation thresholds that are referenced by the above resource config files. * [notebooks/ModelValidation](../validation/notebooks/ModelValidation.py) contains the validation job implementation. In most cases you don't need to modify this file. 
To set up and enable model validation, update [validation.py](../validation/validation.py) to return desired custom metrics and validation thresholds, then -resolve the `TODOs` in the ModelValidation task of [model-workflow-asset.yml](./model-workflow-asset.yml). +resolve the `TODOs` in the ModelValidation task of [model-workflow-resource.yml](./model-workflow-resource.yml). ## Develop and test config changes ### databricks CLI bundles schema overview -To get started, open `{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml`. The file contains the ML asset definition of a batch inference job, like: +To get started, open `{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml`. The file contains the ML resource definition of a batch inference job, like: ```$xslt new_cluster: &new_cluster @@ -175,13 +175,13 @@ resources: The example above defines a Databricks job with name `${bundle.target}-{{ .input_project_name }}-batch-inference-job` that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py` to regularly apply your ML model for batch inference. -At the start of the asset definition, we declared an anchor `new_cluster` that will be referenced and used later. For more information about anchors in yaml schema, please refer to the [yaml documentation](https://yaml.org/spec/1.2.2/#3222-anchors-and-aliases). +At the start of the resource definition, we declared an anchor `new_cluster` that will be referenced and used later. For more information about anchors in yaml schema, please refer to the [yaml documentation](https://yaml.org/spec/1.2.2/#3222-anchors-and-aliases). -We specify a `batch_inference_job` under `assets/jobs` to define a databricks workflow with internal key `batch_inference_job` and job name `{bundle.target}-{{ .input_project_name }}-batch-inference-job`. 
+We specify a `batch_inference_job` under `resources/jobs` to define a databricks workflow with internal key `batch_inference_job` and job name `${bundle.target}-{{ .input_project_name }}-batch-inference-job`. The workflow contains a single task with task key `batch_inference_job`. The task runs notebook `../deployment/batch_inference/notebooks/BatchInference.py` with provided parameters `env` and `input_table_name` passing to the notebook. After setting up databricks CLI, you can run command `databricks bundle schema` to learn more about databricks CLI bundles schema. -The notebook_path is the relative path starting from the asset yaml file. +The notebook_path is the relative path starting from the resource yaml file. ### Environment config based variables The `${bundle.target}` will be replaced by the environment config name during the bundle deployment. For example, during the deployment of a `test` environment config, the job name will be @@ -246,13 +246,13 @@ Alternatively you can open a PR. Continuous integration will then validate the u Please refer to [Local development and dev workspace](#local-development-and-dev-workspace). ### Test workspace deployment(CI) -After setting up CI/CD, PRs against the {{ .input_default_branch }} branch will trigger CI workflows to run unit tests, integration test and asset validation. -The integration test will deploy MLflow model, MLflow experiment and Databricks workflow assets defined under the `test` environment asset config to the staging workspace. The integration test then triggers a run of the model workflow to verify the ML code. +After setting up CI/CD, PRs against the {{ .input_default_branch }} branch will trigger CI workflows to run unit tests, integration test and resource validation. +The integration test will deploy MLflow model, MLflow experiment and Databricks workflow resources defined under the `test` environment resource config to the staging workspace.
The integration test then triggers a run of the model workflow to verify the ML code. ### Staging and Prod workspace deployment(CD) -After merging a PR to the {{ .input_default_branch }} branch, continuous deployment automation will deploy the `staging` assets to the staging workspace. +After merging a PR to the {{ .input_default_branch }} branch, continuous deployment automation will deploy the `staging` resources to the staging workspace. -When you about to cut a release, you can create and merge a PR to merge changes from {{ .input_default_branch }} to {{ .input_release_branch }}. Continuous deployment automation will deploy `prod` assets to the prod workspace. +When you are about to cut a release, you can create and merge a PR to merge changes from {{ .input_default_branch }} to {{ .input_release_branch }}. Continuous deployment automation will deploy `prod` resources to the prod workspace. {{- end }} [Back to project README](../README.md) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl similarity index 94% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl index a445d6ff..b9703fce 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/batch-inference-workflow-asset.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl @@ -29,7 +29,7 @@ resources: {{- else -}}output_table_name: ${bundle.target}.{{ .input_schema_name }}.predictions{{ end }} {{ if (eq
.input_include_models_in_unity_catalog `no`) }}model_name: ${var.model_name} {{- else -}}model_name: ${bundle.target}.{{ .input_schema_name }}.${var.model_name}{{ end }} - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} schedule: diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/feature-engineering-workflow-asset.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl similarity index 93% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/feature-engineering-workflow-asset.yml.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl index a6442789..5f22c602 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/feature-engineering-workflow-asset.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl @@ -34,7 +34,7 @@ resources: {{- else -}}output_table_name: ${bundle.target}.{{ .input_schema_name }}.trip_pickup_features{{ end }} features_transform_module: pickup_features primary_keys: zip - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. 
It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} - task_key: DropoffFeatures job_cluster_key: write_feature_table_job_cluster @@ -51,7 +51,7 @@ resources: {{- else -}}output_table_name: ${bundle.target}.{{ .input_schema_name }}.trip_dropoff_features{{ end }} features_transform_module: dropoff_features primary_keys: zip - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} schedule: quartz_cron_expression: "0 0 7 * * ?" # daily at 7am diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/ml-artifacts-asset.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml.tmpl similarity index 100% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/ml-artifacts-asset.yml.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/ml-artifacts-resource.yml.tmpl diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl similarity index 92% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl index 61c295fa..10e8e7bc 100644 --- a/template/{{.input_root_dir}}/{{template 
`project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl @@ -30,7 +30,7 @@ resources: experiment_name: ${var.experiment_name} {{ if (eq .input_include_models_in_unity_catalog `no`) }}model_name: ${var.model_name} {{- else -}}model_name: ${bundle.target}.{{ .input_schema_name }}.${var.model_name}{{ end }} - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} {{ else if (eq .input_include_feature_store `yes`) }}notebook_task: notebook_path: ../training/notebooks/TrainWithFeatureStore.py @@ -45,13 +45,13 @@ resources: {{- else -}}pickup_features_table: ${bundle.target}.{{ .input_schema_name }}.trip_pickup_features{{ end }} {{ if (eq .input_include_models_in_unity_catalog `no`) }}dropoff_features_table: feature_store_taxi_example.${bundle.target}_{{template `project_name_alphanumeric_underscore` .}}_trip_dropoff_features {{- else -}}dropoff_features_table: ${bundle.target}.{{ .input_schema_name }}.trip_dropoff_features{{ end }} - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} {{- else -}}notebook_task: notebook_path: ../training/notebooks/TrainWithMLflowRecipes.py base_parameters: env: ${bundle.target} - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. 
It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}{{ end }} - task_key: ModelValidation job_cluster_key: model_training_job_cluster @@ -97,7 +97,7 @@ resources: # Specifies the name of the function in {{ .input_project_name }}/training_validation_deployment/validation/validation.py that returns evaluator_config. # TODO(optional): evaluator_config_loader_function evaluator_config_loader_function: evaluator_config - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} - task_key: ModelDeployment job_cluster_key: model_training_job_cluster @@ -107,7 +107,7 @@ resources: notebook_path: ../deployment/model_deployment/notebooks/ModelDeployment.py base_parameters: env: ${bundle.target} - # git source information of current ML asset deployment. It will be persisted as part of the workflow run + # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} schedule: quartz_cron_expression: "0 0 9 * * ?" 
# daily at 9am diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/monitoring-workflow-asset.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-workflow-resource.yml.tmpl similarity index 100% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/assets/monitoring-workflow-asset.yml.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-workflow-resource.yml.tmpl diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl index 28aced83..f4789b2c 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl @@ -12,7 +12,7 @@ This folder contains example ML code to train a regression model to predict NYC [MLflow recipes](https://mlflow.org/docs/latest/recipes.html). **Note**: MLflow Recipes currently supports regression and classification problems. Usage of MLflow Recipes is encouraged but not required: you can still use the provided -CI/CD and ML asset configs to build production ML pipelines, as long as you provide ML notebooks under `notebooks` +CI/CD and ML resource configs to build production ML pipelines, as long as you provide ML notebooks under `notebooks` directory of the corresponding component, for example, model training notebooks in `{{template `project_name_alphanumeric_underscore` .}}/training/notebooks`, batch inference notebook in `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks`. 
See code comments in files under `notebooks` for the expected interface & behavior of these notebooks. @@ -41,10 +41,10 @@ We expect most development to take place in the abovementioned YAML config files ## Iterating on ML code -### Deploy ML code and assets to dev workspace using Bundles +### Deploy ML code and resources to dev workspace using Bundles -Refer to [Local development and dev workspace](../assets/README.md#local-development-and-dev-workspace) -to use databricks CLI bundles to deploy ML code together with ML asset configs to dev workspace. +Refer to [Local development and dev workspace](../resources/README.md#local-development-and-dev-workspace) +to use databricks CLI bundles to deploy ML code together with ML resource configs to dev workspace. ### Develop on Databricks using Databricks Repos diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl index 44eac90c..b6cd7693 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl @@ -4,7 +4,7 @@ # # This notebook shows an example of a Model Training pipeline using Delta tables. # It is configured and can be executed as the "Train" task in the model_training_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml`` +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`` # # Parameters: # * env (required): - Environment the notebook is run in (staging, or prod). Defaults to "staging". 
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl index 9198d62d..c8b8884d 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl @@ -4,7 +4,7 @@ # # This notebook shows an example of a Model Training pipeline using Databricks Feature Store tables. # It is configured and can be executed as the "Train" task in the model_training_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml`` +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`` # # Parameters: # * env (required): - Environment the notebook is run in (staging, or prod). Defaults to "staging". diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl index c207505d..c06cc6c4 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl @@ -4,7 +4,7 @@ ## # This notebook runs the MLflow Regression Recipe to train and registers an MLflow model in the model registry. 
# It is configured and can be executed as the "Train" task in the model_training_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml`` +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`` # # NOTE: In general, we recommend that you do not modify this notebook directly, and instead update data-loading # and model training logic in Python modules under the `steps` directory. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/README.md.tmpl index 9a21a3f9..e5a6fec8 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/README.md.tmpl @@ -1,2 +1,2 @@ # Model Validation -To enable model validation as part of scheduled databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/assets/README.md](../assets/README.md) \ No newline at end of file +To enable model validation as part of scheduled databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../resources/README.md) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl index 3870444a..be0a3b6f 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl @@ -5,7 +5,7 @@ 
# This notebook uses mlflow model validation API to run mode validation after training and registering a model # in model registry, before deploying it to the {{- if (eq .input_include_models_in_unity_catalog "no") }}"Production" stage{{else}} "Champion" alias{{end -}}. # -# It runs as part of CD and by an automated model training job -> validation -> deployment job defined under ``{{template `project_name_alphanumeric_underscore` .}}/assets/model-workflow-asset.yml`` +# It runs as part of CD and by an automated model training job -> validation -> deployment job defined under ``{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`` # # # Parameters: diff --git a/tests/utils.py b/tests/utils.py index 3b58d495..3d448cb6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ import subprocess from functools import wraps -ASSET_TEMPLATE_ROOT_DIRECTORY = str(pathlib.Path(__file__).parent.parent) +RESOURCE_TEMPLATE_ROOT_DIRECTORY = str(pathlib.Path(__file__).parent.parent) AZURE_DEFAULT_PARAMS = { "input_setup_cicd_and_project": "CICD_and_Project", @@ -161,7 +161,7 @@ def generate(directory, databricks_cli, context): check=True, ) subprocess.run( - f"{databricks_cli} bundle init {ASSET_TEMPLATE_ROOT_DIRECTORY} --config-file {config_file} --output-dir {directory}", + f"{databricks_cli} bundle init {RESOURCE_TEMPLATE_ROOT_DIRECTORY} --config-file {config_file} --output-dir {directory}", shell=True, check=True, )