diff --git a/.gitignore b/.gitignore index da9e42c201..065f31cb3c 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,8 @@ node_modules/ .coverage htmlcov/ *.iml +**/target + # Terraform # See: https://github.com/github/gitignore/blob/master/Terraform.gitignore diff --git a/examples/iap-user-profile/package-lock.json b/examples/iap-user-profile/package-lock.json index 8707d1da6e..e8036e6941 100644 --- a/examples/iap-user-profile/package-lock.json +++ b/examples/iap-user-profile/package-lock.json @@ -86,24 +86,38 @@ } } }, - "@babel/helper-function-name": { - "version": "7.9.5", - "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.9.5.tgz", - "integrity": "sha512-JVcQZeXM59Cd1qanDUxv9fgJpt3NeKUaqBqUEvfmQ+BCOKq2xUgaWZW2hr0dkbyJgezYuplEoh5knmrnS68efw==", - "dev": true, - "requires": { - "@babel/helper-get-function-arity": "^7.8.3", - "@babel/template": "^7.8.3", - "@babel/types": "^7.9.5" - } + "@babel/helper-environment-visitor": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", + "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==", + "dev": true }, - "@babel/helper-get-function-arity": { - "version": "7.8.3", - "resolved": "https://registry.npmjs.org/@babel/helper-get-function-arity/-/helper-get-function-arity-7.8.3.tgz", - "integrity": "sha512-FVDR+Gd9iLjUMY1fzE2SR0IuaJToR4RkCDARVfsBBPSP53GEqSFjD8gNyxg246VUyc/ALRxFaAK8rVG7UT7xRA==", + "@babel/helper-hoist-variables": { + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", + "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", "dev": true, "requires": { - "@babel/types": "^7.8.3" + "@babel/types": "^7.22.5" + }, + "dependencies": { + "@babel/helper-validator-identifier": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", + "dev": true + }, + "@babel/types": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", + "dev": true, + "requires": { + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", + "to-fast-properties": "^2.0.0" + } + } } }, "@babel/helper-member-expression-to-functions": { @@ -185,6 +199,12 @@ "@babel/types": "^7.8.3" } }, + "@babel/helper-string-parser": { + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz", + "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw==", + "dev": true + }, "@babel/helper-validator-identifier": { "version": "7.9.5", "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.9.5.tgz", @@ -358,36 +378,158 @@ } }, "@babel/traverse": { - "version": "7.9.6", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.9.6.tgz", - "integrity": "sha512-b3rAHSjbxy6VEAvlxM8OV/0X4XrG72zoxme6q1MOoe2vd0bEc+TwayhuC1+Dfgqh1QEG+pj7atQqvUprHIccsg==", - "dev": true, - "requires": { - 
"@babel/code-frame": "^7.8.3", - "@babel/generator": "^7.9.6", - "@babel/helper-function-name": "^7.9.5", - "@babel/helper-split-export-declaration": "^7.8.3", - "@babel/parser": "^7.9.6", - "@babel/types": "^7.9.6", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.22.13", + "@babel/generator": "^7.23.0", + "@babel/helper-environment-visitor": "^7.22.20", + "@babel/helper-function-name": "^7.23.0", + "@babel/helper-hoist-variables": "^7.22.5", + "@babel/helper-split-export-declaration": "^7.22.6", + "@babel/parser": "^7.23.0", + "@babel/types": "^7.23.0", "debug": "^4.1.0", - "globals": "^11.1.0", - "lodash": "^4.17.13" + "globals": "^11.1.0" }, "dependencies": { + "@babel/code-frame": { + "version": "7.22.13", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz", + "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==", + "dev": true, + "requires": { + "@babel/highlight": "^7.22.13", + "chalk": "^2.4.2" + } + }, + "@babel/generator": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz", + "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==", + "dev": true, + "requires": { + "@babel/types": "^7.23.0", + "@jridgewell/gen-mapping": "^0.3.2", + "@jridgewell/trace-mapping": "^0.3.17", + "jsesc": "^2.5.1" + } + }, + "@babel/helper-function-name": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", + "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", + "dev": true, + "requires": { + "@babel/template": "^7.22.15", + "@babel/types": "^7.23.0" + } + }, + "@babel/helper-split-export-declaration": { + "version": "7.22.6", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", + "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", + "dev": true, + "requires": { + "@babel/types": "^7.22.5" + } + }, + "@babel/helper-validator-identifier": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", + "dev": true + }, + "@babel/highlight": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz", + "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==", + "dev": true, + "requires": { + "@babel/helper-validator-identifier": "^7.22.20", + "chalk": "^2.4.2", + "js-tokens": "^4.0.0" + } + }, + "@babel/parser": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz", + "integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw==", + "dev": true + }, + "@babel/template": { + "version": "7.22.15", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz", + "integrity": 
"sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.22.13", + "@babel/parser": "^7.22.15", + "@babel/types": "^7.22.15" + } + }, + "@babel/types": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", + "dev": true, + "requires": { + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", + "to-fast-properties": "^2.0.0" + } + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", + "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, "debug": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz", - "integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==", + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", "dev": true, "requires": { - "ms": "^2.1.1" + "ms": "2.1.2" } }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==", + "dev": true + }, "ms": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", "dev": true + }, + "supports-color": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", + "integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } } } }, @@ -814,6 +956,45 @@ "chalk": "^4.0.0" } }, + "@jridgewell/gen-mapping": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.3.tgz", + "integrity": "sha512-HLhSWOLRi875zjjMG/r+Nv0oCW8umGb0BgEhyX3dDX3egwZtB8PqLnjz3yedt8R5StBrzcg4aBpnh8UA9D1BoQ==", + "dev": true, + "requires": { + "@jridgewell/set-array": "^1.0.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.9" + } + }, + "@jridgewell/resolve-uri": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.1.tgz", + "integrity": "sha512-dSYZh7HhCDtCKm4QakX0xFpsRDqjjtZf/kjI/v3T3Nwt5r8/qz/M19F9ySyOqU94SXBmeG9ttTul+YnR4LOxFA==", + "dev": true + }, + "@jridgewell/set-array": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.1.2.tgz", + "integrity": "sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw==", + "dev": true + 
}, + "@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==", + "dev": true + }, + "@jridgewell/trace-mapping": { + "version": "0.3.19", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.19.tgz", + "integrity": "sha512-kf37QtfW+Hwx/buWGMPcR60iF9ziHa6r/CZJIHbmcm4+0qrXiVdxegAH0F6yddEVQ7zdkjcGCgCzUu+BcbhQxw==", + "dev": true, + "requires": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, "@opencensus/core": { "version": "0.0.20", "resolved": "https://registry.npmjs.org/@opencensus/core/-/core-0.0.20.tgz", diff --git a/examples/vertex_mlops_enterprise/.github/workflows/containers.yml.TEMPLATE b/examples/vertex_mlops_enterprise/.github/workflows/containers.yml.TEMPLATE new file mode 100644 index 0000000000..4337b493e9 --- /dev/null +++ b/examples/vertex_mlops_enterprise/.github/workflows/containers.yml.TEMPLATE @@ -0,0 +1,147 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build Containers +on: + push: + paths: + - 'build/Dockerfile.js' + - 'requirements.txt' + workflow_dispatch: + +# Add "id-token" with the intended permissions. +permissions: + contents: 'read' + id-token: 'write' + +env: + ENVIRONMENT: ${environment} + PROJECT_ID: ${project_id} + # SA used to authenticate in GCP through Workload Identity Federation + SERVICE_ACCOUNT: ${sa} + REGION: europe-west4 + DOCKER_REPO: ${docker_repo} + WORKLOAD_ID_PROVIDER: ${wip} + CLOUDBUILD_LOGS: gs://${project_id}_cloudbuild/logs +jobs: + build-container-cicd-tfx: + name: 'Build container CI/CD TFX' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Build container' + run: | + cp build/Dockerfile.cicd-tfx build/Dockerfile + gcloud builds submit --gcs-log-dir=$${{ env.CLOUDBUILD_LOGS }} --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --tag $${{ env.DOCKER_REPO }}/cicd-tfx:latest build/. 
--timeout=15m --machine-type=e2-highcpu-8 --suppress-logs + + + build-container-cicd-kfp: + name: 'Build container CI/CD KFP' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Build container' + run: | + gcloud builds submit --gcs-log-dir=$${{ env.CLOUDBUILD_LOGS }} --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --tag $${{ env.DOCKER_REPO }}/cicd-kfp:latest src/kfp_pipelines/. --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs + + build-container-model-card: + name: 'Build container for Model Card' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Build container' + run: | + cp src/kfp_pipelines/Dockerfile.modelcard Dockerfile + gcloud builds submit --gcs-log-dir=$${{ env.CLOUDBUILD_LOGS }} --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --tag $${{ env.DOCKER_REPO }}/model-card:latest . --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs + + build-container-vertex: + name: 'Build container Vertex' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Build container' + run: | + cp build/Dockerfile.vertex Dockerfile + gcloud builds submit --gcs-log-dir=$${{ env.CLOUDBUILD_LOGS }} --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --tag $${{ env.DOCKER_REPO }}/vertex:latest . --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs + + build-container-dataflow: + name: 'Build container Dataflow' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Build container' + run: | + cp build/Dockerfile.dataflow Dockerfile + gcloud builds submit --gcs-log-dir=$${{ env.CLOUDBUILD_LOGS }} --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --tag $${{ env.DOCKER_REPO }}/dataflow:latest . 
--timeout=15m --machine-type=e2-highcpu-8 --suppress-logs + diff --git a/examples/vertex_mlops_enterprise/.github/workflows/deploy.yml.TEMPLATE b/examples/vertex_mlops_enterprise/.github/workflows/deploy.yml.TEMPLATE new file mode 100644 index 0000000000..7be2585cae --- /dev/null +++ b/examples/vertex_mlops_enterprise/.github/workflows/deploy.yml.TEMPLATE @@ -0,0 +1,52 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Deploy tfx model +on: + workflow_dispatch: + +# Add "id-token" with the intended permissions. +permissions: + contents: 'read' + id-token: 'write' + +env: + ENVIRONMENT: ${environment} + PROJECT_ID: ${project_id} + # SA used to authenticate in GCP through Workload Identity Federation + SERVICE_ACCOUNT: ${sa} + REGION: europe-west4 + DOCKER_REPO: ${docker_repo} + WORKLOAD_ID_PROVIDER: ${wip} +jobs: + deploy-model: + name: 'Deploy model to endpoint' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Deploy model' + run: gcloud builds submit --no-source --config build/$${{ env.ENVIRONMENT }}/model-deployment.yaml --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --machine-type=e2-highcpu-8 --suppress-logs + \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/.github/workflows/main.yml.TEMPLATE b/examples/vertex_mlops_enterprise/.github/workflows/main.yml.TEMPLATE new file mode 100644 index 0000000000..a54f500bde --- /dev/null +++ b/examples/vertex_mlops_enterprise/.github/workflows/main.yml.TEMPLATE @@ -0,0 +1,51 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Deploy Vertex AI ${framework} Pipeline +on: + workflow_dispatch: + +# Add "id-token" with the intended permissions. 
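An aside on the deploy workflow above: its `Deploy model` step only submits `build/<environment>/model-deployment.yaml` to Cloud Build, and that config file is not part of this diff. As a rough, hedged illustration of what such a deployment step typically amounts to on Vertex AI, the sketch below uses the `google-cloud-aiplatform` SDK; the project, region, and display names are hypothetical stand-ins rather than values taken from this PR.

```python
# Illustrative sketch only: build/<env>/model-deployment.yaml is not included
# in this diff, so this shows roughly what the "Deploy model" step amounts to
# using the Vertex AI SDK. Project, region and display names are hypothetical.
from google.cloud import aiplatform

aiplatform.init(project="my-mlops-project", location="europe-west4")

# Pick the most recently updated registered model with the expected name.
model = aiplatform.Model.list(
    filter='display_name="creditcards-classifier-v02"',
    order_by="update_time",
)[-1]

# Create an endpoint and deploy the model with all traffic routed to it.
endpoint = aiplatform.Endpoint.create(display_name="creditcards-classifier")
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name="creditcards-classifier-v02",
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
    traffic_percentage=100,
)
```

In the actual workflow this logic lives inside the Cloud Build config, so the GitHub runner itself never needs the SDK installed.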
+permissions: + contents: 'read' + id-token: 'write' + +env: + ENVIRONMENT: ${environment} + PROJECT_ID: ${project_id} + # SA used to authenticate in GCP through Workload Identity Federation + SERVICE_ACCOUNT: ${sa} + REGION: europe-west4 + DOCKER_REPO: ${docker_repo} + WORKLOAD_ID_PROVIDER: ${wip} +jobs: + deploy-pipeline: + name: 'Compile Vertex AI pipeline' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 3600s + + - name: 'Deploy Vertex AI Pipeline' + run: gcloud builds submit --no-source --config build/$${{ env.ENVIRONMENT }}/pipeline-deployment-${framework}.yaml --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --machine-type=e2-highcpu-8 --suppress-logs diff --git a/examples/vertex_mlops_enterprise/.github/workflows/run.yml.TEMPLATE b/examples/vertex_mlops_enterprise/.github/workflows/run.yml.TEMPLATE new file mode 100644 index 0000000000..e4c1f649d0 --- /dev/null +++ b/examples/vertex_mlops_enterprise/.github/workflows/run.yml.TEMPLATE @@ -0,0 +1,52 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Run Vertex AI ${framework} Pipeline +on: + workflow_dispatch: + +# Add "id-token" with the intended permissions. 
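A note on the `*.TEMPLATE` syntax shared by all of these workflow files: placeholders such as `${environment}`, `${project_id}`, `${sa}`, `${wip}`, `${docker_repo}` and `${framework}` are filled in by whatever renders the templates (presumably Terraform's `templatefile()`, given the Terraform setup in this example; the rendering step itself is not shown in this diff), while GitHub Actions expressions are written as `$${{ ... }}` so that the escaped `$$` collapses to a single `$` and the rendered workflow contains a literal `${{ ... }}`. The minimal Python sketch below illustrates the same convention with `string.Template`, which uses identical `${var}` placeholders and `$$` escaping; the substituted values are made up.

```python
# Minimal sketch (not part of this PR): how a *.TEMPLATE workflow might be
# rendered. string.Template uses the same ${var} placeholders and the same
# "$$" escape as the convention seen in these template files.
from string import Template

template_text = """\
name: Run Vertex AI ${framework} Pipeline
env:
  ENVIRONMENT: ${environment}
  PROJECT_ID: ${project_id}
steps:
  - uses: 'actions/checkout@v3'
    with:
      token: $${{ github.token }}
"""

rendered = Template(template_text).substitute(
    framework="tfx",              # hypothetical values, for illustration only
    environment="dev",
    project_id="my-mlops-project",
)
print(rendered)
```

Running it prints a workflow fragment in which `${framework}`, `ENVIRONMENT` and `PROJECT_ID` have been substituted while `${{ github.token }}` survives as a literal GitHub Actions expression.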
+permissions: + contents: 'read' + id-token: 'write' + +env: + ENVIRONMENT: ${environment} + PROJECT_ID: ${project_id} + # SA used to authenticate in GCP through Workload Identity Federation + SERVICE_ACCOUNT: ${sa} + REGION: europe-west4 + DOCKER_REPO: ${docker_repo} + WORKLOAD_ID_PROVIDER: ${wip} +jobs: + run-pipeline: + name: 'Run Vertex AI ${framework} pipeline' + runs-on: 'ubuntu-latest' + steps: + - uses: 'actions/checkout@v3' + with: + token: $${{ github.token }} + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: $${{ env.WORKLOAD_ID_PROVIDER }} + service_account: $${{ env.SERVICE_ACCOUNT }} + access_token_lifetime: 7200s + + - name: 'Run Vertex Pipeline' + run: gcloud builds submit --no-source --config build/$${{ env.ENVIRONMENT }}/pipeline-run-${framework}.yaml --project $${{ env.PROJECT_ID }} --region $${{ env.REGION }} --machine-type=e2-highcpu-8 --suppress-logs + \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/.gitignore b/examples/vertex_mlops_enterprise/.gitignore new file mode 100644 index 0000000000..884b95def8 --- /dev/null +++ b/examples/vertex_mlops_enterprise/.gitignore @@ -0,0 +1,6 @@ +.terraform* +terraform.tfstate* +*~ +.DS_Store +**/__pycache__/** +venv diff --git a/examples/vertex_mlops_enterprise/01-experimentation.ipynb b/examples/vertex_mlops_enterprise/01-experimentation.ipynb new file mode 100644 index 0000000000..78127fbef9 --- /dev/null +++ b/examples/vertex_mlops_enterprise/01-experimentation.ipynb @@ -0,0 +1,1206 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "da299444", + "metadata": {}, + "source": [ + "Copyright 2023 Google LLC\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "you may not use this file except in compliance with the License.\n", + "You may obtain a copy of the License at\n", + "\n", + " http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software\n", + "distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "See the License for the specific language governing permissions and\n", + "limitations under the License." + ] + }, + { + "cell_type": "markdown", + "id": "9d0817cd-9e64-4d5e-9c66-4e0961aa1085", + "metadata": {}, + "source": [ + "# MLOps End to End Workflow (I)\n", + "\n", + "Implementation of an end-to-end ML Ops workflow for the use case to detect fraudulent credit card transactions, see [Kaggle dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud).\n", + "\n", + "This set of notebooks cover:\n", + "\n", + "[Experimentation](01-experimentation.ipynb):\n", + "1. Set up: Creation of the Vertex Dataset, extraction of the schema\n", + "2. Implementation of a TFX pipeline\n", + "\n", + "[CICD](02-cicd.ipynb):\n", + "\n", + "3. Deployment of the Vertex AI Pipeline through a CI/CD process\n", + "4. Deployment of a Continuous Training pipeline that can be triggered via Pub/Sub and produces a model in the Model Registry\n", + "5. Deployment of the Inference Pipeline consisting of a Cloud Function that retrieves features from Feature Store and calls the model on a Vertex AI Endpoint\n", + "6. Deployment of the model to a Vertex AI Endpoint through a CI/CD process\n", + "\n", + "[Prediction](03-prediction.ipynb):\n", + "\n", + "7. Deploy the model to an endpoint\n", + "8. 
Create a test prediction\n" + ] + }, + { + "cell_type": "markdown", + "id": "d62219f7-f6fe-476e-bf88-42db7978460c", + "metadata": { + "tags": [] + }, + "source": [ + "## Setup\n", + "\n", + "### Package installations\n", + "\n", + "Run \n", + "\n", + "```\n", + "pip install --user -r requirements.txt\n", + "```\n", + "\n", + "If an error occurs later, upon importing `tensorflow` related to `numpy`, this can be fixed with a forced reinstall of numpy:\n", + "\n", + "```\n", + "pip install --user numpy==1.21.6 --force-reinstall\n", + "```\n", + "\n", + "Moreover, running the end-to-end unit test of the pipeline requires forcing a re-install of `grpcio` and `grpcio-tools`:\n", + "\n", + "```\n", + "pip install --user grpcio --no-binary=grpcio --force-reinstall\n", + "pip install --user grpcio-tools --no-binary=grpcio-tools --force-reinstall\n", + "```\n", + "\n", + "### Setup config\n", + "Edit the file `mainconfig.yaml` and use the adequate configuration parameters for your environment.\n", + "\n", + "### Load data into BigQuery\n", + "\n", + "We are using the Credit Card Fraud Detection dataset from the Machine Learning Group at the Universite Libre de Bruxelles, available on [Kaggle](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud).\n", + "\n", + "Create a dataset called `creditcards` in the `EU` region.\n", + "\n", + "To load the data into BQ, from Cloud Shell:\n", + "\n", + "```\n", + "bq load --skip_leading_rows=1 creditcards.creditcards gs://cxb1-prj-test-no-vpcsc/csv/creditcard.csv Time:STRING,V1:FLOAT,V2:FLOAT,V3:FLOAT,V4:FLOAT,V5:FLOAT,V6:FLOAT,V7:FLOAT,V8:FLOAT,V9:FLOAT,V10:FLOAT,V11:FLOAT,V12:FLOAT,V13:FLOAT,V14:FLOAT,V15:FLOAT,V16:FLOAT,V17:FLOAT,V18:FLOAT,V19:FLOAT,V20:FLOAT,V21:FLOAT,V22:FLOAT,V23:FLOAT,V24:FLOAT,V25:FLOAT,V26:FLOAT,V27:FLOAT,V28:FLOAT,Amount:FLOAT,Class:STRING\n", + "```\n", + "\n", + "### Import Libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b997d6a-f8fd-43f5-bafd-81b169965160", + "metadata": {}, + "outputs": [], + "source": [ + "#%load_ext autoreload\n", + "#%autoreload 2\n", + "\n", + "import os\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_data_validation as tfdv\n", + "from google.cloud import bigquery\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from google.cloud import aiplatform as vertex_ai\n", + "\n", + "import yaml\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3bdeae6-b5c9-479d-8318-627959bf9898", + "metadata": {}, + "outputs": [], + "source": [ + "with open('mainconfig.yaml') as f:\n", + " main_config = yaml.safe_load(f)\n", + "\n", + "# select your config \n", + "main_config = main_config['creditcards']" + ] + }, + { + "cell_type": "markdown", + "id": "f5c0f8ad-ca30-4f5c-a135-809409f58abd", + "metadata": {}, + "source": [ + "### Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b2ad96-7e3a-4980-a8d2-ee78993f0732", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = main_config['project'] \n", + "REGION = main_config['region'] \n", + "DOCKER_REPO = main_config['docker_repo']\n", + "\n", + "SERVICE_ACCOUNT = main_config['service_account']\n", + "\n", + "# BigQuery and data locations\n", + "\n", + "BQ_SOURCE_TABLE= main_config['bq']['source_table'] # raw input\n", + "ML_TABLE = main_config['bq']['ml_table'] # the one we will use for the training\n", + "\n", + "BQ_DATASET_NAME = main_config['bq']['dataset']\n", + "BQ_LOCATION = main_config['bq']['location'] # 
multiregion provides more resilience\n", + "\n", + "VERTEX_DATASET_NAME = main_config['vertex_dataset_name']\n", + "\n", + "RAW_SCHEMA_DIR = main_config['raw_schema_dir']\n", + "\n", + "BUCKET = main_config['bucket']\n", + "\n", + "# TFX and model config\n", + "\n", + "# model version\n", + "VERSION = main_config['version']\n", + "\n", + "\n", + "MODEL_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier-{VERSION}'\n", + "WORKSPACE = f'gs://{BUCKET}/{VERTEX_DATASET_NAME}'\n", + "\n", + "MLMD_SQLLITE = 'mlmd.sqllite'\n", + "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts_interactive')\n", + "MODEL_REGISTRY = os.path.join(WORKSPACE, 'model_registry')\n", + "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", + "PIPELINE_ROOT = os.path.join(ARTIFACT_STORE, PIPELINE_NAME)\n", + "\n", + "ENDPOINT_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier'\n", + "\n", + "FEATURESTORE_ID = main_config['featurestore_id']\n", + "\n", + "CF_REGION = main_config['cloudfunction_region']\n", + "\n", + "DATAFLOW_SUBNETWORK = f\"https://www.googleapis.com/compute/v1/projects/{PROJECT}/regions/{REGION}/subnetworks/{main_config['dataflow']['subnet']}\"\n", + "DATAFLOW_SERVICE_ACCOUNT = main_config['dataflow']['service_account']\n", + "\n", + "CLOUDBUILD_SA = f'projects/{PROJECT}/serviceAccounts/{SERVICE_ACCOUNT}'\n", + "\n", + "LIMIT=main_config['limit']" + ] + }, + { + "cell_type": "markdown", + "id": "274eaeb4-a9db-42f6-a19d-bc9567fa5b37", + "metadata": {}, + "source": [ + "#### Generate pip configuration\n", + "\n", + "Our containers should install packages from our internal Artifact Registry, rather than try to go to the public PyPI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69a752e4-275c-464d-a842-c3feae9e5355", + "metadata": {}, + "outputs": [], + "source": [ + "pip_conf = f'''[global]\n", + "index-url = https://{main_config['artifactregistry_region']}-python.pkg.dev/{PROJECT}/{main_config['python_pkg_repo']}/simple/\n", + "timeout=10\n", + "'''\n", + "\n", + "with open('build/pip.conf', 'w') as f:\n", + " f.write(pip_conf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fda3ab18-7560-4d47-a911-101d21e8bfd7", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Project ID:\", PROJECT)\n", + "print(\"Region:\", REGION)\n", + "\n", + "vertex_ai.init(\n", + " project=PROJECT,\n", + " location=REGION\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb98a921-cba7-46ee-baa7-b0a4461c3b2c", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VERTEX_DATASET_NAME\"] = VERTEX_DATASET_NAME\n", + "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME\n", + "os.environ[\"PIPELINE_NAME\"] = PIPELINE_NAME\n", + "os.environ[\"PROJECT\"] = PROJECT\n", + "os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT\n", + "os.environ[\"REGION\"] = REGION\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}\"\n", + "os.environ[\"MODEL_REGISTRY_URI\"] = os.path.join(os.environ[\"GCS_LOCATION\"], \"model_registry\")\n", + "os.environ[\"TRAIN_LIMIT\"] = \"85000\"\n", + "os.environ[\"TEST_LIMIT\"] = \"15000\"\n", + "os.environ[\"BEAM_RUNNER\"] = \"DataflowRunner\"\n", + "os.environ[\"TRAINING_RUNNER\"] = \"vertex\"\n", + "os.environ[\"TFX_IMAGE_URI\"] = f\"{DOCKER_REPO}/vertex:latest\"\n", + "os.environ[\"ENABLE_CACHE\"] = \"1\"\n", + "os.environ[\"SUBNETWORK\"] = DATAFLOW_SUBNETWORK\n", + "os.environ[\"SERVICE_ACCOUNT\"] = DATAFLOW_SERVICE_ACCOUNT\n", + "os.environ[\"BQ_LOCATION\"] = BQ_LOCATION\n", + 
"os.environ[\"BQ_DATASET_NAME\"] = BQ_DATASET_NAME\n", + "os.environ[\"ML_TABLE\"] = ML_TABLE\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/e2e_tests\"\n", + "os.environ[\"SUBNETWORK\"] = DATAFLOW_SUBNETWORK" + ] + }, + { + "cell_type": "markdown", + "id": "6febb4b2-d832-4526-afde-30266bd725cc", + "metadata": {}, + "source": [ + "# Generate ML data\n", + "\n", + "We add a `ML_use` column for pre-splitting the data, where 80% of the datsa items are set to `UNASSIGNED` while the other 20% is set to `TEST`.\n", + "This column is used during training to split the dataset for training and test.\n", + "\n", + "In the training phase, the `UNASSIGNED` are split into `train` and `eval`. The `TEST` split is will be used for the final model validation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55b9ffe2-d80a-4149-8de7-6a9ef82c85d3", + "metadata": {}, + "outputs": [], + "source": [ + "sql_script = f'''\n", + "CREATE OR REPLACE TABLE `{PROJECT}.{BQ_DATASET_NAME}.{ML_TABLE}` \n", + "AS (\n", + " SELECT\n", + " * EXCEPT(Class),\n", + " CAST(Class AS FLOAT64) as Class,\n", + " IF(ABS(MOD(FARM_FINGERPRINT(Time), 100)) <= 80, 'UNASSIGNED', 'TEST') AS ML_use\n", + " FROM\n", + " `{PROJECT}.{BQ_DATASET_NAME}.{BQ_SOURCE_TABLE}`\n", + ")\n", + "'''\n", + "\n", + "bq_client = bigquery.Client(project=PROJECT, location=BQ_LOCATION)\n", + "job = bq_client.query(sql_script)\n", + "#job.result()" + ] + }, + { + "cell_type": "markdown", + "id": "d74b54f8-9509-465c-919f-8c02c3a1aa6a", + "metadata": {}, + "source": [ + "# Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0391720d-1aff-47fb-a0c6-8e46323f4d79", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client(project=PROJECT) \n", + "\n", + "# I use the ML table here and I exclude the TIME and ML_USE columns, because I will later use this sample data to generate\n", + "# the schema for the training\n", + "sql = f\"SELECT * EXCEPT(time, ml_use) FROM `{PROJECT}.{BQ_DATASET_NAME}.{ML_TABLE}` LIMIT 1000\"\n", + "print(sql)\n", + "\n", + "query_job = client.query(sql, location=BQ_LOCATION)\n", + "sample_data = query_job.result().to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49686501-b357-48f2-853d-1b2672c43f7f", + "metadata": {}, + "outputs": [], + "source": [ + "sample_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a520b19-87a7-482a-9de1-cdc862c12243", + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery counts --project {PROJECT} \n", + "\n", + "SELECT \n", + " Class, count(*) as n\n", + "FROM `creditcards.creditcards`\n", + "GROUP BY Class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "243a9a1d-84b9-4943-ad77-df5b8cf97d59", + "metadata": {}, + "outputs": [], + "source": [ + "counts.plot(kind='bar', x='Class', y='n', logy=True, legend=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "539820cd-f632-477c-a7ce-88cee2ad9340", + "metadata": {}, + "outputs": [], + "source": [ + "sample_data.V4.hist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9223d4a-c6eb-429f-8b17-e80a488733e5", + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery --project {PROJECT}\n", + "\n", + "SELECT ML_use, Class, COUNT(*) as n\n", + "FROM creditcards.creditcards_ml\n", + "GROUP BY ML_use, Class" + ] + }, + { + "cell_type": "markdown", + "id": 
"7b4eea14-66df-4651-8c0e-3b035e7da22c", + "metadata": {}, + "source": [ + "# Generate Schema\n", + "\n", + "\n", + "The [TensorFlow Data Validation (TFDV)](https://www.tensorflow.org/tfx/data_validation/get_started) data schema will be used in:\n", + "1. Identify the raw data types and shapes in the data transformation.\n", + "2. Create the serving input signature for the custom model.\n", + "3. Validate the new raw training data in the TFX pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0633ec5-ab02-43fd-bcce-c32c62311d21", + "metadata": {}, + "outputs": [], + "source": [ + "stats = tfdv.generate_statistics_from_dataframe(\n", + " dataframe=sample_data,\n", + " stats_options=tfdv.StatsOptions(\n", + " label_feature='Class',\n", + " weight_feature=None,\n", + " sample_rate=1,\n", + " num_top_values=50\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80d78215-3700-4065-8e78-6d9b6f1131ea", + "metadata": {}, + "outputs": [], + "source": [ + "tfdv.visualize_statistics(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69d116ce-8fa5-491b-9ea3-00fbcd582ecc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "schema = tfdv.infer_schema(statistics=stats)\n", + "tfdv.display_schema(schema=schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33419848-88f1-4b28-9a53-9c7c723578cc", + "metadata": {}, + "outputs": [], + "source": [ + "raw_schema_location = os.path.join(RAW_SCHEMA_DIR, 'schema.pbtxt')\n", + "tfdv.write_schema_text(schema, raw_schema_location)" + ] + }, + { + "cell_type": "markdown", + "id": "326e9b0f-90d0-4b9c-a1a6-f18131dfcdf8", + "metadata": {}, + "source": [ + "# Create Vertex Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f596319-e9a9-4743-902f-19b5a8694eaa", + "metadata": {}, + "outputs": [], + "source": [ + "bq_uri = f\"bq://{PROJECT}.{BQ_DATASET_NAME}.{ML_TABLE}\"\n", + "\n", + "dataset = vertex_ai.TabularDataset.create(\n", + " display_name=VERTEX_DATASET_NAME, bq_source=bq_uri)\n", + "\n", + "dataset.gca_resource" + ] + }, + { + "cell_type": "markdown", + "id": "c236caba-18d4-4454-ba79-2a2b7bcbad32", + "metadata": {}, + "source": [ + "## Retrieve and inspect the Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6988e8a-51dd-4920-98e0-8902107f5a55", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = vertex_ai.TabularDataset.list(\n", + " filter=f\"display_name={VERTEX_DATASET_NAME}\", \n", + " order_by=\"update_time\")[-1]\n", + "\n", + "print(\"Dataset resource name:\", dataset.resource_name)\n", + "print(\"Dataset BigQuery source:\", dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri'])" + ] + }, + { + "cell_type": "markdown", + "id": "ca5baa2e-0789-4547-92ae-5d8d258b2375", + "metadata": {}, + "source": [ + "# Build the TFX Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d37c85f-410f-4222-9b22-6e5fcbaf04bd", + "metadata": {}, + "outputs": [], + "source": [ + "import tfx.v1 as tfx\n", + "from tfx.extensions.google_cloud_big_query.example_gen.component import BigQueryExampleGen\n", + "from tfx.proto import example_gen_pb2, transform_pb2\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow_transform as tft\n", + "import tensorflow_data_validation as tfdv\n", + "import tensorflow_model_analysis as tfma\n", + "from tensorflow_transform.tf_metadata import schema_utils\n", + "\n", + "\n", + 
"import ml_metadata as mlmd\n", + "from ml_metadata.proto import metadata_store_pb2\n", + "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext\n", + "\n", + "import logging\n", + "import json\n", + "\n", + "from src.common import features, datasource_utils\n", + "from src.model_training import data\n", + "from src.tfx_pipelines import components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11ecfb31-0458-4cfc-9afe-a04f1b044e34", + "metadata": {}, + "outputs": [], + "source": [ + "logging.getLogger().setLevel(logging.ERROR)\n", + "tf.get_logger().setLevel('ERROR')\n", + "\n", + "print(\"TFX Version:\", tfx.__version__)\n", + "print(\"Tensorflow Version:\", tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c5ec6f8-634f-47b4-85ab-756314fa8129", + "metadata": {}, + "outputs": [], + "source": [ + "PARENT = f\"projects/{PROJECT}/locations/{REGION}\"\n", + " \n", + "print(\"Project ID:\", PROJECT)\n", + "print(\"Region:\", REGION)\n", + "print(\"Bucket name:\", BUCKET)\n", + "print(\"Service Account:\", SERVICE_ACCOUNT)\n", + "print(\"Vertex API Parent URI:\", PARENT)" + ] + }, + { + "cell_type": "markdown", + "id": "377f66a3-7e5b-46b3-b8ea-cf38b5fe92d4", + "metadata": {}, + "source": [ + "## Create Interactive TFX Context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2abcd9d2-cc09-4ba5-979e-91696db38d03", + "metadata": {}, + "outputs": [], + "source": [ + "REMOVE_ARTIFACTS = True\n", + "\n", + "if tf.io.gfile.exists(ARTIFACT_STORE) and REMOVE_ARTIFACTS:\n", + " print(\"Removing previous artifacts...\")\n", + " tf.io.gfile.rmtree(ARTIFACT_STORE)\n", + " \n", + "if tf.io.gfile.exists(MLMD_SQLLITE) and REMOVE_ARTIFACTS:\n", + " print(\"Deleting previous mlmd.sqllite...\")\n", + " tf.io.gfile.rmtree(MLMD_SQLLITE)\n", + " \n", + "print(f'Pipeline artifacts directory: {PIPELINE_ROOT}')\n", + "print(f'Local metadata SQLlit path: {MLMD_SQLLITE}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "751e4415-f6d8-48fe-b586-d2f15eafc5de", + "metadata": {}, + "outputs": [], + "source": [ + "connection_config = metadata_store_pb2.ConnectionConfig()\n", + "connection_config.sqlite.filename_uri = MLMD_SQLLITE\n", + "connection_config.sqlite.connection_mode = 3 # READWRITE_OPENCREATE\n", + "mlmd_store = mlmd.metadata_store.MetadataStore(connection_config)\n", + "\n", + "context = InteractiveContext(\n", + " pipeline_name=PIPELINE_NAME,\n", + " pipeline_root=PIPELINE_ROOT,\n", + " metadata_connection_config=connection_config\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b17c3103-8aec-4bfb-87ea-6f4d52f4aa37", + "metadata": {}, + "source": [ + "### Pipeline step 1: Hyperparameter generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fdcfe55-01fe-47fb-ae39-ae02ba2c761d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "batch_size = 512\n", + "\n", + "hyperparams_gen = components.hyperparameters_gen(\n", + " num_epochs=5,\n", + " learning_rate=0.001,\n", + " batch_size=batch_size,\n", + " hidden_units='64,64',\n", + " steps_per_epoch=LIMIT // batch_size\n", + ")\n", + "\n", + "context.run(hyperparams_gen, enable_cache=False)" + ] + }, + { + "cell_type": "markdown", + "id": "abb8af0d-2e85-449b-bab7-1b9da8f17caf", + "metadata": {}, + "source": [ + "#### Load the output of the component from Cloud Storage to check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"637629a3-e72d-40cf-8753-9e0cf7ed997a", + "metadata": {}, + "outputs": [], + "source": [ + "gcs_uri_ouput = hyperparams_gen.outputs['hyperparameters'].get()[0].uri\n", + "gcs_uri_ouput" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "274b6774-220e-4e90-86b4-4b9aec9341a8", + "metadata": {}, + "outputs": [], + "source": [ + "json.load(\n", + " tf.io.gfile.GFile(\n", + " os.path.join(gcs_uri_ouput, 'hyperparameters.json')\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4ae3bd61-080c-4af0-a88d-f58d9485508a", + "metadata": {}, + "source": [ + "### Pipeline Step 2: Extract data from BQ onto Cloud Storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39997b04-a2a3-4490-88de-4e218376de3e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def sql_query(ml_use, limit=None):\n", + " return datasource_utils.get_training_source_query(PROJECT, REGION, VERTEX_DATASET_NAME, ml_use=ml_use, limit=limit)\n", + "\n", + "def output_config(splits):\n", + " return example_gen_pb2.Output(\n", + " split_config=example_gen_pb2.SplitConfig(\n", + " splits=[example_gen_pb2.SplitConfig.Split(name=split_name, hash_buckets=buckets) for (split_name, buckets) in splits]\n", + " )\n", + " )\n", + "\n", + "train_example_gen = BigQueryExampleGen(query=sql_query('UNASSIGNED', LIMIT), output_config=output_config([('train', 4), ('eval', 1)]))\n", + "\n", + "beam_pipeline_args=[\n", + " f\"--project={PROJECT}\",\n", + " f\"--temp_location={os.path.join(WORKSPACE, 'tmp')}\"\n", + "]\n", + "\n", + "context.run(\n", + " train_example_gen,\n", + " beam_pipeline_args=beam_pipeline_args,\n", + " enable_cache=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c83709a-e59f-405e-8ab9-6185feac6c89", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_example_gen = BigQueryExampleGen(query=sql_query('TEST'), output_config=output_config([('test', 1)]))\n", + "\n", + "context.run(\n", + " test_example_gen,\n", + " beam_pipeline_args=beam_pipeline_args,\n", + " enable_cache=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "10f80eb6-13ea-4550-8ecb-069364f4538a", + "metadata": {}, + "source": [ + "#### Read some TFRecords from the training data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d417fc7f-5a9e-4214-b01c-8305942db796", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9053e250-45f5-40b5-8d5b-7469a47c0c4d", + "metadata": {}, + "outputs": [], + "source": [ + "train_uri = os.path.join(train_example_gen.outputs['examples'].get()[0].uri, \"Split-train/*\")\n", + "\n", + "source_raw_schema = tfdv.load_schema_text(os.path.join(RAW_SCHEMA_DIR, 'schema.pbtxt'))\n", + "raw_feature_spec = schema_utils.schema_as_feature_spec(source_raw_schema).feature_spec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fff0ec27-c766-47fa-87f9-a21461029806", + "metadata": {}, + "outputs": [], + "source": [ + "def _parse_tf_example(tfrecord):\n", + " return tf.io.parse_single_example(tfrecord, raw_feature_spec)\n", + "\n", + "tfrecord_filenames = tf.data.Dataset.list_files(train_uri)\n", + "dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type=\"GZIP\")\n", + "dataset = dataset.map(_parse_tf_example)\n", + "\n", + "for raw_features in dataset.shuffle(1000).batch(3).take(1):\n", + " for key in raw_features:\n", + " print(f\"{key}: 
{np.squeeze(raw_features[key], -1)}\")\n", + " print(\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "3d98b8cd-d014-4b7a-b2ca-9f1e3400bcf5", + "metadata": {}, + "source": [ + "### Pipeline step 3: Data Validation\n", + "\n", + "Import the schema, generate statistics and validate the statistics against the schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2146edf3-01a3-4c65-80ba-d76f1604e5ad", + "metadata": {}, + "outputs": [], + "source": [ + "schema_importer = tfx.dsl.Importer(\n", + " source_uri=RAW_SCHEMA_DIR,\n", + " artifact_type=tfx.types.standard_artifacts.Schema,\n", + " reimport=False\n", + ")\n", + "\n", + "context.run(schema_importer)" + ] + }, + { + "cell_type": "markdown", + "id": "1847ecfa-4656-4efd-a234-657c78ec2968", + "metadata": {}, + "source": [ + "Generate statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "385fecfd-d582-4ff1-b0f7-9266f93d91cb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "statistics_gen = tfx.components.StatisticsGen(\n", + " examples=train_example_gen.outputs['examples'])\n", + "context.run(statistics_gen)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52b8b551-e8cf-4dfd-8b0e-75923859687d", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {RAW_SCHEMA_DIR}/.ipynb_checkpoints/" + ] + }, + { + "cell_type": "markdown", + "id": "e1787919-0359-4003-b26a-d39c755b41f2", + "metadata": {}, + "source": [ + "Validate statistics against schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "503235ce-b751-4d9e-9968-af7d86dfa20e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "example_validator = tfx.components.ExampleValidator(\n", + " statistics=statistics_gen.outputs['statistics'],\n", + " schema=schema_importer.outputs['result'],\n", + ")\n", + "\n", + "context.run(example_validator)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bda0fdfc-b146-4224-b55d-1a904e374ac1", + "metadata": {}, + "outputs": [], + "source": [ + "context.show(example_validator.outputs['anomalies'])" + ] + }, + { + "cell_type": "markdown", + "id": "ce1eea1c-4ad0-4ec2-8a0b-155aee16b92e", + "metadata": {}, + "source": [ + "### Pipeline Step 4: Data Preprocesing using TFX Transform (TFT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c67235de-ce78-430b-8214-648e83789b94", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_transform_module_file = 'src/preprocessing/transformations.py'\n", + "\n", + "transform = tfx.components.Transform(\n", + " examples=train_example_gen.outputs['examples'],\n", + " schema=schema_importer.outputs['result'],\n", + " module_file=_transform_module_file,\n", + " splits_config=transform_pb2.SplitsConfig(\n", + " analyze=['train'], transform=['train', 'eval']),\n", + ")\n", + "\n", + "context.run(transform, enable_cache=False)" + ] + }, + { + "cell_type": "markdown", + "id": "7f82f156-ed6f-40d0-bde1-0acf0fb0b1b8", + "metadata": {}, + "source": [ + "#### Test: Read an example of the transformed data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9204ecf-e700-4b5b-81b3-32b18e9eed56", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "transformed_train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, \"Split-train/*\")\n", + "transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri\n", + "\n", + "tft_output = 
tft.TFTransformOutput(transform_graph_uri)\n", + "transform_feature_spec = tft_output.transformed_feature_spec()\n", + "\n", + "for input_features, target in data.get_dataset(\n", + " transformed_train_uri, transform_feature_spec, batch_size=3, epochs=1).take(1):\n", + " for key in input_features:\n", + " print(f\"{key} ({input_features[key].dtype}): {input_features[key].numpy().tolist()}\")\n", + " print(f\"target: {target.numpy().tolist()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "43191d8d-1689-4812-a07c-4811306ac112", + "metadata": {}, + "source": [ + "### Pipeline Step 5: Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f7fb3d7-a386-4b5f-aa55-c23b94a305b0", + "metadata": {}, + "outputs": [], + "source": [ + "from tfx.dsl.components.common.resolver import Resolver\n", + "from tfx.dsl.experimental import latest_blessed_model_resolver" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa33f320-fe92-4ee7-99ac-9486b0c9510e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_train_module_file = 'src/model_training/runner.py'\n", + "\n", + "trainer = tfx.components.Trainer(\n", + " module_file=_train_module_file,\n", + " examples=transform.outputs['transformed_examples'],\n", + " schema=schema_importer.outputs['result'],\n", + " transform_graph=transform.outputs['transform_graph'],\n", + " hyperparameters=hyperparams_gen.outputs['hyperparameters'],\n", + ")\n", + "\n", + "context.run(trainer, enable_cache=False)" + ] + }, + { + "cell_type": "markdown", + "id": "cce6e6f9-6ca1-48cc-8b72-7a0e8e37c856", + "metadata": {}, + "source": [ + "### Pipeline Step 6: Model Evaluation\n", + "\n", + "#### Get the latest blessed model for model validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b543c1d7-37f1-49f6-b29d-518b4ff3f1d0", + "metadata": {}, + "outputs": [], + "source": [ + "blessed_model_resolver = Resolver(\n", + " strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,\n", + " model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),\n", + " model_blessing=tfx.dsl.Channel(type=tfx.types.standard_artifacts.ModelBlessing)\n", + ")\n", + "\n", + "context.run(blessed_model_resolver, enable_cache=False)" + ] + }, + { + "cell_type": "markdown", + "id": "64604298-be89-4c3e-9677-c292fdfadb43", + "metadata": {}, + "source": [ + "#### Evaluate the model and compare against the baseline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2085e54a-1a15-437e-bd70-9146d0ecebd9", + "metadata": {}, + "outputs": [], + "source": [ + "from tfx.components import Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b9c9ac9-46f3-45ed-8750-f18fd15ab106", + "metadata": {}, + "outputs": [], + "source": [ + "eval_config = tfma.EvalConfig(\n", + " model_specs=[\n", + " tfma.ModelSpec(\n", + " signature_name='serving_tf_example',\n", + " label_key=features.TARGET_FEATURE_NAME,\n", + " prediction_key='probabilities')\n", + " ],\n", + " slicing_specs=[\n", + " tfma.SlicingSpec(),\n", + " ],\n", + " metrics_specs=[\n", + " tfma.MetricsSpec(\n", + " metrics=[ \n", + " tfma.MetricConfig(class_name='ExampleCount'),\n", + " tfma.MetricConfig(\n", + " class_name='BinaryAccuracy',\n", + " threshold=tfma.MetricThreshold(\n", + " value_threshold=tfma.GenericValueThreshold(\n", + " lower_bound={'value': 0.1}), ## note setting a very low barrier for this example\n", + " # Change threshold will be ignored if there is no\n", + " # baseline 
model resolved from MLMD (first run).\n", + " change_threshold=tfma.GenericChangeThreshold(\n", + " direction=tfma.MetricDirection.HIGHER_IS_BETTER,\n", + " absolute={'value': -1e-10}))),\n", + " ])\n", + " ])\n", + "\n", + "\n", + "evaluator = Evaluator(\n", + " examples=test_example_gen.outputs['examples'],\n", + " example_splits=['test'],\n", + " model=trainer.outputs['model'],\n", + " baseline_model=blessed_model_resolver.outputs['model'],\n", + " eval_config=eval_config,\n", + " schema=schema_importer.outputs['result']\n", + ")\n", + "\n", + "context.run(evaluator, enable_cache=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d284e70-2185-4b71-8f23-cfaef4ec52d6", + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_results = evaluator.outputs['evaluation'].get()[0].uri\n", + "print(\"validation_ok:\", tfma.load_validation_result(evaluation_results).validation_ok, '\\n')\n", + "\n", + "for entry in list(tfma.load_metrics(evaluation_results))[0].metric_keys_and_values:\n", + " value = entry.value.double_value.value\n", + " if value:\n", + " print(entry.key.name, \":\", round(entry.value.double_value.value, 3))" + ] + }, + { + "cell_type": "markdown", + "id": "975e2cfd-8524-4c25-88a7-fbee8b215352", + "metadata": {}, + "source": [ + "### Pipeline Step 7: Push model to Cloud Storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "073bfac2-ee38-4b05-9642-1e29fbdcc7ea", + "metadata": {}, + "outputs": [], + "source": [ + "exported_model_location = os.path.join(MODEL_REGISTRY, MODEL_DISPLAY_NAME)\n", + "\n", + "push_destination=tfx.proto.PushDestination(\n", + " filesystem=tfx.proto.PushDestination.Filesystem(\n", + " base_directory=exported_model_location,\n", + " )\n", + ")\n", + "\n", + "pusher = tfx.components.Pusher(\n", + " model=trainer.outputs['model'],\n", + " model_blessing=evaluator.outputs['blessing'],\n", + " push_destination=push_destination\n", + ")\n", + "\n", + "context.run(pusher, enable_cache=False)" + ] + }, + { + "cell_type": "markdown", + "id": "c6e8dbe5-f3fa-4ce6-8057-d19a672350e2", + "metadata": {}, + "source": [ + "### Pipeline Step 8: Upload model to Vertex AI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d8c0686-f971-4b60-9db1-6036052c2098", + "metadata": {}, + "outputs": [], + "source": [ + "serving_runtime = 'tf2-cpu.2-5'\n", + "serving_image_uri = f\"europe-docker.pkg.dev/vertex-ai/prediction/{serving_runtime}:latest\"\n", + "\n", + "labels = {\n", + " 'dataset_name': VERTEX_DATASET_NAME,\n", + " 'pipeline_name': PIPELINE_NAME\n", + "}\n", + "labels = json.dumps(labels)\n", + "\n", + "vertex_model_uploader = components.vertex_model_uploader(\n", + " project=PROJECT,\n", + " region=REGION,\n", + " model_display_name=MODEL_DISPLAY_NAME,\n", + " pushed_model_location=exported_model_location,\n", + " serving_image_uri=serving_image_uri,\n", + " model_blessing=evaluator.outputs['blessing'],\n", + " explanation_config='',\n", + " labels=labels\n", + ")\n", + "\n", + "context.run(vertex_model_uploader, enable_cache=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba3923d4-0048-482c-96f7-3b52782355df", + "metadata": {}, + "outputs": [], + "source": [ + "vertex_model_uploader.outputs['uploaded_model'].get()[0].uri" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "tf2-gpu.2-8.m93", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m93" + }, + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vertex_mlops_enterprise/02-cicd.ipynb b/examples/vertex_mlops_enterprise/02-cicd.ipynb new file mode 100644 index 0000000000..ac25f1539b --- /dev/null +++ b/examples/vertex_mlops_enterprise/02-cicd.ipynb @@ -0,0 +1,1317 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "f2e51e53", + "metadata": {}, + "source": [ + "Copyright 2023 Google LLC\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "you may not use this file except in compliance with the License.\n", + "You may obtain a copy of the License at\n", + "\n", + " http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software\n", + "distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "See the License for the specific language governing permissions and\n", + "limitations under the License." + ] + }, + { + "cell_type": "markdown", + "id": "9d0817cd-9e64-4d5e-9c66-4e0961aa1085", + "metadata": { + "tags": [] + }, + "source": [ + "# MLOps End to End Workflow (II)\n", + "\n", + "Implementation of an end-to-end ML Ops workflow for the use case to detect fraudulent credit card transactions, see [Kaggle dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud).\n", + "\n", + "This set of notebooks cover:\n", + "\n", + "[Experimentation](01-experimentation.ipynb):\n", + "1. Set up: Creation of the Vertex Dataset, extraction of the schema\n", + "2. Implementation of a TFX pipeline\n", + "\n", + "[CICD](02-cicd.ipynb):\n", + "\n", + "3. Deployment of the Vertex AI Pipeline through a CI/CD process\n", + "4. Deployment of a Continuous Training pipeline that can be triggered via Pub/Sub and produces a model in the Model Registry\n", + "5. Deployment of the Inference Pipeline consisting of a Cloud Function that retrieves features from Feature Store and calls the model on a Vertex AI Endpoint\n", + "6. Deployment of the model to a Vertex AI Endpoint through a CI/CD process.\n", + "\n", + "[Prediction](03-prediction.ipynb):\n", + "\n", + "7. Deploy the model to an endpoint\n", + "8. 
Create a test prediction\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5c0f8ad-ca30-4f5c-a135-809409f58abd", + "metadata": { + "tags": [] + }, + "source": [ + "### Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b997d6a-f8fd-43f5-bafd-81b169965160", + "metadata": {}, + "outputs": [], + "source": [ + "#%load_ext autoreload\n", + "#%autoreload 2\n", + "\n", + "import os\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_data_validation as tfdv\n", + "from google.cloud import bigquery\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from google.cloud import aiplatform as vertex_ai\n", + "\n", + "import yaml\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3bdeae6-b5c9-479d-8318-627959bf9898", + "metadata": {}, + "outputs": [], + "source": [ + "with open('mainconfig.yaml') as f:\n", + " main_config = yaml.safe_load(f)\n", + "\n", + "# select your config \n", + "main_config = main_config['creditcards']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b2ad96-7e3a-4980-a8d2-ee78993f0732", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = main_config['project'] \n", + "REGION = main_config['region'] \n", + "DOCKER_REPO = main_config['docker_repo']\n", + "\n", + "SERVICE_ACCOUNT = main_config['service_account']\n", + "\n", + "# BigQuery and data locations\n", + "\n", + "BQ_SOURCE_TABLE= main_config['bq']['source_table'] # raw input\n", + "ML_TABLE = main_config['bq']['ml_table'] # the one we will use for the training\n", + "\n", + "BQ_DATASET_NAME = main_config['bq']['dataset']\n", + "BQ_LOCATION = main_config['bq']['location'] # multiregion provides more resilience\n", + "\n", + "VERTEX_DATASET_NAME = main_config['vertex_dataset_name']\n", + "\n", + "RAW_SCHEMA_DIR = main_config['raw_schema_dir']\n", + "\n", + "BUCKET = main_config['bucket']\n", + "\n", + "# TFX and model config\n", + "\n", + "# model version\n", + "VERSION = main_config['version']\n", + "\n", + "\n", + "MODEL_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier-{VERSION}'\n", + "WORKSPACE = f'gs://{BUCKET}/{VERTEX_DATASET_NAME}'\n", + "\n", + "MLMD_SQLLITE = 'mlmd.sqllite'\n", + "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts_interactive')\n", + "MODEL_REGISTRY = os.path.join(WORKSPACE, 'model_registry')\n", + "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", + "PIPELINE_ROOT = os.path.join(ARTIFACT_STORE, PIPELINE_NAME)\n", + "\n", + "ENDPOINT_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier'\n", + "\n", + "FEATURESTORE_ID = main_config['featurestore_id']\n", + "\n", + "CF_REGION = main_config['cloudfunction_region']\n", + "\n", + "DATAFLOW_SUBNETWORK = f\"https://www.googleapis.com/compute/v1/projects/{PROJECT}/regions/{REGION}/subnetworks/{main_config['dataflow']['subnet']}\"\n", + "DATAFLOW_SERVICE_ACCOUNT = main_config['dataflow']['service_account']\n", + "\n", + "CLOUDBUILD_SA = f'projects/{PROJECT}/serviceAccounts/{SERVICE_ACCOUNT}'\n", + "\n", + "LIMIT=main_config['limit']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fda3ab18-7560-4d47-a911-101d21e8bfd7", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Project ID:\", PROJECT)\n", + "print(\"Region:\", REGION)\n", + "print(\"Service Account:\", SERVICE_ACCOUNT)\n", + "\n", + "vertex_ai.init(\n", + " project=PROJECT,\n", + " location=REGION\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a68df038-006c-4667-ac74-0123c4789109", + "metadata": { + 
"tags": [] + }, + "source": [ + "## Unit Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb98a921-cba7-46ee-baa7-b0a4461c3b2c", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VERTEX_DATASET_NAME\"] = VERTEX_DATASET_NAME\n", + "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME\n", + "os.environ[\"PIPELINE_NAME\"] = PIPELINE_NAME\n", + "os.environ[\"PROJECT\"] = PROJECT\n", + "os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT\n", + "os.environ[\"REGION\"] = REGION\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}\"\n", + "os.environ[\"MODEL_REGISTRY_URI\"] = os.path.join(os.environ[\"GCS_LOCATION\"], \"model_registry\")\n", + "os.environ[\"TRAIN_LIMIT\"] = \"85000\"\n", + "os.environ[\"TEST_LIMIT\"] = \"15000\"\n", + "os.environ[\"BEAM_RUNNER\"] = \"DataflowRunner\"\n", + "os.environ[\"TRAINING_RUNNER\"] = \"vertex\"\n", + "os.environ[\"DATAFLOW_IMAGE_URI\"] = f\"{DOCKER_REPO}/dataflow:latest\"\n", + "os.environ[\"TFX_IMAGE_URI\"] = f\"{DOCKER_REPO}/vertex:latest\"\n", + "os.environ[\"ENABLE_CACHE\"] = \"1\"\n", + "os.environ[\"SUBNETWORK\"] = DATAFLOW_SUBNETWORK\n", + "os.environ[\"SERVICE_ACCOUNT\"] = DATAFLOW_SERVICE_ACCOUNT\n", + "os.environ[\"BQ_LOCATION\"] = BQ_LOCATION\n", + "os.environ[\"BQ_DATASET_NAME\"] = BQ_DATASET_NAME\n", + "os.environ[\"ML_TABLE\"] = ML_TABLE\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/e2e_tests\"\n", + "os.environ[\"SUBNETWORK\"] = DATAFLOW_SUBNETWORK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d64749c-b1c6-49d6-b8fa-3a18e5bddc01", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"UPLOAD_MODEL\"] = \"0\"\n", + "os.environ[\"ACCURACY_THRESHOLD\"] = \"-0.1\" # NB Negative accuracy threshold makes no sense - allows everything\n", + "os.environ[\"BEAM_RUNNER\"] = \"DirectRunner\"\n", + "os.environ[\"TRAINING_RUNNER\"] = \"local\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d3b8465-c78c-4989-b920-3acd536c3e1c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from src.tfx_pipelines import config\n", + "import importlib\n", + "importlib.reload(config)\n", + "\n", + "for key, value in config.__dict__.items():\n", + " if key.isupper(): print(f'{key}: {value}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70c02a7f-8545-42aa-b55a-1b864ccefed8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python -m pytest src/tests/datasource_utils_tests.py -s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5977100-f70d-4d75-bb8b-c88340d378a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python -m pytest src/tests/model_tests.py -s" + ] + }, + { + "cell_type": "markdown", + "id": "289e973b-ed67-4933-8b83-23e514bb244c", + "metadata": {}, + "source": [ + "#### End to end pipeline unit test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad9e768-4c25-4765-aeac-9e47f6c6cc70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python -m pytest src/tests/pipeline_deployment_tests.py::test_e2e_pipeline -s" + ] + }, + { + "cell_type": "markdown", + "id": "df5cf4c2-a05d-41ce-af1a-cb7c97f0dfdf", + "metadata": { + "tags": [] + }, + "source": [ + "## Deploy to Vertex AI Pipelines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90774f24-3a81-4b6b-89ef-8dcd08387fdb", + "metadata": {}, + "outputs": [], + "source": [ + 
"config.BEAM_RUNNER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a271d4e3-d45d-487f-972d-0b4e06db7758", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from src.tfx_pipelines import config\n", + "import importlib\n", + "\n", + "importlib.reload(config)\n", + "\n", + "for key, value in config.__dict__.items():\n", + " if key.isupper(): print(f'{key}: {value}')" + ] + }, + { + "cell_type": "markdown", + "id": "7510db04-eb3b-4567-a81c-9a349a58e89e", + "metadata": {}, + "source": [ + "### Create Repo for Images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4d086a8-cd73-403f-92ed-6cd8d68321d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Repo should has been created in the Terraform automation stage\n", + "#! gcloud artifacts repositories create {VERTEX_DATASET_NAME} --location={REGION} --repository-format=docker" + ] + }, + { + "cell_type": "markdown", + "id": "3a37f0cc-42ff-486c-92fd-ad2628348722", + "metadata": {}, + "source": [ + "### Build Dataflow Worker Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05123e22-d8f8-4cfd-a743-fe8c6e4925d9", + "metadata": {}, + "outputs": [], + "source": [ + "# You can also use build/Dockerfile.dataflow in case Internet access is not allowed\n", + "!cp build/Dockerfile.dataflow_internet Dockerfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc17291e-d63f-4e13-bbd7-5de2510c87f1", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"DOCKER_REPO\"] = f\"{DOCKER_REPO}/vertex:latest\"\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $DOCKER_REPO/dataflow:latest . --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "markdown", + "id": "de97d9ea-5a42-42a1-ba4c-7421e2acc052", + "metadata": {}, + "source": [ + "### Build Vertex worker image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2595e5e8-47bc-4f6d-8a39-b09d92588be2", + "metadata": {}, + "outputs": [], + "source": [ + "!echo $TFX_IMAGE_URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12aff000-b7b8-4f1a-a2b7-2d4f56343a9e", + "metadata": {}, + "outputs": [], + "source": [ + "!echo $PYTHONPATH" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a12ebc0c-03ed-46eb-9610-9f4c24f60cce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!cp build/Dockerfile.vertex_internet Dockerfile\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $TFX_IMAGE_URI . 
--timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e73afd4-c486-48b2-ba98-fe5d5bd56ceb", + "metadata": {}, + "outputs": [], + "source": [ + "PIPELINES_STORE = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/compiled_pipelines/\"\n", + "!gsutil cp {pipeline_definition_file} {PIPELINES_STORE}\n", + "PIPELINES_STORE" + ] + }, + { + "cell_type": "markdown", + "id": "a23bbe44-a6c8-4efa-b67f-d8c0ad3a1427", + "metadata": {}, + "source": [ + "### Compile the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e876c9eb-6f73-4ea9-a85c-0172c728c1d3", + "metadata": {}, + "outputs": [], + "source": [ + "from src.tfx_pipelines import config, runner\n", + "\n", + "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", + "pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)" + ] + }, + { + "cell_type": "markdown", + "id": "3df41e2b-e420-40c7-a08f-3b02a9667522", + "metadata": {}, + "source": [ + "### Submit Vertex AI Pipelines run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8516b764-1143-4b9e-9ae7-af9340146ec3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from google.cloud.aiplatform import pipeline_jobs\n", + " \n", + "job = pipeline_jobs.PipelineJob(template_path = pipeline_definition_file,\n", + " display_name=VERTEX_DATASET_NAME,\n", + " #enable_caching=False,\n", + " parameter_values={\n", + " 'learning_rate': 0.003,\n", + " 'batch_size': 512,\n", + " 'steps_per_epoch': int(config.TRAIN_LIMIT) // 512,\n", + " 'hidden_units': '128,128',\n", + " 'num_epochs': 30,\n", + " })\n", + "\n", + "job.run(sync=False, service_account=DATAFLOW_SERVICE_ACCOUNT)" + ] + }, + { + "cell_type": "markdown", + "id": "e826bd3e-7bcc-4240-90b9-9b65ba36cc4a", + "metadata": { + "tags": [] + }, + "source": [ + "## Deploy Continuous Training Pipeline (\"CI/CD\")\n", + "\n", + "* Deploy Pub/Sub topic to listen to retraining triggers\n", + "* Deploy Cloud Function to listen to Pub/Sub topic and trigger Vertex AI Pipeline\n", + "* Deploy Vertex AI Pipeline\n", + "\n", + "### Build CI/CD image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e504cc07-22ee-45ba-9416-73cd22251c01", + "metadata": {}, + "outputs": [], + "source": [ + "CICD_IMAGE_URI = f\"{DOCKER_REPO}/cicd:latest\"\n", + "CICD_IMAGE_URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a38e3ffc-fd4e-4934-b2b2-995070ae2076", + "metadata": {}, + "outputs": [], + "source": [ + "# For the CICD container, we are just adding the build/* dir\n", + "!cp build/Dockerfile.cicd_internet build/Dockerfile\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $CICD_IMAGE_URI build/. --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "markdown", + "id": "8cf9512b-323e-474c-83d4-b9f595851934", + "metadata": {}, + "source": [ + "### Automate the deployment of the Training Pipeline using Cloud Build\n", + "***Important*** you should commit the code to the git repo since the next build process will checkout the code from the repo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "370f9daf-7602-48d7-92b4-0f265ba9bee9", + "metadata": {}, + "outputs": [], + "source": [ + "REPO_URL = main_config['git']['repo_url']\n", + "BRANCH = main_config['git']['branch']\n", + "\n", + "\n", + "GCS_LOCATION = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/\"\n", + "TEST_GCS_LOCATION = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/e2e_tests\"\n", + "CI_TRAIN_LIMIT = 1000\n", + "CI_TEST_LIMIT = 100\n", + "CI_UPLOAD_MODEL = 0\n", + "CI_ACCURACY_THRESHOLD = -0.1 # again setting accuracy threshold to negative\n", + "BEAM_RUNNER = \"DataflowRunner\"\n", + "TRAINING_RUNNER = \"vertex\"\n", + "VERSION = 'latest'\n", + "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", + "PIPELINES_STORE = f\"{GCS_LOCATION}compiled_pipelines/\"\n", + "\n", + "TFX_IMAGE_URI = f\"{DOCKER_REPO}/vertex:{VERSION}\"\n", + "DATAFLOW_IMAGE_URI = f\"{DOCKER_REPO}/dataflow:latest\"\n", + "\n", + "REPO_NAME = REPO_URL.split('/')[-1]\n", + "DESCR=f'\"Deploy train pipeline to GCS from {BRANCH}\"'\n", + "\n", + "\n", + "SUBSTITUTIONS=f\"\"\"\\\n", + "_REPO_URL='{REPO_URL}',\\\n", + "_BRANCH={BRANCH},\\\n", + "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", + "_PROJECT={PROJECT},\\\n", + "_REGION={REGION},\\\n", + "_GCS_LOCATION={GCS_LOCATION},\\\n", + "_TEST_GCS_LOCATION={TEST_GCS_LOCATION},\\\n", + "_BQ_LOCATION={BQ_LOCATION},\\\n", + "_BQ_DATASET_NAME={BQ_DATASET_NAME},\\\n", + "_ML_TABLE={ML_TABLE},\\\n", + "_VERTEX_DATASET_NAME={VERTEX_DATASET_NAME},\\\n", + "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", + "_CI_TRAIN_LIMIT={CI_TRAIN_LIMIT},\\\n", + "_CI_TEST_LIMIT={CI_TEST_LIMIT},\\\n", + "_CI_UPLOAD_MODEL={CI_UPLOAD_MODEL},\\\n", + "_CI_ACCURACY_THRESHOLD={CI_ACCURACY_THRESHOLD},\\\n", + "_BEAM_RUNNER={BEAM_RUNNER},\\\n", + "_TRAINING_RUNNER={TRAINING_RUNNER},\\\n", + "_DATAFLOW_IMAGE_URI={DATAFLOW_IMAGE_URI},\\\n", + "_TFX_IMAGE_URI={TFX_IMAGE_URI},\\\n", + "_PIPELINE_NAME={PIPELINE_NAME},\\\n", + "_PIPELINES_STORE={PIPELINES_STORE},\\\n", + "_SUBNETWORK={DATAFLOW_SUBNETWORK},\\\n", + "_GCS_BUCKET={BUCKET}/cloudbuild,\\\n", + "_SERVICE_ACCOUNT={DATAFLOW_SERVICE_ACCOUNT},\\\n", + "_WORKDIR={REPO_NAME}\\\n", + "\"\"\"\n", + "!echo $SUBSTITUTIONS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7856ce61-fe1a-4db4-9c71-fed47129ef7d", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud builds submit build/known_hosts.github.zip --config build/pipeline-deployment.yaml --substitutions {SUBSTITUTIONS} --project=$PROJECT --billing-project=$PROJECT --region $REGION --suppress-logs" + ] + }, + { + "cell_type": "markdown", + "id": "236e622d-0708-404a-b4d5-63972ea6b441", + "metadata": {}, + "source": [ + "### (Optional for Cloud Sources Repositories) Define the trigger that will deploy the pipeline after a commit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928797f7-20cf-4769-9a35-24ad67900125", + "metadata": {}, + "outputs": [], + "source": [ + "!echo gcloud beta builds triggers create cloud-source-repositories --repo={REPO_NAME} --branch-pattern=^{BRANCH}$ --description={DESCR} --build-config=mlops-creditcard/build/pipeline-deployment.yaml --substitutions={SUBSTITUTIONS} --billing-project={PROJECT} --service-account={TRIGGER_SA}" + ] + }, + { + "cell_type": "markdown", + "id": "99667732-3f1b-4c95-86e1-8001dff672dd", + "metadata": {}, + "source": [ + "### Set up the trigger for the Training Pipeline\n", + "\n", + "* Deploy Pub/Sub Topic\n", + "* Deploy Cloud Function that listens to the topic and triggers the 
pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f56b2a-e55f-4b54-b90a-6335fce69314", + "metadata": {}, + "outputs": [], + "source": [ + "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n", + "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'\n", + "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')" + ] + }, + { + "cell_type": "markdown", + "id": "ee8e210a-cca3-41e5-9083-ff3a395b9c1a", + "metadata": {}, + "source": [ + "#### Create Pub/Sub Topic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a03ef492-ee1c-451d-a1e2-c5c511e00326", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud pubsub topics create {PUBSUB_TOPIC}" + ] + }, + { + "cell_type": "markdown", + "id": "547456c0-c55f-492e-97e6-f529182d3a13", + "metadata": {}, + "source": [ + "#### Deploy Cloud Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d094091-3332-4c4c-9bb6-f6e5839d7ded", + "metadata": {}, + "outputs": [], + "source": [ + "ENV_VARS=f\"\"\"\\\n", + "PROJECT={PROJECT},\\\n", + "REGION={REGION},\\\n", + "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION},\\\n", + "SERVICE_ACCOUNT={SERVICE_ACCOUNT},\\\n", + "PIPELINE_NAME={PIPELINE_NAME}\n", + "\"\"\"\n", + "\n", + "!echo {ENV_VARS}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3da074b8-106c-42bf-b956-e59b249bf5f2", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf src/pipeline_triggering/.ipynb_checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a695eef-bf18-4fce-b44c-2ac00ae46675", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud functions deploy {CLOUD_FUNCTION_NAME} --gen2 \\\n", + " --region={CF_REGION} \\\n", + " --trigger-topic={PUBSUB_TOPIC} \\\n", + " --runtime=python38 \\\n", + " --source=src/pipeline_triggering\\\n", + " --entry-point=trigger_pipeline\\\n", + " --stage-bucket={BUCKET}\\\n", + " --ingress-settings=internal-only\\\n", + " --service-account={SERVICE_ACCOUNT}\\\n", + " --update-env-vars={ENV_VARS} " + ] + }, + { + "cell_type": "markdown", + "id": "50ad5728-1b82-4979-bf16-fc518f8a9ba9", + "metadata": {}, + "source": [ + "#### Test triggering the pipeline with a Pub/Sub message" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead71b7d-fc63-4a80-8f52-e40f3612d67a", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import pubsub\n", + "import json\n", + "\n", + "publish_client = pubsub.PublisherClient()\n", + "topic = f'projects/{PROJECT}/topics/{PUBSUB_TOPIC}'\n", + "data = {\n", + " 'num_epochs': 7,\n", + " 'learning_rate': 0.0015,\n", + " 'batch_size': 512,\n", + " 'steps_per_epoch': int(config.TRAIN_LIMIT) // 512,\n", + " 'hidden_units': '256,126'\n", + "}\n", + "message = json.dumps(data)\n", + "\n", + "_ = publish_client.publish(topic, message.encode())" + ] + }, + { + "cell_type": "markdown", + "id": "65a666dc-d06c-4442-875e-86d382e3269f", + "metadata": {}, + "source": [ + "Check the console to see that it's running.\n", + "\n", + "We now have:\n", + "\n", + "* A pipeline that can be run to test and deploy new training pipelines\n", + "* A triggering mechanism to programmatically trigger new training runs\n", + "* A training run finish with a new model in the Vertex AI Model Registry\n" + ] + }, + { + "cell_type": "markdown", + "id": "99ff68dd-b3c4-4633-88fb-5997636d15ec", + "metadata": {}, + "source": [ + "### Preparation\n", + "\n", + "#### Vertex AI Endpoint" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "f1793acf-21df-495f-b37b-99a54ec38cdd", + "metadata": {}, + "outputs": [], + "source": [ + "ENDPOINT_DISPLAY_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1088589e-d24a-47a8-9fe4-7a4135a8a5b4", + "metadata": {}, + "outputs": [], + "source": [ + "from build.utils import create_endpoint\n", + "\n", + "endpoint = create_endpoint(PROJECT, REGION, ENDPOINT_DISPLAY_NAME)\n", + "endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ff735bf-43a6-4fe1-b3c1-055b21860aa9", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud.aiplatform_v1beta1 import FeaturestoreOnlineServingServiceClient, FeaturestoreServiceClient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e76b8fe-3b31-4722-b95a-3f9330e91530", + "metadata": {}, + "outputs": [], + "source": [ + "from feature_store import feature_store as fs" + ] + }, + { + "cell_type": "markdown", + "id": "02f89f3a-9456-4f5c-af84-256fd51e3cb6", + "metadata": {}, + "source": [ + "Create Feature Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad2103aa-5845-4f51-a982-942b491b3588", + "metadata": {}, + "outputs": [], + "source": [ + "fs.create_fs(PROJECT, REGION, FEATURESTORE_ID, \"Feature Store for credit card use case\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c531af74-593a-4439-9c5a-5a45d320d951", + "metadata": {}, + "outputs": [], + "source": [ + "from google.api_core import operations_v1\n", + "from google.cloud.aiplatform_v1beta1 import FeaturestoreOnlineServingServiceClient, FeaturestoreServiceClient, FeatureSelector\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_online_service as featurestore_online_service_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import entity_type as entity_type_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import feature as feature_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore as featurestore_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_service as featurestore_service_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import io as io_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import ListFeaturestoresRequest, CreateFeaturestoreRequest, Featurestore, ListEntityTypesRequest\n", + "\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_monitoring as featurestore_monitoring_pb2\n", + "from google.protobuf.duration_pb2 import Duration\n", + "\n", + "\n", + "API_ENDPOINT = f\"{REGION}-aiplatform.googleapis.com\" \n", + "admin_client = FeaturestoreServiceClient(client_options={\"api_endpoint\": API_ENDPOINT})\n", + "parent = f'{admin_client.common_location_path(PROJECT, REGION)}/featurestores/{FEATURESTORE_ID}'\n", + "request = ListEntityTypesRequest(parent=parent)\n", + "\n", + "# Make the request\n", + "page_result = admin_client.list_entity_types(request=request)\n", + "\n", + "# Handle the response\n", + "[x.name.split('/')[-1] for x in page_result]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813e1109-23c9-4df1-9dcd-49579abd4637", + "metadata": {}, + "outputs": [], + "source": [ + "admin_client.featurestore_path(PROJECT, REGION, FEATURESTORE_ID)" + ] + }, + { + "cell_type": "markdown", + "id": "857b2803-8136-4aae-9d32-370daf566826", + "metadata": {}, + "source": [ + "#### Create an entity with features, generate some 
data and upload it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c5e3a9f-1c49-452b-b41d-54b9c766fac4", + "metadata": {}, + "outputs": [], + "source": [ + "entity = 'user'\n", + "entity_descr = 'User ID'\n", + "features = ['v27', 'v28']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "138a0401-bf30-4433-b6ce-3ce7908bfb63", + "metadata": {}, + "outputs": [], + "source": [ + "fs.create_entity(PROJECT, REGION, FEATURESTORE_ID, entity, entity_descr, features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b258871-b60a-4841-9d87-974e6b7591d2", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "filename = f'features_{entity}.csv'\n", + "\n", + "with open(filename, 'w') as f:\n", + " line = f'{entity},{\",\".join(features)}\\n'\n", + " f.write(line)\n", + " for i in range(100):\n", + " f.write(f'user{i},{random.random()},{random.random()}\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c643c891-971f-4fcc-aa32-608881e3ff9a", + "metadata": {}, + "outputs": [], + "source": [ + "!tail -20 {filename}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3245990d-362c-48bd-a006-a39ba04225f2", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3660a3c-a615-4c09-b9a9-329bc56f7be1", + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil cp {filename} gs://{BUCKET}/{filename} " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "534dbb9a-f3d8-41f4-a6a7-d78dcc615ecc", + "metadata": {}, + "outputs": [], + "source": [ + "gcs_uris = [f'gs://{BUCKET}/{filename}']\n", + "\n", + "fs.ingest_entities_csv(PROJECT, REGION, FEATURESTORE_ID, entity, features, gcs_uris)" + ] + }, + { + "cell_type": "markdown", + "id": "920fd50e-7a4c-475d-9834-eafad695bacb", + "metadata": {}, + "source": [ + "Test reading some features back" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "481ff3e3-cb67-476c-9e4b-a85fcfe1cf42", + "metadata": {}, + "outputs": [], + "source": [ + "features_data = {}\n", + "for i in range(90,102):\n", + " entity_id = f'user{i}'\n", + " features_data[entity_id] = fs.read_features(PROJECT, REGION, FEATURESTORE_ID, entity, features, entity_id)\n", + "\n", + "features_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a394b2a9-aecd-4ec4-b0b7-f7ab8d2a5c67", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "073e0efb-5ae3-4bda-b1cd-d6cc08f9b64c", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pytest src/tests/model_deployment_tests.py::test_model_artifact -s" + ] + }, + { + "cell_type": "markdown", + "id": "0c5a1d30-8dad-4f0a-854b-4de67c548d54", + "metadata": {}, + "source": [ + "#### Deploy Model to Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "697f0915-7257-41e0-bf33-2511671a439e", + "metadata": {}, + "outputs": [], + "source": [ + "!python build/utils.py \\\n", + " --mode=deploy-model\\\n", + " --project={PROJECT}\\\n", + " --region={REGION}\\\n", + " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n", + " --model-display-name={MODEL_DISPLAY_NAME}" + ] + }, + { + "cell_type": "markdown", + "id": "a402a585-5d45-4d9d-9497-cef77effb8a3", + "metadata": {}, + "source": [ + "#### Test model on Endpoint" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "062b7c0f-9a18-44cc-800e-a203903df6dd", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pytest src/tests/model_deployment_tests.py::test_model_endpoint" + ] + }, + { + "cell_type": "markdown", + "id": "3a23c981-a4d2-4396-aa1f-3785244a81ca", + "metadata": {}, + "source": [ + "#### Run the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01a0a985-2955-4c33-9cbc-09fa630a1ca2", + "metadata": {}, + "outputs": [], + "source": [ + "REPO_URL = main_config['git']['repo_url']\n", + "BRANCH = main_config['git']['branch']\n", + "\n", + "f'{REPO_URL}:{BRANCH}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6453e104-774d-475d-8159-328ea6ae0249", + "metadata": {}, + "outputs": [], + "source": [ + "SUBSTITUTIONS=f\"\"\"\\\n", + "_REPO_URL='{REPO_URL}',\\\n", + "_BRANCH={BRANCH},\\\n", + "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", + "_PROJECT={PROJECT},\\\n", + "_REGION={REGION},\\\n", + "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", + "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n", + "_GCS_BUCKET={BUCKET}/cloudbuild,\\\n", + "_SERVICE_ACCOUNT={SERVICE_ACCOUNT},\\\n", + "_WORKDIR={REPO_NAME}\\\n", + "\"\"\"\n", + "\n", + "SUBSTITUTIONS" + ] + }, + { + "cell_type": "markdown", + "id": "5cd627e0-aec5-4ff0-8bb3-b072aa01fc6d", + "metadata": {}, + "source": [ + "### Test the build and define a manual trigger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8712b3-b431-4c82-b49f-2df4882c25d4", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --billing-project {PROJECT} --suppress-logs --async" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb1f6fae-4065-4d4e-bcb1-16585125f00d", + "metadata": {}, + "outputs": [], + "source": [ + "DESCR=f'\"Deploy model from branch {BRANCH}\"'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34927767-4aea-43b8-a0cb-df433dc591cb", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud alpha builds triggers create manual --repo={REPO_URL} --repo-type=CLOUD_SOURCE_REPOSITORIES --branch={BRANCH} --description={DESCR} --build-config=mlops-creditcard/build/model-deployment.yaml --substitutions={SUBSTITUTIONS} --billing-project={PROJECT} --service-account={CLOUDBUILD_SA}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "420e70e8-b29a-4fc0-982f-8c6f1121d426", + "metadata": {}, + "outputs": [], + "source": [ + "endpoints = vertex_ai.Endpoint.list(\n", + " filter=f'display_name={ENDPOINT_DISPLAY_NAME}',\n", + " order_by=\"update_time\"\n", + ")\n", + "\n", + "if len(endpoints) == 0:\n", + " print(f'No endpoints found with name {ENDPOINT_DISPLAY_NAME}')\n", + "endpoint = endpoints[-1]\n", + "\n", + "os.environ['ENDPOINT_NAME'] = endpoint.name\n", + "\n", + "entity = 'user'\n", + "os.environ['ENTITY'] = entity\n", + "os.environ['FEATURESTORE_ID'] = FEATURESTORE_ID\n", + "\n", + "PREDICT_CLOUD_FUNCTION_NAME = \"predict-\" + PIPELINE_NAME + \"-fn\"\n", + "PREDICT_CLOUD_FUNCTION_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "101175fb-4cfc-42bf-8b47-87d9a19429ff", + "metadata": {}, + "outputs": [], + "source": [ + "from src.tests.model_deployment_tests import test_instance\n", + "\n", + "import base64\n", + "\n", + "if 'V27' in test_instance:\n", + " del test_instance['V27']\n", + "if 'V28' in test_instance:\n", + " del 
test_instance['V28']\n", + "test_instance['userid'] = 'user99'\n", + "\n", + "test_instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b899ef-ed27-4d2c-b5ad-287e4c4c5917", + "metadata": {}, + "outputs": [], + "source": [ + "from flask import Flask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7dbd8f2-46a2-4e58-80f0-c4f7f29b8bb7", + "metadata": {}, + "outputs": [], + "source": [ + "from src.prediction_cf.main import predict\n", + "\n", + "app = Flask('test')\n", + "ctx = app.test_request_context(json=test_instance)\n", + "request = ctx.request\n", + "\n", + "pred_retval = predict(request)\n", + "pred_retval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9a69bc0-dbd0-4278-9095-52a639202ece", + "metadata": {}, + "outputs": [], + "source": [ + "GOOGLE_FUNCTION_SOURCE ='src/prediction_cf/main.py'\n", + "\n", + "ENV_VARS=f\"\"\"\\\n", + "PROJECT={PROJECT},\\\n", + "REGION={REGION},\\\n", + "ENDPOINT_NAME={endpoint.name},\\\n", + "ENTITY={entity},\\\n", + "FEATURESTORE_ID={FEATURESTORE_ID}\n", + "\"\"\"\n", + "\n", + "!echo {ENV_VARS}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3db24101-466a-4a90-b8d9-fbf3539ab33b", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf src/prediction_cf/.ipynb_checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d1106d-ee2f-4e49-a4f0-ab89d6c1f77a", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud functions deploy {PREDICT_CLOUD_FUNCTION_NAME} \\\n", + " --set-build-env-vars=GOOGLE_FUNCTION_SOURCE={GOOGLE_FUNCTION_SOURCE} \\\n", + " --region={CF_REGION} \\\n", + " --runtime=python37 \\\n", + " --trigger-http \\\n", + " --source=. \\\n", + " --entry-point=predict\\\n", + " --stage-bucket={BUCKET}\\\n", + " --ingress-settings=internal-only\\\n", + " --service-account={SERVICE_ACCOUNT}\\\n", + " --set-env-vars={ENV_VARS} \\\n", + " --billing-project $PROJECT" + ] + }, + { + "cell_type": "markdown", + "id": "1c717710-888b-47d8-86c4-02eb6a7e96c5", + "metadata": {}, + "source": [ + "#### Test the prediction cloud function\n", + "\n", + "You can test it using a `curl` command, but this has to be executed from the same VPC that the Cloud Function is deployed in:\n", + "\n", + "```\n", + "curl -m 70 -X POST https://PROJECT.cloudfunctions.net/predict-creditcards-classifier-v01-train-pipeline-fn \\\n", + "-H \"Authorization:bearer $(gcloud auth print-identity-token)\" \\\n", + "-H \"Content-Type:application/json\" \\\n", + "-d '{\"V1\": [-0.906611], \"V2\": [-0.906611], \"V3\": [-0.906611], \"V4\": [-0.906611], \"V5\": [-0.906611], \"V6\": [-0.906611], \"V7\": [-0.906611], \"V8\": [-0.906611], \"V9\": [-0.906611], \"V10\": [-0.906611], \"V11\": [-0.906611], \"V12\": [-0.906611], \"V13\": [-0.906611], \"V14\": [-0.906611], \"V15\": [-0.906611], \"V16\": [-0.906611], \"V17\": [-0.906611], \"V18\": [-0.906611], \"V19\": [-0.906611], \"V20\": [-0.906611], \"V21\": [-0.906611], \"V22\": [-0.906611], \"V23\": [-0.906611], \"V24\": [-0.906611], \"V25\": [-0.906611], \"V26\": [-0.906611], \"Amount\": [15.99], \"userid\": \"user99\"}'\n", + "```\n", + "\n", + "You deploy a VM in the same VPC and run the command from there.\n", + "\n", + "Perhaps an easier way to test is test it from the Testing tab on the [web Console](https://console.cloud.google.com/functions/list)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ebb1d8a-4035-41c2-95b5-8bef50a576a3", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "json.dumps(test_instance)" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "tf2-gpu.2-8.m93", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m93" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vertex_mlops_enterprise/03-prediction.ipynb b/examples/vertex_mlops_enterprise/03-prediction.ipynb new file mode 100644 index 0000000000..46543bca43 --- /dev/null +++ b/examples/vertex_mlops_enterprise/03-prediction.ipynb @@ -0,0 +1,1367 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "fd3395f2", + "metadata": {}, + "source": [ + "Copyright 2023 Google LLC\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "you may not use this file except in compliance with the License.\n", + "You may obtain a copy of the License at\n", + "\n", + " http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software\n", + "distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "See the License for the specific language governing permissions and\n", + "limitations under the License." + ] + }, + { + "cell_type": "markdown", + "id": "9d0817cd-9e64-4d5e-9c66-4e0961aa1085", + "metadata": { + "tags": [] + }, + "source": [ + "# MLOps End to End Workflow (III)\n", + "\n", + "Implementation of an end-to-end ML Ops workflow for the use case to detect fraudulent credit card transactions, see [Kaggle dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud).\n", + "\n", + "This set of notebooks cover:\n", + "\n", + "[Experimentation](01-experimentation.ipynb):\n", + "1. Set up: Creation of the Vertex Dataset, extraction of the schema\n", + "2. Implementation of a TFX pipeline\n", + "\n", + "[CICD](02-cicd.ipynb):\n", + "\n", + "3. Deployment of the Vertex AI Pipeline through a CI/CD process\n", + "4. Deployment of a Continuous Training pipeline that can be triggered via Pub/Sub and produces a model in the Model Registry\n", + "5. Deployment of the Inference Pipeline consisting of a Cloud Function that retrieves features from Feature Store and calls the model on a Vertex AI Endpoint\n", + "6. Deployment of the model to a Vertex AI Endpoint through a CI/CD process.\n", + "\n", + "[Prediction](03-prediction.ipynb):\n", + "\n", + "7. Deploy the model to an endpoint\n", + "8. 
Create a test prediction\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5c0f8ad-ca30-4f5c-a135-809409f58abd", + "metadata": { + "tags": [] + }, + "source": [ + "### Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b997d6a-f8fd-43f5-bafd-81b169965160", + "metadata": {}, + "outputs": [], + "source": [ + "#%load_ext autoreload\n", + "#%autoreload 2\n", + "\n", + "import os\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_data_validation as tfdv\n", + "from google.cloud import bigquery\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from google.cloud import aiplatform as vertex_ai\n", + "\n", + "import yaml\n", + "import os\n", + "\n", + "with open('mainconfig.yaml') as f:\n", + " main_config = yaml.safe_load(f)\n", + "\n", + "# select your config \n", + "main_config = main_config['creditcards']\n", + "\n", + "PROJECT = main_config['project'] \n", + "REGION = main_config['region'] \n", + "DOCKER_REPO = main_config['docker_repo']\n", + "\n", + "SERVICE_ACCOUNT = main_config['service_account']\n", + "\n", + "# BigQuery and data locations\n", + "\n", + "BQ_SOURCE_TABLE= main_config['bq']['source_table'] # raw input\n", + "ML_TABLE = main_config['bq']['ml_table'] # the one we will use for the training\n", + "\n", + "BQ_DATASET_NAME = main_config['bq']['dataset']\n", + "BQ_LOCATION = main_config['bq']['location'] # multiregion provides more resilience\n", + "\n", + "VERTEX_DATASET_NAME = main_config['vertex_dataset_name']\n", + "\n", + "RAW_SCHEMA_DIR = main_config['raw_schema_dir']\n", + "\n", + "BUCKET = main_config['bucket']\n", + "\n", + "# TFX and model config\n", + "\n", + "# model version\n", + "VERSION = main_config['version']\n", + "\n", + "\n", + "MODEL_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier-{VERSION}'\n", + "WORKSPACE = f'gs://{BUCKET}/{VERTEX_DATASET_NAME}'\n", + "\n", + "MLMD_SQLLITE = 'mlmd.sqllite'\n", + "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts_interactive')\n", + "MODEL_REGISTRY = os.path.join(WORKSPACE, 'model_registry')\n", + "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", + "PIPELINE_ROOT = os.path.join(ARTIFACT_STORE, PIPELINE_NAME)\n", + "\n", + "ENDPOINT_DISPLAY_NAME = f'{VERTEX_DATASET_NAME}-classifier'\n", + "\n", + "FEATURESTORE_ID = main_config['featurestore_id']\n", + "\n", + "CF_REGION = main_config['cloudfunction_region']\n", + "\n", + "DATAFLOW_SUBNETWORK = f\"https://www.googleapis.com/compute/v1/projects/{PROJECT}/regions/{REGION}/subnetworks/{main_config['dataflow']['subnet']}\"\n", + "DATAFLOW_SERVICE_ACCOUNT = main_config['dataflow']['service_account']\n", + "\n", + "CLOUDBUILD_SA = f'projects/{PROJECT}/serviceAccounts/{SERVICE_ACCOUNT}'\n", + "\n", + "LIMIT=main_config['limit']\n", + "\n", + "print(\"Project ID:\", PROJECT)\n", + "print(\"Region:\", REGION)\n", + "print(\"Service Account:\", SERVICE_ACCOUNT)\n", + "\n", + "vertex_ai.init(\n", + " project=PROJECT,\n", + " location=REGION\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb98a921-cba7-46ee-baa7-b0a4461c3b2c", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VERTEX_DATASET_NAME\"] = VERTEX_DATASET_NAME\n", + "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME\n", + "os.environ[\"PIPELINE_NAME\"] = PIPELINE_NAME\n", + "os.environ[\"PROJECT\"] = PROJECT\n", + "os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT\n", + "os.environ[\"REGION\"] = REGION\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}\"\n", + 
"os.environ[\"SERVICE_ACCOUNT\"] = DATAFLOW_SERVICE_ACCOUNT\n", + "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/e2e_tests\"\n", + "os.environ[\"SUBNETWORK\"] = DATAFLOW_SUBNETWORK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad9e768-4c25-4765-aeac-9e47f6c6cc70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python -m pytest src/tests/pipeline_deployment_tests.py::test_e2e_pipeline -s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90774f24-3a81-4b6b-89ef-8dcd08387fdb", + "metadata": {}, + "outputs": [], + "source": [ + "config.BEAM_RUNNER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a271d4e3-d45d-487f-972d-0b4e06db7758", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from src.tfx_pipelines import config\n", + "import importlib\n", + "\n", + "importlib.reload(config)\n", + "\n", + "for key, value in config.__dict__.items():\n", + " if key.isupper(): print(f'{key}: {value}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4d086a8-cd73-403f-92ed-6cd8d68321d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Repo should has been created in the Terraform automation stage\n", + "#! gcloud artifacts repositories create {VERTEX_DATASET_NAME} --location={REGION} --repository-format=docker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05123e22-d8f8-4cfd-a743-fe8c6e4925d9", + "metadata": {}, + "outputs": [], + "source": [ + "# You can also use build/Dockerfile.dataflow in case Internet access is not allowed\n", + "!cp build/Dockerfile.dataflow_internet Dockerfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc17291e-d63f-4e13-bbd7-5de2510c87f1", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"DOCKER_REPO\"] = f\"{DOCKER_REPO}/vertex:latest\"\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $DOCKER_REPO/dataflow:latest . --timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2595e5e8-47bc-4f6d-8a39-b09d92588be2", + "metadata": {}, + "outputs": [], + "source": [ + "!echo $TFX_IMAGE_URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12aff000-b7b8-4f1a-a2b7-2d4f56343a9e", + "metadata": {}, + "outputs": [], + "source": [ + "!echo $PYTHONPATH" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a12ebc0c-03ed-46eb-9610-9f4c24f60cce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!cp build/Dockerfile.vertex_internet Dockerfile\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $TFX_IMAGE_URI . 
--timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e73afd4-c486-48b2-ba98-fe5d5bd56ceb", + "metadata": {}, + "outputs": [], + "source": [ + "PIPELINES_STORE = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/compiled_pipelines/\"\n", + "!gsutil cp {pipeline_definition_file} {PIPELINES_STORE}\n", + "PIPELINES_STORE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e876c9eb-6f73-4ea9-a85c-0172c728c1d3", + "metadata": {}, + "outputs": [], + "source": [ + "from src.tfx_pipelines import config, runner\n", + "\n", + "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", + "pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8516b764-1143-4b9e-9ae7-af9340146ec3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from google.cloud.aiplatform import pipeline_jobs\n", + " \n", + "job = pipeline_jobs.PipelineJob(template_path = pipeline_definition_file,\n", + " display_name=VERTEX_DATASET_NAME,\n", + " #enable_caching=False,\n", + " parameter_values={\n", + " 'learning_rate': 0.003,\n", + " 'batch_size': 512,\n", + " 'steps_per_epoch': int(config.TRAIN_LIMIT) // 512,\n", + " 'hidden_units': '128,128',\n", + " 'num_epochs': 30,\n", + " })\n", + "\n", + "job.run(sync=False, service_account=DATAFLOW_SERVICE_ACCOUNT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e504cc07-22ee-45ba-9416-73cd22251c01", + "metadata": {}, + "outputs": [], + "source": [ + "CICD_IMAGE_URI = f\"{DOCKER_REPO}/cicd:latest\"\n", + "CICD_IMAGE_URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a38e3ffc-fd4e-4934-b2b2-995070ae2076", + "metadata": {}, + "outputs": [], + "source": [ + "# For the CICD container, we are just adding the build/* dir\n", + "!cp build/Dockerfile.cicd_internet build/Dockerfile\n", + "!gcloud builds submit --project=$PROJECT --billing-project=$PROJECT --region $REGION --tag $CICD_IMAGE_URI build/. 
--timeout=15m --machine-type=e2-highcpu-8 --suppress-logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "370f9daf-7602-48d7-92b4-0f265ba9bee9", + "metadata": {}, + "outputs": [], + "source": [ + "REPO_URL = main_config['git']['repo_url']\n", + "BRANCH = main_config['git']['branch']\n", + "\n", + "\n", + "GCS_LOCATION = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/\"\n", + "TEST_GCS_LOCATION = f\"gs://{BUCKET}/{VERTEX_DATASET_NAME}/e2e_tests\"\n", + "CI_TRAIN_LIMIT = 1000\n", + "CI_TEST_LIMIT = 100\n", + "CI_UPLOAD_MODEL = 0\n", + "CI_ACCURACY_THRESHOLD = -0.1 # again setting accuracy threshold to negative\n", + "BEAM_RUNNER = \"DataflowRunner\"\n", + "TRAINING_RUNNER = \"vertex\"\n", + "VERSION = 'latest'\n", + "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", + "PIPELINES_STORE = f\"{GCS_LOCATION}compiled_pipelines/\"\n", + "\n", + "TFX_IMAGE_URI = f\"{DOCKER_REPO}/vertex:{VERSION}\"\n", + "DATAFLOW_IMAGE_URI = f\"{DOCKER_REPO}/dataflow:latest\"\n", + "\n", + "REPO_NAME = REPO_URL.split('/')[-1]\n", + "DESCR=f'\"Deploy train pipeline to GCS from {BRANCH}\"'\n", + "\n", + "\n", + "SUBSTITUTIONS=f\"\"\"\\\n", + "_REPO_URL='{REPO_URL}',\\\n", + "_BRANCH={BRANCH},\\\n", + "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", + "_PROJECT={PROJECT},\\\n", + "_REGION={REGION},\\\n", + "_GCS_LOCATION={GCS_LOCATION},\\\n", + "_TEST_GCS_LOCATION={TEST_GCS_LOCATION},\\\n", + "_BQ_LOCATION={BQ_LOCATION},\\\n", + "_BQ_DATASET_NAME={BQ_DATASET_NAME},\\\n", + "_ML_TABLE={ML_TABLE},\\\n", + "_VERTEX_DATASET_NAME={VERTEX_DATASET_NAME},\\\n", + "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", + "_CI_TRAIN_LIMIT={CI_TRAIN_LIMIT},\\\n", + "_CI_TEST_LIMIT={CI_TEST_LIMIT},\\\n", + "_CI_UPLOAD_MODEL={CI_UPLOAD_MODEL},\\\n", + "_CI_ACCURACY_THRESHOLD={CI_ACCURACY_THRESHOLD},\\\n", + "_BEAM_RUNNER={BEAM_RUNNER},\\\n", + "_TRAINING_RUNNER={TRAINING_RUNNER},\\\n", + "_DATAFLOW_IMAGE_URI={DATAFLOW_IMAGE_URI},\\\n", + "_TFX_IMAGE_URI={TFX_IMAGE_URI},\\\n", + "_PIPELINE_NAME={PIPELINE_NAME},\\\n", + "_PIPELINES_STORE={PIPELINES_STORE},\\\n", + "_SUBNETWORK={DATAFLOW_SUBNETWORK},\\\n", + "_GCS_BUCKET={BUCKET}/cloudbuild,\\\n", + "_SERVICE_ACCOUNT={DATAFLOW_SERVICE_ACCOUNT},\\\n", + "_WORKDIR={REPO_NAME}\\\n", + "\"\"\"\n", + "!echo $SUBSTITUTIONS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7856ce61-fe1a-4db4-9c71-fed47129ef7d", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud builds submit build/known_hosts.github.zip --config build/pipeline-deployment.yaml --substitutions {SUBSTITUTIONS} --project=$PROJECT --billing-project=$PROJECT --region $REGION --suppress-logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928797f7-20cf-4769-9a35-24ad67900125", + "metadata": {}, + "outputs": [], + "source": [ + "!echo gcloud beta builds triggers create cloud-source-repositories --repo={REPO_NAME} --branch-pattern=^{BRANCH}$ --description={DESCR} --build-config=mlops-creditcard/build/pipeline-deployment.yaml --substitutions={SUBSTITUTIONS} --billing-project={PROJECT} --service-account={TRIGGER_SA}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f56b2a-e55f-4b54-b90a-6335fce69314", + "metadata": {}, + "outputs": [], + "source": [ + "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n", + "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'\n", + "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')" + ] + }, + { + "cell_type": "markdown", + "id": "ee8e210a-cca3-41e5-9083-ff3a395b9c1a", + "metadata": 
{}, + "source": [ + "#### Create Pub/Sub Topic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a03ef492-ee1c-451d-a1e2-c5c511e00326", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud pubsub topics create {PUBSUB_TOPIC}" + ] + }, + { + "cell_type": "markdown", + "id": "547456c0-c55f-492e-97e6-f529182d3a13", + "metadata": {}, + "source": [ + "#### Deploy Cloud Function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d094091-3332-4c4c-9bb6-f6e5839d7ded", + "metadata": {}, + "outputs": [], + "source": [ + "ENV_VARS=f\"\"\"\\\n", + "PROJECT={PROJECT},\\\n", + "REGION={REGION},\\\n", + "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION},\\\n", + "SERVICE_ACCOUNT={SERVICE_ACCOUNT},\\\n", + "PIPELINE_NAME={PIPELINE_NAME}\n", + "\"\"\"\n", + "\n", + "!echo {ENV_VARS}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3da074b8-106c-42bf-b956-e59b249bf5f2", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf src/pipeline_triggering/.ipynb_checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a695eef-bf18-4fce-b44c-2ac00ae46675", + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud functions deploy {CLOUD_FUNCTION_NAME} --gen2 \\\n", + " --region={CF_REGION} \\\n", + " --trigger-topic={PUBSUB_TOPIC} \\\n", + " --runtime=python38 \\\n", + " --source=src/pipeline_triggering\\\n", + " --entry-point=trigger_pipeline\\\n", + " --stage-bucket={BUCKET}\\\n", + " --ingress-settings=internal-only\\\n", + " --service-account={SERVICE_ACCOUNT}\\\n", + " --update-env-vars={ENV_VARS} " + ] + }, + { + "cell_type": "markdown", + "id": "50ad5728-1b82-4979-bf16-fc518f8a9ba9", + "metadata": {}, + "source": [ + "#### Test triggering the pipeline with a Pub/Sub message" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead71b7d-fc63-4a80-8f52-e40f3612d67a", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import pubsub\n", + "import json\n", + "\n", + "publish_client = pubsub.PublisherClient()\n", + "topic = f'projects/{PROJECT}/topics/{PUBSUB_TOPIC}'\n", + "data = {\n", + " 'num_epochs': 7,\n", + " 'learning_rate': 0.0015,\n", + " 'batch_size': 512,\n", + " 'steps_per_epoch': int(config.TRAIN_LIMIT) // 512,\n", + " 'hidden_units': '256,126'\n", + "}\n", + "message = json.dumps(data)\n", + "\n", + "_ = publish_client.publish(topic, message.encode())" + ] + }, + { + "cell_type": "markdown", + "id": "65a666dc-d06c-4442-875e-86d382e3269f", + "metadata": {}, + "source": [ + "Check the console to see that it's running.\n", + "\n", + "We now have:\n", + "\n", + "* A pipeline that can be run to test and deploy new training pipelines\n", + "* A triggering mechanism to programmatically trigger new training runs\n", + "* A training run that finishes with a new model in the Vertex AI Model Registry\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d67b248-7ccb-4159-9ef2-2238455737f7", + "metadata": {}, + "source": [ + "## Deploy the model\n", + "\n", + "Preparation:\n", + "\n", + "* Create a Vertex AI Endpoint\n", + "* Create a Vertex AI Feature Store and upload some feature data\n", + "* Deploy a Cloud Function that receives prediction requests, pulls features from the Feature Store, calls the Endpoint and returns a prediction\n", + "\n", + "After this, we will deploy a model deployment pipeline that will:\n", + "\n", + "* Test the model locally\n", + "* Create the Endpoint if necessary\n", + "* Deploy the model to the Endpoint\n", + "* Test the model on the 
Endpoint\n" + ] + }, + { + "cell_type": "markdown", + "id": "99ff68dd-b3c4-4633-88fb-5997636d15ec", + "metadata": {}, + "source": [ + "### Preparation\n", + "\n", + "#### Vertex AI Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1793acf-21df-495f-b37b-99a54ec38cdd", + "metadata": {}, + "outputs": [], + "source": [ + "ENDPOINT_DISPLAY_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1088589e-d24a-47a8-9fe4-7a4135a8a5b4", + "metadata": {}, + "outputs": [], + "source": [ + "from build.utils import create_endpoint\n", + "\n", + "endpoint = create_endpoint(PROJECT, REGION, ENDPOINT_DISPLAY_NAME)\n", + "endpoint" + ] + }, + { + "cell_type": "markdown", + "id": "603eb006-ae75-48ca-96d5-5cc22a786bc6", + "metadata": {}, + "source": [ + "### Model Deployment Pipeline\n", + "\n", + "#### Run the model artifact testing\n", + "\n", + "Artifact testing requires that the model is deployed to the Vertex AI Model Registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a394b2a9-aecd-4ec4-b0b7-f7ab8d2a5c67", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"PROJECT\"] = PROJECT\n", + "os.environ[\"REGION\"] = REGION\n", + "os.environ[\"SERVICE_ACCOUNT\"] = SERVICE_ACCOUNT\n", + "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME\n", + "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "073e0efb-5ae3-4bda-b1cd-d6cc08f9b64c", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pytest src/tests/model_deployment_tests.py::test_model_artifact -s" + ] + }, + { + "cell_type": "markdown", + "id": "0c5a1d30-8dad-4f0a-854b-4de67c548d54", + "metadata": {}, + "source": [ + "#### Deploy Model to Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "697f0915-7257-41e0-bf33-2511671a439e", + "metadata": {}, + "outputs": [], + "source": [ + "!python build/utils.py \\\n", + " --mode=deploy-model\\\n", + " --project={PROJECT}\\\n", + " --region={REGION}\\\n", + " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n", + " --model-display-name={MODEL_DISPLAY_NAME}" + ] + }, + { + "cell_type": "markdown", + "id": "a402a585-5d45-4d9d-9497-cef77effb8a3", + "metadata": {}, + "source": [ + "#### Test model on Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062b7c0f-9a18-44cc-800e-a203903df6dd", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pytest src/tests/model_deployment_tests.py::test_model_endpoint" + ] + }, + { + "cell_type": "markdown", + "id": "3a23c981-a4d2-4396-aa1f-3785244a81ca", + "metadata": {}, + "source": [ + "#### Run the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01a0a985-2955-4c33-9cbc-09fa630a1ca2", + "metadata": {}, + "outputs": [], + "source": [ + "REPO_URL = main_config['git']['repo_url']\n", + "BRANCH = main_config['git']['branch']\n", + "REPO_NAME = REPO_URL.split('/')[-1]\n", + "CICD_IMAGE_URI = f\"{DOCKER_REPO}/cicd:latest\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6453e104-774d-475d-8159-328ea6ae0249", + "metadata": {}, + "outputs": [], + "source": [ + "SUBSTITUTIONS=f\"\"\"\\\n", + "_REPO_URL='{REPO_URL}',\\\n", + "_BRANCH={BRANCH},\\\n", + "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", + "_PROJECT={PROJECT},\\\n", + "_REGION={REGION},\\\n", + "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", + "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n", + 
"_GCS_BUCKET={BUCKET}/cloudbuild,\\\n", + "_SERVICE_ACCOUNT={SERVICE_ACCOUNT},\\\n", + "_WORKDIR={REPO_NAME}\\\n", + "\"\"\"\n", + "\n", + "SUBSTITUTIONS" + ] + }, + { + "cell_type": "markdown", + "id": "5cd627e0-aec5-4ff0-8bb3-b072aa01fc6d", + "metadata": {}, + "source": [ + "### Test the build and define a manual trigger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8712b3-b431-4c82-b49f-2df4882c25d4", + "metadata": {}, + "outputs": [], + "source": [ + "!echo gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --billing-project {PROJECT} --suppress-logs --async" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb1f6fae-4065-4d4e-bcb1-16585125f00d", + "metadata": {}, + "outputs": [], + "source": [ + "DESCR=f'\"Deploy model from branch {BRANCH}\"'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34927767-4aea-43b8-a0cb-df433dc591cb", + "metadata": {}, + "outputs": [], + "source": [ + "!echo gcloud alpha builds triggers create manual --repo={REPO_URL} --repo-type=CLOUD_SOURCE_REPOSITORIES --branch={BRANCH} --description={DESCR} --build-config=mlops-creditcard/build/model-deployment.yaml --substitutions={SUBSTITUTIONS} --billing-project={PROJECT} --service-account={CLOUDBUILD_SA}" + ] + }, + { + "cell_type": "markdown", + "id": "1d211c2e-8b5f-4511-845f-a96cefc7b274", + "metadata": {}, + "source": [ + "## Deploy Prediction Cloud Function\n", + "\n", + "The Cloud Function that performs the final prediction has to:\n", + " \n", + "* Receive the features from the prediction request: `V1`, `V2`, ..., `V26`, `Amount`, `userid`\n", + "* Use the `userid` to retrieve the features `V27` and `V28` from the Feature Store\n", + "* Query the model on the Vertex AI Endpint\n", + "* Return the prediction\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "420e70e8-b29a-4fc0-982f-8c6f1121d426", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Making request: POST https://oauth2.googleapis.com/token\n", + "Starting new HTTPS connection (1): oauth2.googleapis.com:443\n", + "https://oauth2.googleapis.com:443 \"POST /token HTTP/1.1\" 200 None\n" + ] + }, + { + "data": { + "text/plain": [ + "'predict-creditcards-classifier-v02-train-pipeline-fn'" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "endpoints = vertex_ai.Endpoint.list(\n", + " filter=f'display_name={ENDPOINT_DISPLAY_NAME}',\n", + " order_by=\"update_time\"\n", + ")\n", + "\n", + "if len(endpoints) == 0:\n", + " print(f'No endpoints found with name {ENDPOINT_DISPLAY_NAME}')\n", + "endpoint = endpoints[-1]\n", + "\n", + "os.environ['ENDPOINT_NAME'] = endpoint.name\n", + "\n", + "entity = 'user'\n", + "os.environ['ENTITY'] = entity\n", + "os.environ['FEATURESTORE_ID'] = FEATURESTORE_ID\n", + "\n", + "PREDICT_CLOUD_FUNCTION_NAME = \"predict-\" + PIPELINE_NAME + \"-fn\"\n", + "PREDICT_CLOUD_FUNCTION_NAME" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eefa0135-99f3-45f9-b79d-0f04e34c4ee7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'V1': [-0.906611],\n", + " 'V2': [-0.906611],\n", + " 'V3': [-0.906611],\n", + " 'V4': [-0.906611],\n", + " 'V5': [-0.906611],\n", + " 'V6': [-0.906611],\n", + " 'V7': [-0.906611],\n", + " 'V8': [-0.906611],\n", + " 'V9': [-0.906611],\n", + " 'V10': [-0.906611],\n", + " 'V11': [-0.906611],\n", + " 'V12': [-0.906611],\n", + " 
'V13': [-0.906611],\n", + " 'V14': [-0.906611],\n", + " 'V15': [-0.906611],\n", + " 'V16': [-0.906611],\n", + " 'V17': [-0.906611],\n", + " 'V18': [-0.906611],\n", + " 'V19': [-0.906611],\n", + " 'V20': [-0.906611],\n", + " 'V21': [-0.906611],\n", + " 'V22': [-0.906611],\n", + " 'V23': [-0.906611],\n", + " 'V24': [-0.906611],\n", + " 'V25': [-0.906611],\n", + " 'V26': [-0.906611],\n", + " 'V27': [-0.906611],\n", + " 'V28': [-0.906611],\n", + " 'Amount': [15.99]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from src.tests.model_deployment_tests import test_instance\n", + "test_instance" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "72229678-bea2-44a2-b79d-74124882b847", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'scores': [0.5002141, 0.4997859], 'classes': ['legit', 'fraudulent']}\n" + ] + } + ], + "source": [ + "test_instances = [test_instance]\n", + "predictions = endpoint.predict(test_instances).predictions\n", + "\n", + "for prediction in predictions:\n", + " print(prediction)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc8de820-79c2-4d4e-bd74-4c4320e91149", + "metadata": {}, + "outputs": [], + "source": [ + "#In case model is deployed with explanations\n", + "explanations = endpoint.explain(test_instances).explanations\n", + "\n", + "for explanation in explanations:\n", + " print(explanation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e613276f-a40a-4c4e-b250-c1c488c3331b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"instances\": [{\"V1\": [-0.906611], \"V2\": [-0.906611], \"V3\": [-0.906611], \"V4\": [-0.906611], \"V5\": [-0.906611], \"V6\": [-0.906611], \"V7\": [-0.906611], \"V8\": [-0.906611], \"V9\": [-0.906611], \"V10\": [-0.906611], \"V11\": [-0.906611], \"V12\": [-0.906611], \"V13\": [-0.906611], \"V14\": [-0.906611], \"V15\": [-0.906611], \"V16\": [-0.906611], \"V17\": [-0.906611], \"V18\": [-0.906611], \"V19\": [-0.906611], \"V20\": [-0.906611], \"V21\": [-0.906611], \"V22\": [-0.906611], \"V23\": [-0.906611], \"V24\": [-0.906611], \"V25\": [-0.906611], \"V26\": [-0.906611], \"V27\": [-0.906611], \"V28\": [-0.906611], \"Amount\": [15.99]}]}'" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "\n", + "request = {'instances': test_instances}\n", + "\n", + "REQ_JSON=json.dumps(request)\n", + "REQ_JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "24b77ee8-31ca-4c19-8ab7-7a6a1f02e69a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"predictions\": [\n", + " {\n", + " \"scores\": [\n", + " 0.5002141,\n", + " 0.4997859\n", + " ],\n", + " \"classes\": [\n", + " \"legit\",\n", + " \"fraudulent\"\n", + " ]\n", + " }\n", + " ],\n", + " \"deployedModelId\": \"8097568887035396096\",\n", + " \"model\": \"projects/80023113991/locations/europe-west4/models/7404419164699361280\",\n", + " \"modelDisplayName\": \"creditcards-classifier-v02\",\n", + " \"modelVersionId\": \"1\"\n", + "}\n" + ] + } + ], + "source": [ + "os.environ['PROJECT_ID'] = PROJECT\n", + "os.environ['REGION'] = REGION\n", + "os.environ['ENDPOINT_ID'] = endpoint.name\n", + "os.environ['INPUT_DATA_FILE'] = \"INPUT-JSON\"\n", + "os.environ['REQ_JSON'] = REQ_JSON\n", + "!echo ${REQ_JSON} > ${INPUT_DATA_FILE}\n", + "!curl -X POST 
-H \"Authorization: Bearer $(gcloud auth print-access-token)\" -H \"Content-Type: application/json\" https://${REGION}-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/europe-west4/endpoints/${ENDPOINT_ID}:predict -d \"@${INPUT_DATA_FILE}\"" + ] + }, + { + "cell_type": "markdown", + "id": "f5070311-5ad6-4afd-9baa-8420aaacd149", + "metadata": { + "tags": [] + }, + "source": [ + "## Feature Store (Optional)" + ] + }, + { + "cell_type": "markdown", + "id": "02f89f3a-9456-4f5c-af84-256fd51e3cb6", + "metadata": {}, + "source": [ + "### Create Feature Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ff735bf-43a6-4fe1-b3c1-055b21860aa9", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud.aiplatform_v1beta1 import FeaturestoreOnlineServingServiceClient, FeaturestoreServiceClient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e76b8fe-3b31-4722-b95a-3f9330e91530", + "metadata": {}, + "outputs": [], + "source": [ + "from feature_store import feature_store as fs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad2103aa-5845-4f51-a982-942b491b3588", + "metadata": {}, + "outputs": [], + "source": [ + "fs.create_fs(PROJECT, REGION, FEATURESTORE_ID, \"Feature Store for credit card use case\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c531af74-593a-4439-9c5a-5a45d320d951", + "metadata": {}, + "outputs": [], + "source": [ + "from google.api_core import operations_v1\n", + "from google.cloud.aiplatform_v1beta1 import FeaturestoreOnlineServingServiceClient, FeaturestoreServiceClient, FeatureSelector\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_online_service as featurestore_online_service_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import entity_type as entity_type_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import feature as feature_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore as featurestore_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_service as featurestore_service_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import io as io_pb2\n", + "from google.cloud.aiplatform_v1beta1.types import ListFeaturestoresRequest, CreateFeaturestoreRequest, Featurestore, ListEntityTypesRequest\n", + "\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "from google.cloud.aiplatform_v1beta1.types import featurestore_monitoring as featurestore_monitoring_pb2\n", + "from google.protobuf.duration_pb2 import Duration\n", + "\n", + "\n", + "API_ENDPOINT = f\"{REGION}-aiplatform.googleapis.com\" \n", + "admin_client = FeaturestoreServiceClient(client_options={\"api_endpoint\": API_ENDPOINT})\n", + "parent = f'{admin_client.common_location_path(PROJECT, REGION)}/featurestores/{FEATURESTORE_ID}'\n", + "request = ListEntityTypesRequest(parent=parent)\n", + "\n", + "# Make the request\n", + "page_result = admin_client.list_entity_types(request=request)\n", + "\n", + "# Handle the response\n", + "[x.name.split('/')[-1] for x in page_result]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813e1109-23c9-4df1-9dcd-49579abd4637", + "metadata": {}, + "outputs": [], + "source": [ + "admin_client.featurestore_path(PROJECT, REGION, FEATURESTORE_ID)" + ] + }, + { + "cell_type": "markdown", + "id": "857b2803-8136-4aae-9d32-370daf566826", + "metadata": {}, + "source": [ + "#### Create an entity with features, generate some data and upload it" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "4c5e3a9f-1c49-452b-b41d-54b9c766fac4", + "metadata": {}, + "outputs": [], + "source": [ + "entity = 'user'\n", + "entity_descr = 'User ID'\n", + "features = ['v27', 'v28']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "138a0401-bf30-4433-b6ce-3ce7908bfb63", + "metadata": {}, + "outputs": [], + "source": [ + "fs.create_entity(PROJECT, REGION, FEATURESTORE_ID, entity, entity_descr, features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b258871-b60a-4841-9d87-974e6b7591d2", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "filename = f'features_{entity}.csv'\n", + "\n", + "with open(filename, 'w') as f:\n", + " line = f'{entity},{\",\".join(features)}\\n'\n", + " f.write(line)\n", + " for i in range(100):\n", + " f.write(f'user{i},{random.random()},{random.random()}\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c643c891-971f-4fcc-aa32-608881e3ff9a", + "metadata": {}, + "outputs": [], + "source": [ + "!tail -20 {filename}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3245990d-362c-48bd-a006-a39ba04225f2", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3660a3c-a615-4c09-b9a9-329bc56f7be1", + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil cp {filename} gs://{BUCKET}/{filename} " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "534dbb9a-f3d8-41f4-a6a7-d78dcc615ecc", + "metadata": {}, + "outputs": [], + "source": [ + "gcs_uris = [f'gs://{BUCKET}/{filename}']\n", + "\n", + "fs.ingest_entities_csv(PROJECT, REGION, FEATURESTORE_ID, entity, features, gcs_uris)" + ] + }, + { + "cell_type": "markdown", + "id": "920fd50e-7a4c-475d-9834-eafad695bacb", + "metadata": {}, + "source": [ + "Test reading some features back" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "481ff3e3-cb67-476c-9e4b-a85fcfe1cf42", + "metadata": {}, + "outputs": [], + "source": [ + "features_data = {}\n", + "for i in range(90,102):\n", + " entity_id = f'user{i}'\n", + " features_data[entity_id] = fs.read_features(PROJECT, REGION, FEATURESTORE_ID, entity, features, entity_id)\n", + "\n", + "features_data" + ] + }, + { + "cell_type": "markdown", + "id": "5a116aeb-0366-4876-91dc-5ccd4a81a402", + "metadata": { + "tags": [] + }, + "source": [ + "### Deploy Prediction Cloud Function to use with Feature Store\n", + "\n", + "The Cloud Function that performs the final prediction has to:\n", + " \n", + "* Receive the features from the prediction request: `V1`, `V2`, ..., `V26`, `Amount`, `userid`\n", + "* Use the `userid` to retrieve the features `V27` and `V28` from the Feature Store\n", + "* Query the model on the Vertex AI Endpint\n", + "* Return the prediction\n", + "\n", + "#### Test the enpoint with Feature store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "101175fb-4cfc-42bf-8b47-87d9a19429ff", + "metadata": {}, + "outputs": [], + "source": [ + "from src.tests.model_deployment_tests import test_instance\n", + "\n", + "import base64\n", + "\n", + "if 'V27' in test_instance:\n", + " del test_instance['V27']\n", + "if 'V28' in test_instance:\n", + " del test_instance['V28']\n", + "test_instance['userid'] = 'user99'\n", + "\n", + "test_instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7dbd8f2-46a2-4e58-80f0-c4f7f29b8bb7", + "metadata": {}, + 
"outputs": [], + "source": [ + "from flask import Flask\n", + "from src.prediction_cf.main import predict\n", + "\n", + "app = Flask('test')\n", + "ctx = app.test_request_context(json=test_instance)\n", + "request = ctx.request\n", + "\n", + "pred_retval = predict(request)\n", + "pred_retval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9a69bc0-dbd0-4278-9095-52a639202ece", + "metadata": {}, + "outputs": [], + "source": [ + "GOOGLE_FUNCTION_SOURCE ='src/prediction_cf/main.py'\n", + "\n", + "ENV_VARS=f\"\"\"\\\n", + "PROJECT={PROJECT},\\\n", + "REGION={REGION},\\\n", + "ENDPOINT_NAME={endpoint.name},\\\n", + "ENTITY={entity},\\\n", + "FEATURESTORE_ID={FEATURESTORE_ID}\n", + "\"\"\"\n", + "\n", + "!echo {ENV_VARS}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3db24101-466a-4a90-b8d9-fbf3539ab33b", + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf src/prediction_cf/.ipynb_checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d1106d-ee2f-4e49-a4f0-ab89d6c1f77a", + "metadata": {}, + "outputs": [], + "source": [ + "!echo gcloud functions deploy {PREDICT_CLOUD_FUNCTION_NAME} --gen2 \\\n", + " --set-build-env-vars=GOOGLE_FUNCTION_SOURCE={GOOGLE_FUNCTION_SOURCE} \\\n", + " --region={CF_REGION} \\\n", + " --runtime=python38 \\\n", + " --trigger-http \\\n", + " --source=. \\\n", + " --entry-point=predict\\\n", + " --stage-bucket={BUCKET}\\\n", + " --ingress-settings=internal-only\\\n", + " --service-account={SERVICE_ACCOUNT}\\\n", + " --set-env-vars={ENV_VARS} " + ] + }, + { + "cell_type": "markdown", + "id": "1c717710-888b-47d8-86c4-02eb6a7e96c5", + "metadata": { + "tags": [] + }, + "source": [ + "#### Test the prediction cloud function\n", + "\n", + "You can test it using a `curl` command, but this has to be executed from the same VPC that the Cloud Function is deployed in:\n", + "\n", + "```\n", + "curl -m 70 -X POST https://PROJECT.cloudfunctions.net/predict-creditcards-classifier-v01-train-pipeline-fn \\\n", + "-H \"Authorization:bearer $(gcloud auth print-identity-token)\" \\\n", + "-H \"Content-Type:application/json\" \\\n", + "-d '{\"V1\": [-0.906611], \"V2\": [-0.906611], \"V3\": [-0.906611], \"V4\": [-0.906611], \"V5\": [-0.906611], \"V6\": [-0.906611], \"V7\": [-0.906611], \"V8\": [-0.906611], \"V9\": [-0.906611], \"V10\": [-0.906611], \"V11\": [-0.906611], \"V12\": [-0.906611], \"V13\": [-0.906611], \"V14\": [-0.906611], \"V15\": [-0.906611], \"V16\": [-0.906611], \"V17\": [-0.906611], \"V18\": [-0.906611], \"V19\": [-0.906611], \"V20\": [-0.906611], \"V21\": [-0.906611], \"V22\": [-0.906611], \"V23\": [-0.906611], \"V24\": [-0.906611], \"V25\": [-0.906611], \"V26\": [-0.906611], \"Amount\": [15.99], \"userid\": \"user99\"}'\n", + "```\n", + "\n", + "You deploy a VM in the same VPC and run the command from there.\n", + "\n", + "Perhaps an easier way to test is test it from the Testing tab on the [web Console](https://console.cloud.google.com/functions/list)." + ] + }, + { + "cell_type": "markdown", + "id": "4264177a-3f8f-4a44-87b5-41f20245d7a5", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have defined a Vertex AI Pipeline to train a model using TFX. 
The model uses credit card transaction data where some of the transactions are labeled fraudulent, and it predicts whether transactions are fraudulent.\n", + "\n", + "We have defined and deployed a continuous training pipeline, allowing the model to be retrained by sending a message to a Pub/Sub topic.\n", + "\n", + "We have deployed a Feature Store and loaded it with data. We have deployed a Vertex AI Endpoint to host the model.\n", + "\n", + "We have defined and deployed a model deployment pipeline that tests the trained model, deploys it to the Endpoint and tests the Endpoint.\n", + "\n", + "Finally, we have deployed a Cloud Function that can serve as the final prediction API, which retrieves some features from the Feature Store, uses these to feed the model and returns the prediction." + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "tf2-gpu.2-8.m93", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-8:m93" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vertex_mlops_enterprise/README.md b/examples/vertex_mlops_enterprise/README.md new file mode 100644 index 0000000000..f9e31ad647 --- /dev/null +++ b/examples/vertex_mlops_enterprise/README.md @@ -0,0 +1,49 @@ +# ML Ops with Vertex AI for enterprises + +Enterprises frequently have specific requirements, especially around security and scale, that are +often not addressed by other examples. In this example we demonstrate a machine learning use case +implementation that respects typical security requirements and includes the automation that allows +larger organizations to achieve scale in terms of the number of models. + +## Contents of this example + +We provide three notebooks to cover the three processes that we typically observe: + +1. [01-experimentation.ipynb](01-experimentation.ipynb) covers the development process, where the features, the model and the training process are defined. +1. [02-cicd.ipynb](02-cicd.ipynb) covers the CI/CD process that tests the code produced in the experimentation phase, and trains a production-ready model. +1. [03-prediction.ipynb](03-prediction.ipynb) covers the deployment process to make the model available, for example on a Vertex AI Endpoint or through Vertex AI Batch Prediction. + +Each notebook provides detailed instructions on the prerequisites for its execution and should be self-explanatory. + +Once you have reviewed the notebooks, you can continue with these advanced steps to set up the automated environments and the CI/CD process using Github. + +1. [Environments](doc/01-ENVIRONMENTS.md) covers how to automate the environment deployments using Terraform. +1. [GIT Setup](doc/02-GIT_SETUP.md) covers how to configure a Github repo to be used for the CI/CD process. +1. [MLOps](doc/03-MLOPS.md) covers testing the automated end-to-end MLOps process. + + +## Contributing + +Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. + +If you have a suggestion that would make this better, please fork the repo and create a pull request. 
You can also simply open an issue with the tag "enhancement". +Don't forget to give the project a star! Thanks again! + +1. Fork the Project +2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) +3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) +4. Push to the Branch (`git push origin feature/AmazingFeature`) +5. Open a Pull Request + +


+ + +## License + +All solutions within this repository are provided under the +[Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) license. Please see +the [LICENSE](/LICENSE) file for more detailed terms and conditions. + +## Disclaimer + +This repository and its contents are not an official Google Product. diff --git a/examples/vertex_mlops_enterprise/build/Dockerfile.cicd-tfx b/examples/vertex_mlops_enterprise/build/Dockerfile.cicd-tfx new file mode 100644 index 0000000000..41106ca9ca --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/Dockerfile.cicd-tfx @@ -0,0 +1,5 @@ +FROM gcr.io/tfx-oss-public/tfx:1.8.0 + +RUN pip install -U pip +RUN pip install google-cloud-aiplatform==1.14.0 google-cloud-aiplatform[tensorboard] +RUN pip install pytest kfp==1.8.12 google-cloud-bigquery==2.34.3 google-cloud-bigquery-storage==2.13.2 google-cloud-aiplatform==1.14.0 diff --git a/examples/vertex_mlops_enterprise/build/Dockerfile.dataflow b/examples/vertex_mlops_enterprise/build/Dockerfile.dataflow new file mode 100644 index 0000000000..d61c68af33 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/Dockerfile.dataflow @@ -0,0 +1,7 @@ +FROM apache/beam_python3.7_sdk:2.39.0 + +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +COPY src/raw_schema/schema.pbtxt raw_schema/ +COPY src/ src/ diff --git a/examples/vertex_mlops_enterprise/build/Dockerfile.vertex b/examples/vertex_mlops_enterprise/build/Dockerfile.vertex new file mode 100644 index 0000000000..f12804c866 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/Dockerfile.vertex @@ -0,0 +1,14 @@ +FROM gcr.io/tfx-oss-public/tfx:1.8.0 + +COPY requirements.txt requirements.txt + +RUN pip install -r requirements.txt + +# RuntimeError: module compiled against api version 0xe but this version of numpy is 0xd +# Fixed by below command - see https://stackoverflow.com/questions/33859531/runtimeerror-module-compiled-against-api-version-a-but-this-version-of-numpy-is + +RUN pip install -U numpy --ignore-installed + +COPY src/ src/ + +ENV PYTHONPATH="/pipeline:${PYTHONPATH}" diff --git a/examples/vertex_mlops_enterprise/build/model-deployment.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/model-deployment.yaml.TEMPLATE new file mode 100644 index 0000000000..dfd2250d62 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/model-deployment.yaml.TEMPLATE @@ -0,0 +1,121 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +###################################################################### +# CI/CD steps for Cloud Build to test and deploy a model to Vertex AI. +###################################################################### + +steps: + +# Clone the repository. 
+- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + + +# Test uploaded model artifact. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/model_deployment_tests.py::test_model_artifact'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + id: 'Test Model Artifact' + waitFor: ['Clone Repository'] + +# Create an endpoint. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'create-endpoint', + '--project', '$_PROJECT', + '--region', '$_REGION', + '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME'] + dir: '$_WORKDIR' + id: 'Create Endpoint' + waitFor: ['Test Model Artifact'] + +# Deploy the model. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'deploy-model', + '--project', '$_PROJECT', + '--region', '$_REGION', + '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME', + '--model-display-name', '$_MODEL_DISPLAY_NAME' + ] + dir: '$_WORKDIR' + id: 'Deploy Model' + waitFor: ['Create Endpoint'] + +# Test deployed model endpoint. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/model_deployment_tests.py::test_model_endpoint'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME' + id: 'Test Model Endpoint' + waitFor: ['Deploy Model'] + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +timeout: 1800s +logsBucket: '$_GCS_BUCKET' + +substitutions: + _REPO_URL: git@github.com:${github_org}/${github_repo} + _BRANCH: ${github_branch} + _REGION: ${region} + _PROJECT: ${project_id} + _GCS_BUCKET: ${project_id}_cloudbuild/logs + _CICD_IMAGE_URI: '${docker_repo}/cicd:latest' + _MODEL_DISPLAY_NAME: creditcards-classifier-v02 + _ENDPOINT_DISPLAY_NAME: creditcards-classifier + _SERVICE_ACCOUNT: ${sa_mlops} + _WORKDIR: ${github_repo} +options: + machineType: 'E2_HIGHCPU_8' + +availableSecrets: + secretManager: + - versionName: projects/${project_id}/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pipeline-deployment-kfp.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/pipeline-deployment-kfp.yaml.TEMPLATE new file mode 100644 index 0000000000..a27a5102c1 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pipeline-deployment-kfp.yaml.TEMPLATE @@ -0,0 +1,113 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################# +# CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. +############################################################################# + +# Access the id_github file from Secret Manager, and setup SSH +steps: +- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + + +# Run datasource_utils unit tests. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'echo' + args: ['Running unit tests - dummy build'] + id: 'Unit Tests' + waitFor: ['Clone Repository'] + + +# Compile the pipeline. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['pipeline.py', '--compile-only'] + dir: '$_WORKDIR/src/kfp_pipelines/src/' + env: + - 'PROJECT_ID=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'SERVICE_ACCOUNT=$_SERVICE_ACCOUNT' + - 'NETWORK=$_NETWORK' + - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' + - 'ML_TABLE=$_ML_TABLE' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'PIPELINE_NAME=$_PIPELINE_NAME' + - 'PIPELINES_STORE=$_PIPELINES_STORE' + - 'CICD_IMAGE_URI=$_CICD_IMAGE_URI' + - 'CICD_IMAGE_MODEL_CARD=$_CICD_IMAGE_MODEL_CARD' + - 'DATAFLOW_SA=$_SERVICE_ACCOUNT' + - 'DATAFLOW_NETWORK=$_DATAFLOW_NETWORK' + id: 'Compile Pipeline' + waitFor: ['Unit Tests'] + +# Upload compiled pipeline to GCS. +- name: 'gcr.io/cloud-builders/gsutil' + args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] + dir: '$_WORKDIR/src/kfp_pipelines/src/' + id: 'Upload Pipeline to GCS' + waitFor: ['Compile Pipeline'] + + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +logsBucket: '$_GCS_BUCKET' +timeout: 7200s +substitutions: + _REPO_URL: git@github.com:${github_org}/${github_repo} + _CICD_IMAGE_URI: '${docker_repo}/cicd-kfp:latest' + _CICD_IMAGE_MODEL_CARD: '${docker_repo}/model-card:latest' + _BRANCH: ${github_branch} + _REGION: ${region} + _PROJECT: ${project_id} + _GCS_BUCKET: ${project_id}_cloudbuild/logs + _BQ_DATASET_NAME: creditcards + _ML_TABLE: creditcards_ml + _PIPELINE_NAME: creditcards-classifier-kfp-train + _PIPELINES_STORE: gs://${bucket_name}/creditcards/compiled_pipelines/ + _MODEL_DISPLAY_NAME: creditcards-kfp + _NETWORK: ${subnetwork} + _DATAFLOW_NETWORK: ${dataflow_network} + _SERVICE_ACCOUNT: ${sa_mlops} + _WORKDIR: ${github_repo} +options: + machineType: 'E2_HIGHCPU_8' + +availableSecrets: + secretManager: + - versionName: projects/${project_id}/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pipeline-deployment-tfx.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/pipeline-deployment-tfx.yaml.TEMPLATE new file mode 100644 index 0000000000..01d7f444d0 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pipeline-deployment-tfx.yaml.TEMPLATE @@ -0,0 +1,161 @@ +# Copyright 2023 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################# +# CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. +############################################################################# + +# Access the id_github file from Secret Manager, and setup SSH +steps: +- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + + +# Run datasource_utils unit tests. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/datasource_utils_tests.py', '-s'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'BQ_LOCATION=$_BQ_LOCATION' + - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' + - 'ML_TABLE=$_ML_TABLE' + id: 'Unit Test Datasource Utils' + waitFor: ['Clone Repository'] + + +# Run model unit tests. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/model_tests.py', '-s'] + dir: '$_WORKDIR' + id: 'Unit Test Model' + waitFor: ['Clone Repository'] + timeout: 1800s + + +# Test e2e pipeline using local runner. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'VERTEX_DATASET_NAME=$_VERTEX_DATASET_NAME' + - 'GCS_LOCATION=$_TEST_GCS_LOCATION' + - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT' + - 'TEST_LIMIT=$_CI_TEST_LIMIT' + - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL' + - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' + id: 'Local Test E2E Pipeline' + waitFor: ['Clone Repository'] + timeout: 1800s + +# Compile the pipeline. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'compile-pipeline', + '--pipeline-name', '$_PIPELINE_NAME' + ] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'VERTEX_DATASET_NAME=$_VERTEX_DATASET_NAME' + - 'GCS_LOCATION=$_GCS_LOCATION' + - 'DATAFLOW_IMAGE_URI=$_DATAFLOW_IMAGE_URI' + - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI' + - 'BEAM_RUNNER=$_BEAM_RUNNER' + - 'TRAINING_RUNNER=$_TRAINING_RUNNER' + - 'SERVICE_ACCOUNT=$_SERVICE_ACCOUNT' + - 'SUBNETWORK=$_SUBNETWORK' + - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' + + id: 'Compile Pipeline' + waitFor: ['Local Test E2E Pipeline', 'Unit Test Datasource Utils', 'Unit Test Model'] + + +# Upload compiled pipeline to GCS. 
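+# The compile step above writes $_PIPELINE_NAME.json into the working directory.
+# The step below copies it to $_PIPELINES_STORE, the GCS prefix that
+# build/utils.py (run-pipeline mode) later reads the compiled pipeline from.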
+- name: 'gcr.io/cloud-builders/gsutil' + args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] + dir: '$_WORKDIR' + id: 'Upload Pipeline to GCS' + waitFor: ['Compile Pipeline'] + + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +logsBucket: '$_GCS_BUCKET' +timeout: 7200s +substitutions: + _REPO_URL: git@github.com:${github_org}/${github_repo} + _BRANCH: ${github_branch} + _REGION: ${region} + _PROJECT: ${project_id} + _GCS_BUCKET: ${project_id}_cloudbuild/logs + _CICD_IMAGE_URI: '${docker_repo}/cicd-tfx:latest' + _DATAFLOW_IMAGE_URI: '${docker_repo}/dataflow:latest' + _TFX_IMAGE_URI: '${docker_repo}/vertex:latest' + _GCS_LOCATION: 'gs://${bucket_name}/creditcards/' + _TEST_GCS_LOCATION: 'gs://${bucket_name}/creditcards/e2e_tests' + _BQ_LOCATION: ${region} + _BQ_DATASET_NAME: creditcards + _ML_TABLE: creditcards_ml + _VERTEX_DATASET_NAME: creditcards + _MODEL_DISPLAY_NAME: creditcards-classifier-v02 + _CI_TRAIN_LIMIT: '1000' + _CI_TEST_LIMIT: '100' + _CI_UPLOAD_MODEL: '0' + _CI_ACCURACY_THRESHOLD: '-0.1' + _BEAM_RUNNER: DataflowRunner + _TRAINING_RUNNER: vertex + _PIPELINE_NAME: creditcards-classifier-v02-train-pipeline + _PIPELINES_STORE: gs://${bucket_name}/creditcards/compiled_pipelines/ + _SUBNETWORK: ${subnetwork} + _SERVICE_ACCOUNT: ${sa_mlops} + _WORKDIR: ${github_repo} +options: + machineType: 'E2_HIGHCPU_8' + +availableSecrets: + secretManager: + - versionName: projects/${project_id}/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pipeline-deployment.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/pipeline-deployment.yaml.TEMPLATE new file mode 100644 index 0000000000..2e2ba7ac3b --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pipeline-deployment.yaml.TEMPLATE @@ -0,0 +1,161 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################# +# CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. +############################################################################# + +# Access the id_github file from Secret Manager, and setup SSH +steps: +- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + + +# Run datasource_utils unit tests. 
+- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/datasource_utils_tests.py', '-s'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'BQ_LOCATION=$_BQ_LOCATION' + - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' + - 'ML_TABLE=$_ML_TABLE' + id: 'Unit Test Datasource Utils' + waitFor: ['Clone Repository'] + + +# Run model unit tests. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/model_tests.py', '-s'] + dir: '$_WORKDIR' + id: 'Unit Test Model' + waitFor: ['Clone Repository'] + timeout: 1800s + + +# Test e2e pipeline using local runner. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'pytest' + args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s'] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'VERTEX_DATASET_NAME=$_VERTEX_DATASET_NAME' + - 'GCS_LOCATION=$_TEST_GCS_LOCATION' + - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT' + - 'TEST_LIMIT=$_CI_TEST_LIMIT' + - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL' + - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' + id: 'Local Test E2E Pipeline' + waitFor: ['Clone Repository'] + timeout: 1800s + +# Compile the pipeline. +- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'compile-pipeline', + '--pipeline-name', '$_PIPELINE_NAME' + ] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'VERTEX_DATASET_NAME=$_VERTEX_DATASET_NAME' + - 'GCS_LOCATION=$_GCS_LOCATION' + - 'DATAFLOW_IMAGE_URI=$_DATAFLOW_IMAGE_URI' + - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI' + - 'BEAM_RUNNER=$_BEAM_RUNNER' + - 'TRAINING_RUNNER=$_TRAINING_RUNNER' + - 'SERVICE_ACCOUNT=$_SERVICE_ACCOUNT' + - 'SUBNETWORK=$_SUBNETWORK' + - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' + + id: 'Compile Pipeline' + waitFor: ['Local Test E2E Pipeline', 'Unit Test Datasource Utils', 'Unit Test Model'] + + +# Upload compiled pipeline to GCS. 
+- name: 'gcr.io/cloud-builders/gsutil' + args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] + dir: '$_WORKDIR' + id: 'Upload Pipeline to GCS' + waitFor: ['Compile Pipeline'] + + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +logsBucket: '$_GCS_BUCKET' +timeout: 3600s +substitutions: + _REPO_URL: git@github.com:${github_org}/${github_repo} + _BRANCH: ${github_branch} + _REGION: ${region} + _PROJECT: ${project_id} + _GCS_BUCKET: ${project_id}_cloudbuild/logs + _CICD_IMAGE_URI: '${docker_repo}/cicd:latest' + _DATAFLOW_IMAGE_URI: '${docker_repo}/dataflow:latest' + _TFX_IMAGE_URI: '${docker_repo}/vertex:latest' + _GCS_LOCATION: 'gs://${project_id}/creditcards/' + _TEST_GCS_LOCATION: 'gs://${project_id}/creditcards/e2e_tests' + _BQ_LOCATION: ${region} + _BQ_DATASET_NAME: creditcards + _ML_TABLE: creditcards_ml + _VERTEX_DATASET_NAME: creditcards + _MODEL_DISPLAY_NAME: creditcards-classifier-v02 + _CI_TRAIN_LIMIT: '1000' + _CI_TEST_LIMIT: '100' + _CI_UPLOAD_MODEL: '0' + _CI_ACCURACY_THRESHOLD: '-0.1' + _BEAM_RUNNER: DataflowRunner + _TRAINING_RUNNER: vertex + _PIPELINE_NAME: creditcards-classifier-v02-train-pipeline + _PIPELINES_STORE: gs://${project_id}/creditcards/compiled_pipelines/ + _SUBNETWORK: ${subnetwork} + _SERVICE_ACCOUNT: ${sa_mlops} + _WORKDIR: ${github_repo} +options: + machineType: 'E2_HIGHCPU_8' + +availableSecrets: + secretManager: + - versionName: projects/${project_id}/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pipeline-run.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/pipeline-run.yaml.TEMPLATE new file mode 100644 index 0000000000..18414e36a8 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pipeline-run.yaml.TEMPLATE @@ -0,0 +1,93 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################# +# CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. +############################################################################# + +# Access the id_github file from Secret Manager, and setup SSH +steps: +- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + +# Run the pipeline. 
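+# The first step below installs the packages from src/pipeline_triggering/requirements.txt,
+# and the second step calls build/utils.py in run-pipeline mode. $_PIPELINE_PARAMS must be a
+# JSON object string (utils.py parses it with json.loads) and $_LABELS a comma-separated
+# list of key=value pairs.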
+- name: python:3.9 + entrypoint: pip + args: ["install", "-r", "src/pipeline_triggering/requirements.txt", "--user"] + dir: '$_WORKDIR' + id: 'Install python packages' + waitFor: ['Clone Repository'] + +- name: python:3.9 + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'run-pipeline', + '--project', '$_PROJECT', + '--region', '$_REGION', + '--pipelines-store', '$_PIPELINES_STORE', + '--pipeline-name', '$_PIPELINE_NAME', + '--service-account', '$_SERVICE_ACCOUNT', + '--parameter-values', '$_PIPELINE_PARAMS', + '--labels', '$_LABELS' + ] + dir: '$_WORKDIR' + id: 'Run Pipeline' + waitFor: ['Install python packages'] + + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +logsBucket: '$_GCS_BUCKET' +timeout: 7200s +substitutions: + _REPO_URL: git@github.com:${github_org}/${github_repo} + _BRANCH: ${github_branch} + _REGION: ${region} + _PROJECT: ${project_id} + _GCS_BUCKET: ${project_id}_cloudbuild/logs + _PIPELINE_NAME: ${pipeline_name} + _PIPELINE_PARAMS: '${pipeline_params}' + _SERVICE_ACCOUNT: ${sa_mlops} + _PIPELINES_STORE: gs://${bucket_name}/creditcards/compiled_pipelines/ + _WORKDIR: ${github_repo} + _LABELS: 'job=1' +options: + machineType: 'E2_HIGHCPU_8' + +availableSecrets: + secretManager: + - versionName: projects/${project_id}/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pipeline-test.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/build/pipeline-test.yaml.TEMPLATE new file mode 100644 index 0000000000..4dcb2c7689 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pipeline-test.yaml.TEMPLATE @@ -0,0 +1,123 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################# +# CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. +############################################################################# + +# Access the id_github file from Secret Manager, and setup SSH +steps: +- name: 'gcr.io/cloud-builders/git' + secretEnv: ['SSH_KEY'] + entrypoint: 'bash' + args: + - -c + - | + echo "$$SSH_KEY" >> /root/.ssh/id_rsa + chmod 400 /root/.ssh/id_rsa + ssh-keyscan -t rsa github.com > /root/.ssh/known_hosts + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Prepare git keys' + +# Clone the repository. +- name: 'gcr.io/cloud-builders/git' + args: ['clone', '--single-branch', '--branch', + '$_BRANCH', '$_REPO_URL', + '--depth', '1', + '--verbose'] + volumes: + - name: 'ssh' + path: /root/.ssh + id: 'Clone Repository' + waitFor: ['Prepare git keys'] + + + +# Compile the pipeline. 
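+# This template only compiles the pipeline and uploads it to GCS; the substitutions below are
+# placeholder values (GITHUB_ORG, GITHUB_REPO, PROJECT_ID-dev) that should be replaced with
+# real values before submitting the build.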
+- name: '$_CICD_IMAGE_URI' + entrypoint: 'python' + args: ['build/utils.py', + '--mode', 'compile-pipeline', + '--pipeline-name', '$_PIPELINE_NAME' + ] + dir: '$_WORKDIR' + env: + - 'PROJECT=$_PROJECT' + - 'REGION=$_REGION' + - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' + - 'VERTEX_DATASET_NAME=$_VERTEX_DATASET_NAME' + - 'GCS_LOCATION=$_GCS_LOCATION' + - 'DATAFLOW_IMAGE_URI=$_DATAFLOW_IMAGE_URI' + - 'TFX_IMAGE_URI=$_DATAFLOW_IMAGE_URI' + - 'BEAM_RUNNER=$_BEAM_RUNNER' + - 'TRAINING_RUNNER=$_TRAINING_RUNNER' + - 'SERVICE_ACCOUNT=$_SERVICE_ACCOUNT' + - 'SUBNETWORK=$_SUBNETWORK' + - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' + + + id: 'Compile Pipeline' + waitFor: ['Clone Repository'] + + +# Upload compiled pipeline to GCS. +- name: 'gcr.io/cloud-builders/gsutil' + args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] + dir: '$_WORKDIR' + id: 'Upload Pipeline to GCS' + waitFor: ['Compile Pipeline'] + + +serviceAccount: 'projects/$_PROJECT/serviceAccounts/$_SERVICE_ACCOUNT' +logsBucket: '$_GCS_BUCKET' +timeout: 3600s + +substitutions: + _REPO_URL: git@github.com:GITHUB_ORG/GITHUB_REPO + _BRANCH: main + _REGION: europe-west4 + _PROJECT: PROJECT_ID-dev + _GCS_BUCKET: PROJECT_ID-dev_cloudbuild/logs + _CICD_IMAGE_URI: 'europe-west4-docker.pkg.dev/PROJECT_ID-dev/docker-repo/cicd:latest' + _DATAFLOW_IMAGE_URI: 'europe-west4-docker.pkg.dev/PROJECT_ID-dev/docker-repo/dataflow:latest' + _TFX_IMAGE_URI: 'europe-west4-docker.pkg.dev/PROJECT_ID-dev/docker-repo/vertex:latest' + _GCS_LOCATION: 'gs://PROJECT_ID-dev/creditcards/' + _TEST_GCS_LOCATION: 'gs://PROJECT_ID-dev/creditcards/e2e_tests' + _BQ_LOCATION: EU + _BQ_DATASET_NAME: creditcards + _ML_TABLE: creditcards_ml + _VERTEX_DATASET_NAME: creditcards + _MODEL_DISPLAY_NAME: creditcards-classifier-v02 + _CI_TRAIN_LIMIT: '1000' + _CI_TEST_LIMIT: '100' + _CI_UPLOAD_MODEL: '0' + _CI_ACCURACY_THRESHOLD: '-0.1' + _BEAM_RUNNER: DataflowRunner + _TRAINING_RUNNER: vertex + _PIPELINE_NAME: creditcards-classifier-v02-train-pipeline, + _PIPELINES_STORE: gs://PROJECT_ID-dev/creditcards/compiled_pipelines/ + _SUBNETWORK: https://www.googleapis.com/compute/v1/projects/PROJECT_ID-dev/regions/europe-west4/subnetworks/default + _SERVICE_ACCOUNT: sa-mlops-dev@PROJECT_ID-dev.iam.gserviceaccount.com + _WORKDIR: GITHUB_REPO + +options: + machineType: 'E2_HIGHCPU_8' + substitution_option: 'ALLOW_LOOSE' + +availableSecrets: + secretManager: + - versionName: projects/PROJECT_ID-dev/secrets/github-key/versions/latest + env: 'SSH_KEY' diff --git a/examples/vertex_mlops_enterprise/build/pydist/files.txt b/examples/vertex_mlops_enterprise/build/pydist/files.txt new file mode 100644 index 0000000000..55b6b235f7 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/pydist/files.txt @@ -0,0 +1,23 @@ +# Packages required to get access to Artifact Registry without Internet Access +cachetools-5.2.0-py3-none-any.whl +certifi-2022.6.15-py3-none-any.whl +cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +charset_normalizer-2.1.0-py3-none-any.whl +cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl +google_auth-2.9.1-py2.py3-none-any.whl +idna-3.3-py3-none-any.whl +importlib_metadata-4.12.0-py3-none-any.whl +jeepney-0.8.0-py3-none-any.whl +keyring-23.7.0-py3-none-any.whl +keyrings.google_artifactregistry_auth-1.0.0-py3-none-any.whl +pluggy-1.0.0-py2.py3-none-any.whl +pyasn1-0.4.8-py2.py3-none-any.whl +pyasn1_modules-0.2.8-py2.py3-none-any.whl +pycparser-2.21-py2.py3-none-any.whl +requests-2.28.1-py3-none-any.whl +rsa-4.9-py3-none-any.whl 
+six-1.16.0-py2.py3-none-any.whl +SecretStorage-3.3.2-py3-none-any.whl +typing_extensions-4.3.0-py3-none-any.whl +urllib3-1.26.11-py2.py3-none-any.whl +zipp-3.8.1-py3-none-any.whl \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/build/serving_resources_spec.json b/examples/vertex_mlops_enterprise/build/serving_resources_spec.json new file mode 100644 index 0000000000..19707c5066 --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/serving_resources_spec.json @@ -0,0 +1,8 @@ +{ + "traffic_percentage": 100, + "machine_type": "n1-standard-2", + "min_replica_count": 1, + "max_replica_count": 1, + "accelerator_type": null, + "accelerator_count": null +} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/build/utils.py b/examples/vertex_mlops_enterprise/build/utils.py new file mode 100644 index 0000000000..864072d27c --- /dev/null +++ b/examples/vertex_mlops_enterprise/build/utils.py @@ -0,0 +1,254 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for deploying pipelines and models to Vertex AI.""" + + +import argparse +import os +import sys +import logging +import json + +from google.cloud import aiplatform as vertex_ai +from google.cloud import storage + + +SCRIPT_DIR = os.path.dirname( + os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) +) +sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) + +SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json' + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + '--mode', + type=str, + required=True + ) + + parser.add_argument( + '--project', + type=str, + ) + + parser.add_argument( + '--region', + type=str, + ) + + parser.add_argument( + '--endpoint-display-name', + type=str, + ) + + parser.add_argument( + '--model-display-name', + type=str, + ) + + parser.add_argument( + '--pipeline-name', + type=str, + ) + + parser.add_argument( + '--pipelines-store', + type=str, + ) + + parser.add_argument( + '--service-account', + type=str, + ) + + parser.add_argument( + '--parameter-values', + type=str, + ) + + parser.add_argument( + '--labels', + type=str, + ) + + return parser.parse_args() + + +def create_endpoint(project, region, endpoint_display_name): + logging.info(f"Creating endpoint {endpoint_display_name}") + vertex_ai.init( + project=project, + location=region + ) + + endpoints = vertex_ai.Endpoint.list( + filter=f'display_name={endpoint_display_name}', + order_by="update_time") + + if len(endpoints) > 0: + logging.info(f"Endpoint {endpoint_display_name} already exists.") + endpoint = endpoints[-1] + else: + endpoint = vertex_ai.Endpoint.create(endpoint_display_name) + logging.info(f"Endpoint is ready.") + logging.info(endpoint.gca_resource) + return endpoint + + +def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec): + logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}") + vertex_ai.init( + project=project, 
+ location=region + ) + + model = vertex_ai.Model.list( + filter=f'display_name={model_display_name}', + order_by="update_time" + )[-1] + + endpoint = vertex_ai.Endpoint.list( + filter=f'display_name={endpoint_display_name}', + order_by="update_time" + )[-1] + + deployed_model = endpoint.deploy(model=model, **serving_resources_spec) + logging.info(f"Model is deployed.") + logging.info(deployed_model) + return deployed_model + + +def compile_pipeline(pipeline_name): + from src.tfx_pipelines import runner + pipeline_definition_file = f"{pipeline_name}.json" + pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file) + return pipeline_definition + +def run_pipeline(project, region, service_account, pipelines_store, pipeline_name, parameter_values, labels_str): + if labels_str: + # Converting string into dictionary using dict comprehension + labels = dict(item.split("=") for item in labels_str.split(",")) + #labels=json.loads(labels_str) + + storage_client = storage.Client() + + gcs_pipeline_file_location = pipelines_store if pipelines_store.endswith("/") else pipelines_store + "/" + gcs_pipeline_file_location = gcs_pipeline_file_location + pipeline_name + ".json" + + path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/") + bucket_name = path_parts[0] + blob_name = "/".join(path_parts[1:]) + + bucket = storage_client.bucket(bucket_name) + blob = storage.Blob(bucket=bucket, name=blob_name) + + if not blob.exists(storage_client): + raise ValueError(f"{pipelines_store}/{pipeline_name} does not exist.") + + parameter_values_json = json.loads(parameter_values) + print(f'Input: {parameter_values_json}') + print(f'JSON: {parameter_values_json}') + + job = vertex_ai.PipelineJob(display_name = pipeline_name, + template_path = gcs_pipeline_file_location, + parameter_values = parameter_values_json, + project = project, + location = region, + labels = labels) + + response = job.submit(service_account=service_account, + network=None) + + job.wait() + print(f'Job finished with state: {job.state}') + + return response + + +def main(): + args = get_args() + + if args.mode == 'create-endpoint': + if not args.project: + raise ValueError("project must be supplied.") + if not args.region: + raise ValueError("region must be supplied.") + if not args.endpoint_display_name: + raise ValueError("endpoint_display_name must be supplied.") + + result = create_endpoint( + args.project, + args.region, + args.endpoint_display_name + ) + + elif args.mode == 'deploy-model': + if not args.project: + raise ValueError("project must be supplied.") + if not args.region: + raise ValueError("region must be supplied.") + if not args.endpoint_display_name: + raise ValueError("endpoint-display-name must be supplied.") + if not args.model_display_name: + raise ValueError("model-display-name must be supplied.") + + with open(SERVING_SPEC_FILEPATH) as json_file: + serving_resources_spec = json.load(json_file) + logging.info(f"serving resources: {serving_resources_spec}") + result = deploy_model( + args.project, + args.region, + args.endpoint_display_name, + args.model_display_name, + serving_resources_spec + ) + + elif args.mode == 'compile-pipeline': + if not args.pipeline_name: + raise ValueError("pipeline-name must be supplied.") + result = compile_pipeline(args.pipeline_name) + elif args.mode == 'run-pipeline': + if not args.project: + raise ValueError("project must be supplied.") + if not args.region: + raise ValueError("region must be supplied.") + if not args.pipelines_store: + raise 
ValueError("pipelines-store must be supplied.")
+    if not args.pipeline_name:
+        raise ValueError("pipeline-name must be supplied.")
+    if not args.service_account:
+        raise ValueError("service-account must be supplied.")
+    if not args.parameter_values:
+        raise ValueError("parameter-values must be supplied.")
+
+        result = run_pipeline(
+            args.project,
+            args.region,
+            args.service_account,
+            args.pipelines_store,
+            args.pipeline_name,
+            args.parameter_values,
+            args.labels)
+    else:
+        raise ValueError(f"Invalid mode {args.mode}.")
+
+    logging.info(result)
+
+
+if __name__ == "__main__":
+    main()
+
\ No newline at end of file
diff --git a/examples/vertex_mlops_enterprise/doc/01-ENVIRONMENTS.md b/examples/vertex_mlops_enterprise/doc/01-ENVIRONMENTS.md
new file mode 100644
index 0000000000..2b1ac745a9
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/doc/01-ENVIRONMENTS.md
@@ -0,0 +1,102 @@
+# MLOps with Vertex AI - Infra setup
+
+## Introduction
+This example implements the infrastructure required to deploy an end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using the [Vertex AI](https://cloud.google.com/vertex-ai) platform.
+
+
+## GCP resources
+A Terraform script is provided to set up all the required resources:
+
+- GCP project to host all the resources
+- Isolated VPC network and a subnet to be used by Vertex and Dataflow (using a Shared VPC is also possible).
+- Firewall rule to allow the internal subnet communication required by Dataflow
+- Cloud NAT required to reach the internet from the different computing resources (Vertex and Dataflow)
+- GCS buckets to host Vertex AI and Cloud Build artifacts.
+- BigQuery dataset where the training data will be stored
+- Service account `mlops-[env]@` with the minimum permissions required by Vertex and Dataflow
+- Service account `github` to be used by Workload Identity Federation, to federate the Github identity.
+- Secret to store the Github SSH key used to access the CI/CD code repo (you will set the secret value later).
+
+![MLOps project description](./images/mlops_projects.png "MLOps project description")
+
+## Prerequisites
+
+### User groups
+
+User groups provide a stable frame of reference that allows decoupling the final set of permissions from the stage where entities and resources are created, and their IAM bindings defined. These groups should be created before launching Terraform.
+
+We use the following groups to control access to resources:
+
+- *Data Scientists* (gcp-ml-ds@). They create ML pipelines in the experimentation environment.
+- *ML Engineers* (gcp-ml-eng@). They handle and run the different environments, with access to all resources in order to troubleshoot possible issues with pipelines.
+
+These example groups are not suitable for production-grade environments. You can configure the group names through the `groups` variable.
+
+### Git environment for the ML Pipelines
+
+Clone the Google Cloud Professional Services [repo](https://github.com/GoogleCloudPlatform/professional-services) to a temporary directory:
+```
+git clone https://github.com/GoogleCloudPlatform/professional-services.git
+cd professional-services/
+```
+
+Set up your new Github repo using the Github web console or CLI.
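+
+For example, with the Github CLI (`gh`) you could create it as a private repository. This is only a sketch; `YOUR_ORG/YOUR_REPO` is a placeholder for your own organization and repository name:
+
+```
+gh auth login
+gh repo create YOUR_ORG/YOUR_REPO --private
+```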
+
+Copy the `vertex_mlops_enterprise` folder to your local folder, including the Github Actions workflows:
+
+```
+cp -R ./examples/vertex_mlops_enterprise/* ./
+cp -R ./examples/vertex_mlops_enterprise/.github ./
+```
+
+Commit the files in the main branch (`main`):
+```
+git init
+git add *
+git commit -m "first commit"
+git branch -M main
+git remote add origin https://github.com//.git
+git push -u origin main
+```
+You will need to configure the Github organization and repo name in the `github` variable.
+
+### Branches
+Create the additional branches in Github (`dev`, `staging`, `prod`). This can also be done from the UI (`Create branch: dev from main`).
+
+Pull the remote repo with `git pull`.
+
+Check out the `dev` branch with `git checkout dev`.
+
+Review the `*.yml` files in the `.github/workflows` folder and modify them if needed. These files should be updated automatically when Terraform is applied.
+
+Review the `*.yaml` files in the `build` folder and modify them if needed. These files should be updated automatically when Terraform is applied.
+
+## Instructions
+### Deploy the different environments
+
+You will need to repeat this process for each one of the different environments (01-development, 02-staging, 03-production):
+
+- Go to the environment folder, e.g. `cd ../terraform/01-dev`
+- In the file `providers.tf`, set the name of a bucket that you want to use as the storage for your Terraform state. This should be an existing bucket that your user has access to.
+- Create a `terraform.tfvars` file and specify the required variables. You can use `terraform.tfvars.sample` as a starting point
+
+```tfm
+project_create = {
+  billing_account_id = "000000-123456-123456"
+  parent             = "folders/111111111111"
+}
+project_id = "creditcards-dev"
+```
+- Make sure you fill in the following parameters:
+  - `project_create.billing_account_id`: Billing account
+  - `project_create.parent`: Parent folder where the project will be created.
+  - `project_id`: Project id; references an existing project if `project_create` is null.
+- Make sure you have the right authentication set up (application default credentials, or a service account key)
+- Run `terraform init` and `terraform apply`
+- It is possible that errors like `googleapi: Error 400: Service account xxxx does not exist.` appear. This is due to dependencies on the project IAM authoritative bindings of the service accounts. In this case, re-run `terraform apply`
+
+## What's next?
+Continue [configuring the Git integration with Cloud Build](./02-GIT_SETUP.md) and [launching the MLOps pipeline](./03-MLOPS.md).
+
+
+
diff --git a/examples/vertex_mlops_enterprise/doc/02-GIT_SETUP.md b/examples/vertex_mlops_enterprise/doc/02-GIT_SETUP.md
new file mode 100644
index 0000000000..f3351af36c
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/doc/02-GIT_SETUP.md
@@ -0,0 +1,27 @@
+# MLOps with Vertex AI - Git integration with Cloud Build
+
+## Accessing GitHub from Cloud Build via SSH keys
+
+Follow this procedure to create a private SSH key to be used for Github access from Cloud Build:
+https://cloud.google.com/build/docs/access-github-from-build
+
+```
+mkdir workingdir
+cd workingdir
+ssh-keygen -t rsa -b 4096 -N '' -f id_github -C 
+```
+
+This command creates a new SSH key `id_github`.
+
+Add the public SSH key `id_github.pub` to your private repository's deploy keys.
+
+Store the private SSH key in Secret Manager, in the `github-key` secret.
+
+`gcloud secrets versions add github-key --data-file=./id_github --project=$PROJECT_ID`
+
+After storing the secret, you can remove the local key file:
+
+```
+cd ..
+rm -rf workingdir
+```
\ No newline at end of file
diff --git a/examples/vertex_mlops_enterprise/doc/03-MLOPS.md b/examples/vertex_mlops_enterprise/doc/03-MLOPS.md
new file mode 100644
index 0000000000..c6db45dc09
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/doc/03-MLOPS.md
@@ -0,0 +1,96 @@
+# MLOps with Vertex AI
+
+## Set up the experimentation notebook
+
+Once the environment has been deployed, the first step is to open the Jupyter notebook available in the [Vertex Workbench section](https://console.cloud.google.com/vertex-ai/workbench/list/managed), under the specific region (e.g. `europe-west4`).
+Use the `OPEN JUPYTERLAB` button to launch the notebook. Once it is ready, you can use the menu option `Git -> Clone a Repository` to clone the Github repo.
+
+
+## Set up the required tables
+
+For the Vertex MLOps end-to-end example we will use the public dataset `bigquery-public-data:ml_datasets.ulb_fraud_detection` that contains anonymized credit card transactions made over 2 days in September 2013 by European cardholders, with 492 frauds out of 284,807 transactions.
+
+```
+Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015
+```
+
+If the destination dataset is located in a different region from the source dataset (US) you will need to copy the data to the desired region. You can use the Data Transfer Service or an extract-and-load procedure such as the following one.
+The script `create_tables.sh` has also been provided for convenience.
+You will need to repeat this procedure for each environment. In a production environment, it will be necessary to modify the pipeline to access the correct BigQuery dataset.
+
+```
+#Set up env vars
+PROJECT=
+SRC_TABLE=bigquery-public-data:ml_datasets.ulb_fraud_detection
+BQ_DATASET_NAME=creditcards
+BQ_SOURCE_TABLE=creditcards
+ML_TABLE=creditcards_ml
+DST_TABLE=$BQ_DATASET_NAME.$BQ_SOURCE_TABLE
+BUCKET=gs://$PROJECT/data/credit_cards*
+
+#Extract & Load
+bq extract --project_id $PROJECT --destination_format PARQUET $SRC_TABLE $BUCKET
+bq load --project_id $PROJECT --source_format=PARQUET --replace=true $DST_TABLE $BUCKET
+gsutil rm $BUCKET
+```
+
+Next, create the base table that we will use for the ML process:
+```
+sql_script="CREATE OR REPLACE TABLE \`${PROJECT}.${BQ_DATASET_NAME}.${ML_TABLE}\`
+AS (
+  SELECT
+    * EXCEPT(Class),
+    CAST(Class AS FLOAT64) as Class,
+    IF(ABS(MOD(FARM_FINGERPRINT(CAST(Time AS STRING)), 100)) <= 80, 'UNASSIGNED', 'TEST') AS ML_use
+  FROM
+    \`${PROJECT}.${BQ_DATASET_NAME}.${BQ_SOURCE_TABLE}\`
+)
+"
+
+bq query --project_id $PROJECT --nouse_legacy_sql "$sql_script"
+```
+
+## Set up the Vertex managed Dataset
+Run the following commands to set up the Vertex Dataset.
+
+```
+
+bq_uri="bq://${PROJECT}.${BQ_DATASET_NAME}.${ML_TABLE}"
+echo ${bq_uri}
+
+echo "{
+  \"display_name\": \"creditcards\",
+  \"metadata_schema_uri\": \"gs://google-cloud-aiplatform/schema/dataset/metadata/tabular_1.0.0.yaml\",
+  \"metadata\": {
+    \"input_config\": {
+      \"bigquery_source\" :{
+        \"uri\": \"${bq_uri}\"
+      }
+    }
+  }
+}" > request.json
+
+
+REGION=europe-west4
+ENDPOINT=$REGION-aiplatform.googleapis.com
+
+curl -X POST \
+-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
+-H "Content-Type: application/json; charset=utf-8" \
+-d @request.json \
+"https://${ENDPOINT}/v1/projects/${PROJECT}/locations/${REGION}/datasets"
+
+```
+
+
+## Test the build process
+You can test the overall build process from the Github Actions section.
+- **Build Containers**: This action will create the different Docker containers that will be used during the Vertex AI pipeline compilation and execution.
+- **Build Vertex AI pipeline**: This action will run the unit tests and, if they execute successfully, it will compile the Vertex pipeline.
+- **Run Vertex AI pipeline**: This action will execute the Vertex pipeline. Please note that the first time it is possible that the pipeline fails with `Error: Vertex AI Service Agent`. Just re-run the pipeline and it should work.
+- **Deploy model**: This action will deploy the model to a Vertex AI endpoint.
+
+
+## Troubleshooting
+
+See [Issues](./ISSUES.md)
diff --git a/examples/vertex_mlops_enterprise/doc/ISSUES.md b/examples/vertex_mlops_enterprise/doc/ISSUES.md
new file mode 100644
index 0000000000..98dcd615c4
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/doc/ISSUES.md
@@ -0,0 +1,15 @@
+
+# Issues when running Github actions:
+```
+ERROR: (gcloud.builds.submit) There was a problem refreshing your current auth tokens: ('Unable to acquire impersonated credentials', '{\n "error": {\n "code": 403,\n "message": "Permission \'iam.serviceAccounts.getAccessToken\' denied on resource (or it may not exist).",\n "status": "PERMISSION_DENIED",\n "details": [\n {\n "@type": "type.googleapis.com/google.rpc.ErrorInfo",\n "reason": "IAM_PERMISSION_DENIED",\n "domain": "iam.googleapis.com",\n "metadata": {\n "permission": "iam.serviceAccounts.getAccessToken"\n }\n }\n ]\n }\n}\n')
+```
+
+Make sure that the `github` variable has been set up correctly: `organization` or `repo` should match the Github user/organization and repo name.
+
+
+
+# Issues when running Vertex Pipeline:
+`Failed to create pipeline job. Error: Vertex AI Service Agent service-nnnnn@gcp-sa-aiplatform-cc.iam.gserviceaccount.com does not have permission to access Artifact Registry repository projects/PROJECT_ID/locations/europe-west4/repositories/docker-repo`
+
+This happens the first time a Vertex Pipeline job runs, because the Vertex AI service agent is not yet enabled. Re-run the trigger to launch the job.
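+
+Alternatively, instead of waiting and re-running, you can grant the Vertex AI service agent access to the repository explicitly. This is a sketch; replace `PROJECT_ID`, `PROJECT_NUMBER` and the region/repository name with your own values:
+
+```
+gcloud artifacts repositories add-iam-policy-binding docker-repo \
+  --project=PROJECT_ID \
+  --location=europe-west4 \
+  --member="serviceAccount:service-PROJECT_NUMBER@gcp-sa-aiplatform-cc.iam.gserviceaccount.com" \
+  --role="roles/artifactregistry.reader"
+```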
+ diff --git a/examples/vertex_mlops_enterprise/doc/VPC-SC.md b/examples/vertex_mlops_enterprise/doc/VPC-SC.md new file mode 100644 index 0000000000..ecc28037f5 --- /dev/null +++ b/examples/vertex_mlops_enterprise/doc/VPC-SC.md @@ -0,0 +1,12 @@ +# Considerations with VPC SC +## Cloud Build +Use Cloud Build [private pools](https://cloud.google.com/build/docs/private-pools/using-vpc-service-controls) or create a VPC SC ingress rule or [access level](https://cloud.google.com/access-context-manager/docs/create-basic-access-level#members-example) adding the Cloud Build Service Account (PROJECT_NUMBER@cloudbuild.gserviceaccount.com) + + + + +## Github +Create a VPC SC ingress rule adding Github service account (sa-github@PROJECT_ID.iam.gserviceaccount.com) or [access level](https://cloud.google.com/access-context-manager/docs/create-basic-access-level#members-example) + + + diff --git a/examples/vertex_mlops_enterprise/doc/create_tables.sh b/examples/vertex_mlops_enterprise/doc/create_tables.sh new file mode 100644 index 0000000000..1af9f3e1b0 --- /dev/null +++ b/examples/vertex_mlops_enterprise/doc/create_tables.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Copyright 2023 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#Set up env vars +PROJECT="$(gcloud config get-value project)" +SRC_TABLE=bigquery-public-data:ml_datasets.ulb_fraud_detection +BQ_DATASET_NAME=creditcards +BQ_SOURCE_TABLE=creditcards +ML_TABLE=creditcards_ml +DST_TABLE=$BQ_DATASET_NAME.$BQ_SOURCE_TABLE +BUCKET="gs://$PROJECT/data/credit_cards*" +REGION=europe-west4 +ENDPOINT="$REGION-aiplatform.googleapis.com" + +#Extract & Load +bq extract --project_id "$PROJECT" --destination_format PARQUET "$SRC_TABLE" "$BUCKET" +bq load --project_id "$PROJECT" --source_format=PARQUET --replace=true "$DST_TABLE" "$BUCKET" +gsutil rm "$BUCKET" + + +sql_script="CREATE OR REPLACE TABLE \`${PROJECT}.${BQ_DATASET_NAME}.${ML_TABLE}\` +AS ( + SELECT * EXCEPT(Class), CAST(Class AS FLOAT64) as Class, + IF(ABS(MOD(FARM_FINGERPRINT(CAST(Time AS STRING)), 100)) <= 80, 'UNASSIGNED', 'TEST') AS ML_use + FROM + \`${PROJECT}.${BQ_DATASET_NAME}.${BQ_SOURCE_TABLE}\` +) +" + +bq query --project_id "$PROJECT" --nouse_legacy_sql "$sql_script" + +bq_uri="bq://${PROJECT}.${BQ_DATASET_NAME}.${ML_TABLE}" +echo "Creating Bigquery Managed Dataset: ${bq_uri}" +echo "{ + \"display_name\": \"creditcards\", + \"metadata_schema_uri\": \"gs://google-cloud-aiplatform/schema/dataset/metadata/tabular_1.0.0.yaml\", + \"metadata\": { + \"input_config\": { + \"bigquery_source\" :{ + \"uri\": \"${bq_uri}\" + } + } + } +}" > request.json + + +curl -X POST \ +-H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \ +-H "Content-Type: application/json; charset=utf-8" \ +-d @request.json \ +"https://${ENDPOINT}/v1/projects/${PROJECT}/locations/${REGION}/datasets" diff --git a/examples/vertex_mlops_enterprise/doc/images/mlops_projects.png b/examples/vertex_mlops_enterprise/doc/images/mlops_projects.png new file mode 100644 index 0000000000..25482a5e2c 
Binary files /dev/null and b/examples/vertex_mlops_enterprise/doc/images/mlops_projects.png differ diff --git a/examples/vertex_mlops_enterprise/doc/request.json b/examples/vertex_mlops_enterprise/doc/request.json new file mode 100644 index 0000000000..8459942449 --- /dev/null +++ b/examples/vertex_mlops_enterprise/doc/request.json @@ -0,0 +1,11 @@ +{ + "display_name": "creditcards", + "metadata_schema_uri": "gs://google-cloud-aiplatform/schema/dataset/metadata/tabular_1.0.0.yaml", + "metadata": { + "input_config": { + "bigquery_source" :{ + "uri": "bq://cxt4-creditcards-dev.creditcards.creditcards_ml" + } + } + } +} diff --git a/examples/vertex_mlops_enterprise/mainconfig.yaml.TEMPLATE b/examples/vertex_mlops_enterprise/mainconfig.yaml.TEMPLATE new file mode 100644 index 0000000000..9a86c776ac --- /dev/null +++ b/examples/vertex_mlops_enterprise/mainconfig.yaml.TEMPLATE @@ -0,0 +1,26 @@ +creditcards: + project: '${project_id}' + region: 'europe-west4' + docker_repo: '${docker_repo}' + service_account: '${sa_mlops}' + vertex_dataset_name: 'creditcards' + raw_schema_dir: 'src/raw_schema' + bucket: '${project_id}' + limit: 5000 + version: 'v02' + cloudfunction_region: '${region}' + artifactregistry_region: '${region}' + python_pkg_repo: 'pyrepo' + bq: + dataset: 'creditcards' + location: '${region}' + source_table: 'creditcards' + ml_table: 'creditcards_ml' + dataflow: + subnet: '${subnetwork}' + service_account: '${sa_mlops}' + featurestore_id: 'creditcards' + git: + repo_url: "git@github.com:${github_org}/${github_repo}" + branch: "${github_branch}" + diff --git a/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.cicd b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.cicd new file mode 100644 index 0000000000..304261dc19 --- /dev/null +++ b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.cicd @@ -0,0 +1,38 @@ +FROM gcr.io/tfx-oss-public/tfx:1.8.0 + +#RUN pip install keyring keyrings.google-artifactregistry-auth + +COPY pip.conf pip.conf +ENV PIP_CONFIG_FILE=pip.conf + +ADD pydist pydist +# These need to go in order so we don't hit unsatisfied deps +RUN pip install -vv pydist/jeepney-0.8.0-py3-none-any.whl +RUN pip install -vv pydist/typing_extensions-4.3.0-py3-none-any.whl +RUN pip install -vv pydist/zipp-3.8.1-py3-none-any.whl +RUN pip install -vv pydist/importlib_metadata-4.12.0-py3-none-any.whl +RUN pip install -vv pydist/pycparser-2.21-py2.py3-none-any.whl +RUN pip install -vv pydist/cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +RUN pip install -vv pydist/cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl +RUN pip install -vv pydist/SecretStorage-3.3.2-py3-none-any.whl +RUN pip install -vv pydist/keyring-23.7.0-py3-none-any.whl + +RUN pip install -vv pydist/pyasn1-0.4.8-py2.py3-none-any.whl +RUN pip install -vv pydist/pyasn1_modules-0.2.8-py2.py3-none-any.whl +RUN pip install -vv pydist/rsa-4.9-py3-none-any.whl +RUN pip install -vv pydist/cachetools-5.2.0-py3-none-any.whl +RUN pip install -vv pydist/six-1.16.0-py2.py3-none-any.whl +RUN pip install -vv pydist/google_auth-2.9.1-py2.py3-none-any.whl +RUN pip install -vv pydist/certifi-2022.6.15-py3-none-any.whl +RUN pip install -vv pydist/charset_normalizer-2.1.0-py3-none-any.whl +RUN pip install -vv pydist/idna-3.3-py3-none-any.whl +RUN pip install -vv pydist/urllib3-1.26.11-py2.py3-none-any.whl +RUN pip install -vv pydist/requests-2.28.1-py3-none-any.whl +RUN pip install -vv pydist/pluggy-1.0.0-py2.py3-none-any.whl +RUN pip install -vv 
pydist/keyrings.google_artifactregistry_auth-1.0.0-py3-none-any.whl + +RUN keyring --list-backends + +RUN pip install -U pip +RUN pip install google-cloud-aiplatform==1.14.0 google-cloud-aiplatform[tensorboard] +RUN pip install pytest kfp==1.8.12 google-cloud-bigquery==2.34.3 google-cloud-bigquery-storage==2.13.2 google-cloud-aiplatform==1.14.0 \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.dataflow b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.dataflow new file mode 100644 index 0000000000..be3df46398 --- /dev/null +++ b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.dataflow @@ -0,0 +1,44 @@ +FROM apache/beam_python3.7_sdk:2.39.0 + +#RUN pip install keyring keyrings.google-artifactregistry-auth + + +COPY build/pip.conf pip.conf +ENV PIP_CONFIG_FILE=pip.conf + + +ADD build/pydist pydist +# These need to go in order so we don't hit unsatisfied deps +RUN pip install -vv pydist/jeepney-0.8.0-py3-none-any.whl +RUN pip install -vv pydist/typing_extensions-4.3.0-py3-none-any.whl +RUN pip install -vv pydist/zipp-3.8.1-py3-none-any.whl +RUN pip install -vv pydist/importlib_metadata-4.12.0-py3-none-any.whl +RUN pip install -vv pydist/pycparser-2.21-py2.py3-none-any.whl +RUN pip install -vv pydist/cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +RUN pip install -vv pydist/cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl +RUN pip install -vv pydist/SecretStorage-3.3.2-py3-none-any.whl +RUN pip install -vv pydist/keyring-23.7.0-py3-none-any.whl + +RUN pip install -vv pydist/pyasn1-0.4.8-py2.py3-none-any.whl +RUN pip install -vv pydist/pyasn1_modules-0.2.8-py2.py3-none-any.whl +RUN pip install -vv pydist/rsa-4.9-py3-none-any.whl +RUN pip install -vv pydist/cachetools-5.2.0-py3-none-any.whl +RUN pip install -vv pydist/six-1.16.0-py2.py3-none-any.whl +RUN pip install -vv pydist/google_auth-2.9.1-py2.py3-none-any.whl +RUN pip install -vv pydist/certifi-2022.6.15-py3-none-any.whl +RUN pip install -vv pydist/charset_normalizer-2.1.0-py3-none-any.whl +RUN pip install -vv pydist/idna-3.3-py3-none-any.whl +RUN pip install -vv pydist/urllib3-1.26.11-py2.py3-none-any.whl +RUN pip install -vv pydist/requests-2.28.1-py3-none-any.whl +RUN pip install -vv pydist/pluggy-1.0.0-py2.py3-none-any.whl +RUN pip install -vv pydist/keyrings.google_artifactregistry_auth-1.0.0-py3-none-any.whl + + +RUN keyring --list-backends + + +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +COPY src/raw_schema/schema.pbtxt raw_schema/ +COPY src/ src/ diff --git a/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.vertex b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.vertex new file mode 100644 index 0000000000..b89e7a7b1f --- /dev/null +++ b/examples/vertex_mlops_enterprise/no_internet_access/Dockerfile.vertex @@ -0,0 +1,49 @@ +FROM gcr.io/tfx-oss-public/tfx:1.8.0 + + +#RUN pip install keyring keyrings.google-artifactregistry-auth + + +COPY build/pip.conf pip.conf +ENV PIP_CONFIG_FILE=pip.conf + + +ADD build/pydist pydist +# These need to go in order so we don't hit unsatisfied deps +RUN pip install pydist/jeepney-0.8.0-py3-none-any.whl +RUN pip install pydist/typing_extensions-4.3.0-py3-none-any.whl +RUN pip install pydist/zipp-3.8.1-py3-none-any.whl +RUN pip install pydist/importlib_metadata-4.12.0-py3-none-any.whl +RUN pip install pydist/pycparser-2.21-py2.py3-none-any.whl +RUN pip install 
pydist/cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN pip install pydist/cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl
+RUN pip install pydist/SecretStorage-3.3.2-py3-none-any.whl
+RUN pip install pydist/keyring-23.7.0-py3-none-any.whl
+
+RUN pip install pydist/pyasn1-0.4.8-py2.py3-none-any.whl
+RUN pip install pydist/pyasn1_modules-0.2.8-py2.py3-none-any.whl
+RUN pip install pydist/rsa-4.9-py3-none-any.whl
+RUN pip install pydist/cachetools-5.2.0-py3-none-any.whl
+RUN pip install pydist/six-1.16.0-py2.py3-none-any.whl
+RUN pip install pydist/google_auth-2.9.1-py2.py3-none-any.whl
+RUN pip install pydist/certifi-2022.6.15-py3-none-any.whl
+RUN pip install pydist/charset_normalizer-2.1.0-py3-none-any.whl
+RUN pip install pydist/idna-3.3-py3-none-any.whl
+RUN pip install pydist/urllib3-1.26.11-py2.py3-none-any.whl
+RUN pip install pydist/requests-2.28.1-py3-none-any.whl
+RUN pip install pydist/pluggy-1.0.0-py2.py3-none-any.whl
+RUN pip install pydist/keyrings.google_artifactregistry_auth-1.0.0-py3-none-any.whl
+
+
+COPY requirements.txt requirements.txt
+
+RUN pip install -r requirements.txt
+
+# RuntimeError: module compiled against api version 0xe but this version of numpy is 0xd
+# Fixed by below command - see https://stackoverflow.com/questions/33859531/runtimeerror-module-compiled-against-api-version-a-but-this-version-of-numpy-is
+
+RUN pip install -U numpy --ignore-installed
+
+COPY src/ src/
+
+ENV PYTHONPATH="/pipeline:${PYTHONPATH}"
diff --git a/examples/vertex_mlops_enterprise/no_internet_access/README.md b/examples/vertex_mlops_enterprise/no_internet_access/README.md
new file mode 100644
index 0000000000..d45e2181d3
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/no_internet_access/README.md
@@ -0,0 +1,2 @@
+If there is a requirement of no internet connectivity, download the files listed in `pkgs_to_install.txt` and use these Dockerfiles.
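+
+For example, the wheels can be pre-downloaded with `pip download` on a machine that does have internet access. This is only a sketch; pin the versions from `pkgs_to_install.txt` and place the files in the `pydist` folder that the Dockerfiles expect:
+
+```
+mkdir -p pydist
+pip download --no-deps -d pydist/ \
+    keyring==23.7.0 keyrings.google-artifactregistry-auth==1.0.0 \
+    google-auth==2.9.1 requests==2.28.1 urllib3==1.26.11
+# ...and so on for the remaining wheels listed in pkgs_to_install.txt
+```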
+ diff --git a/examples/vertex_mlops_enterprise/no_internet_access/pkgs_to_install.txt b/examples/vertex_mlops_enterprise/no_internet_access/pkgs_to_install.txt new file mode 100644 index 0000000000..8c6af70dd6 --- /dev/null +++ b/examples/vertex_mlops_enterprise/no_internet_access/pkgs_to_install.txt @@ -0,0 +1,22 @@ +jeepney-0.8.0-py3-none-any.whl +typing_extensions-4.3.0-py3-none-any.whl +zipp-3.8.1-py3-none-any.whl +importlib_metadata-4.12.0-py3-none-any.whl +pycparser-2.21-py2.py3-none-any.whl +cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +cryptography-37.0.4-cp36-abi3-manylinux_2_24_x86_64.whl +SecretStorage-3.3.2-py3-none-any.whl +keyring-23.7.0-py3-none-any.whl +pyasn1-0.4.8-py2.py3-none-any.whl +pyasn1_modules-0.2.8-py2.py3-none-any.whl +rsa-4.9-py3-none-any.whl +cachetools-5.2.0-py3-none-any.whl +six-1.16.0-py2.py3-none-any.whl +google_auth-2.9.1-py2.py3-none-any.whl +certifi-2022.6.15-py3-none-any.whl +charset_normalizer-2.1.0-py3-none-any.whl +idna-3.3-py3-none-any.whl +urllib3-1.26.11-py2.py3-none-any.whl +requests-2.28.1-py3-none-any.whl +pluggy-1.0.0-py2.py3-none-any.whl +keyrings.google_artifactregistry_auth-1.0.0-py3-none-any.whl diff --git a/examples/vertex_mlops_enterprise/requirements.txt b/examples/vertex_mlops_enterprise/requirements.txt new file mode 100644 index 0000000000..5894c536fa --- /dev/null +++ b/examples/vertex_mlops_enterprise/requirements.txt @@ -0,0 +1,18 @@ +kfp==1.8.12 +google-cloud-bigquery==2.34.3 +google-cloud-bigquery-storage==2.13.2 +#google-cloud-aiplatform==1.14.0 +google-cloud-aiplatform==1.15.0 +google-cloud-pubsub +cloudml-hypertune==0.1.0.dev6 +pytest==7.1.2 +#tensorflow==2.8.2 +tensorflow-data-validation==1.8.0 +tensorflow-transform==1.8.0 +tfx==1.8.0 +tensorflow-io==0.26.0 +apache-beam[gcp]==2.39.0 +numpy==1.22.0 +flask==2.2.5 +protobuf==3.20.3 +tensorflow-model-analysis==0.39.0 diff --git a/examples/vertex_mlops_enterprise/src/__init__.py b/examples/vertex_mlops_enterprise/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/vertex_mlops_enterprise/src/common/datasource_utils.py b/examples/vertex_mlops_enterprise/src/common/datasource_utils.py new file mode 100644 index 0000000000..b617100740 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/common/datasource_utils.py @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utilities for generating BigQuery data querying scirpts.""" + + +from google.cloud import aiplatform as vertex_ai + + +def get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None): + query = f""" + SELECT * + """ + + if not ml_use: + query += f""" + EXCEPT (Time, ML_use, Class) + FROM {bq_dataset_name}.{bq_table_name} + """ + else: + query += f""" + EXCEPT (Time, ML_use) + FROM {bq_dataset_name}.{bq_table_name} + WHERE ML_use = '{ml_use}' + """ + + if limit: + query += f"LIMIT {limit}" + + return query + + +def get_training_source_query( + project, region, dataset_display_name, ml_use, limit=None +): + vertex_ai.init(project=project, location=region) + + dataset = vertex_ai.TabularDataset.list( + filter=f"display_name={dataset_display_name}", order_by="update_time" + )[-1] + bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][ + "uri" + ] + _, bq_dataset_name, bq_table_name = bq_source_uri.replace("g://", "").split(".") + + return get_source_query(bq_dataset_name, bq_table_name, ml_use, limit) + + +def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None): + return get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit) diff --git a/examples/vertex_mlops_enterprise/src/common/features.py b/examples/vertex_mlops_enterprise/src/common/features.py new file mode 100644 index 0000000000..3bd3033ab4 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/common/features.py @@ -0,0 +1,52 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Model features metadata utils.""" + + +TARGET_FEATURE_NAME = "Class" +TARGET_LABELS = ["legit", "fraudulent"] + + +def generate_explanation_config(transform_feature_spec=None): + explanation_config = { + "inputs": {}, + "outputs": {}, + "params": {"sampled_shapley_attribution": {"path_count": 10}}, + } + + if transform_feature_spec is None: + # hardcoded + for i in range(28): + feature_name = f'V{i+1}' + explanation_config["inputs"][feature_name] = { + "input_tensor_name": feature_name, + "modality": "numeric", + } + feature_name = 'Amount' + explanation_config["inputs"][feature_name] = { + "input_tensor_name": feature_name, + "modality": "numeric", + } + else: + # specified by input argument + for feature_name in transform_feature_spec: + if feature_name != TARGET_FEATURE_NAME: + explanation_config["inputs"][feature_name] = { + "input_tensor_name": feature_name, + "modality": "numeric", + } + + explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}} + + return explanation_config diff --git a/examples/vertex_mlops_enterprise/src/feature_store/feature_store.py b/examples/vertex_mlops_enterprise/src/feature_store/feature_store.py new file mode 100644 index 0000000000..00992544d8 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/feature_store/feature_store.py @@ -0,0 +1,158 @@ +from google.cloud.aiplatform_v1beta1 import FeaturestoreOnlineServingServiceClient, FeaturestoreServiceClient, FeatureSelector +from google.cloud.aiplatform_v1beta1.types import featurestore_online_service as featurestore_online_service_pb2 +from google.cloud.aiplatform_v1beta1.types import entity_type as entity_type_pb2 +from google.cloud.aiplatform_v1beta1.types import feature as feature_pb2 +from google.cloud.aiplatform_v1beta1.types import featurestore_service as featurestore_service_pb2 +from google.cloud.aiplatform_v1beta1.types import io as io_pb2 +from google.cloud.aiplatform_v1beta1.types import ListFeaturestoresRequest, CreateFeaturestoreRequest, Featurestore, ListEntityTypesRequest + +from google.protobuf.timestamp_pb2 import Timestamp +from google.cloud.aiplatform_v1beta1.types import featurestore_monitoring as featurestore_monitoring_pb2 +from google.protobuf.duration_pb2 import Duration + + + +def create_fs(project, region, store_id, store_name=None, nodes=1): + + API_ENDPOINT = f"{region}-aiplatform.googleapis.com" + admin_client = FeaturestoreServiceClient(client_options={"api_endpoint": API_ENDPOINT}) + + base_path = admin_client.common_location_path(project, region) + + for f in admin_client.list_featurestores(ListFeaturestoresRequest(parent=admin_client.common_location_path(project, region))): + existing_id = f.name.split('/')[-1] + if store_id == existing_id: + print(f'Feature Store "{store_id}" already exists in {region}') + return + + if nodes == 0: + print('Creating Feature Store WITHOUT any online serving nodes. 
This Feature Store will not be able to serve on-line requests.') + elif nodes == 1: + print('Creating Feature Store with 1 online serving node.') + else: + print(f'Creating Feature Store with {nodes} online serving nodes.') + + if store_name is None: + store_name = f'{base_path}/{store_id}' + + req = CreateFeaturestoreRequest( + parent = base_path, + featurestore = Featurestore( + name=store_name, + online_serving_config=Featurestore.OnlineServingConfig(fixed_node_count=nodes)), + featurestore_id = store_id) + + lro = admin_client.create_featurestore(req) + name = lro.result() + print(f'Created Feature Store {name} in {region}') + return name + + +def create_entity(project, region, store_id, entity, entity_descr, features, features_descr=None): + + API_ENDPOINT = f"{region}-aiplatform.googleapis.com" + admin_client = FeaturestoreServiceClient(client_options={"api_endpoint": API_ENDPOINT}) + + if features_descr is None: + features_descr = features + + if len(features) != len(features_descr): + print(f'ERROR: Got {len(features)} features and {len(features_descr)} descriptions') + return + + # check if this entity already exists + request = ListEntityTypesRequest(parent=admin_client.featurestore_path(project, region, store_id)) + page_result = admin_client.list_entity_types(request=request) + existing_entities = [x.name.split('/')[-1] for x in page_result] + + if entity in existing_entities: + print(f'Entity {entity} already exists in Feature Store {store_id} ({region})') + return + + + print(f'Creating entity {entity} in Feature Store {store_id} ({region})') + + snapshot_analysis = featurestore_monitoring_pb2.FeaturestoreMonitoringConfig.SnapshotAnalysis( + monitoring_interval=Duration(seconds=3600)) # 1 hour + + lro = admin_client.create_entity_type( + featurestore_service_pb2.CreateEntityTypeRequest( + parent=admin_client.featurestore_path(project, region, store_id), + entity_type_id=entity, + entity_type=entity_type_pb2.EntityType( + description=entity_descr, + monitoring_config=featurestore_monitoring_pb2.FeaturestoreMonitoringConfig( + snapshot_analysis=snapshot_analysis)) + ) + ).result() + + print(lro) + + def _create_f_request(name, descr): + return featurestore_service_pb2.CreateFeatureRequest( + feature=feature_pb2.Feature( + value_type=feature_pb2.Feature.ValueType.DOUBLE, + description=descr, + monitoring_config=featurestore_monitoring_pb2.FeaturestoreMonitoringConfig( + snapshot_analysis=snapshot_analysis)), + feature_id=name) + + requests = [_create_f_request(x[0], x[1]) for x in zip(features, features_descr)] + + print(f'\nCreating features: {",".join(features)}') + + lro = admin_client.batch_create_features( + parent=admin_client.entity_type_path(project, region, store_id, entity), + requests=requests).result() + + return lro + + +def ingest_entities_csv(project, region, store_id, entity, features, gcs_uris): + + API_ENDPOINT = f"{region}-aiplatform.googleapis.com" + admin_client = FeaturestoreServiceClient(client_options={"api_endpoint": API_ENDPOINT}) + + timestamp = Timestamp() + timestamp.GetCurrentTime() + timestamp.nanos = 0 + + specs = [featurestore_service_pb2.ImportFeatureValuesRequest.FeatureSpec(id=f) for f in features] + + import_request_transaction = featurestore_service_pb2.ImportFeatureValuesRequest( + entity_type=admin_client.entity_type_path(project, region, store_id, entity), + csv_source=io_pb2.CsvSource(gcs_source=io_pb2.GcsSource(uris=gcs_uris)), + feature_specs=specs, + entity_id_field=entity, + feature_time=timestamp, # unique timestamp for all + 
worker_count=5) + + print(f'Ingesting features for "{entity}" entity...') + ingestion_lro = admin_client.import_feature_values(import_request_transaction).result() + print('done') + + return ingestion_lro + + +# entity is the name of the entity type you want to read, for example: user +# entity_value is the specific instance of the entity that you want to have the feature of, for example a user ID +def read_features(project, region, store_id, entity, features, entity_value): + + API_ENDPOINT = f"{region}-aiplatform.googleapis.com" + admin_client = FeaturestoreServiceClient(client_options={"api_endpoint": API_ENDPOINT}) + data_client = FeaturestoreOnlineServingServiceClient(client_options={"api_endpoint": API_ENDPOINT}) + + feature_selector = FeatureSelector() + feature_selector.id_matcher.ids = features + + read_request = featurestore_online_service_pb2.ReadFeatureValuesRequest( + entity_type = admin_client.entity_type_path(project, region, store_id, entity), + entity_id = entity_value, + feature_selector=feature_selector) + + res = data_client.read_feature_values(read_request) + values = [d.value for d in res.entity_view.data] + + # return a dict with { 'feature1': val1, 'feature2': val2, ... } + # exclude features that do not have a generate_time: these do not exist in the store + return {f:v.double_value for (f,v) in zip(features, values) if v.metadata.generate_time} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/.gitignore b/examples/vertex_mlops_enterprise/src/kfp_pipelines/.gitignore new file mode 100644 index 0000000000..9cc40847ca --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/.gitignore @@ -0,0 +1,3 @@ +config.bash +xgb-creditcards.json +venv diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile b/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile new file mode 100644 index 0000000000..bf29a2dca1 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.8 + + +COPY requirements.txt . +RUN pip install --upgrade pip +RUN pip install -r requirements.txt + +COPY src . +ENV PYTHONPATH=/ diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile.modelcard b/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile.modelcard new file mode 100644 index 0000000000..f2a09ad763 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/Dockerfile.modelcard @@ -0,0 +1,9 @@ +FROM gcr.io/deeplearning-platform-release/tf-cpu.2-11.py37:latest + +RUN pip install \ +model-card-toolkit==2.0.0 \ +kfp==2.0.1 \ +pandas==1.3.5 + +COPY src . 
+ENV PYTHONPATH=/
diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/README.md b/examples/vertex_mlops_enterprise/src/kfp_pipelines/README.md
new file mode 100644
index 0000000000..7587a96c6e
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/README.md
@@ -0,0 +1,72 @@
+# Reference KFP Pipeline
+
+We include here a reference KFP pipeline implementation that follows best practices such as:
+
+* Traceability of data by storing training, test and validation datasets as an intermediate artifact
+* Splitting the input data into training, test and validation and giving the training step only access to the training and test data
+* Producing a model card for flexibility of governance
+* Evaluating the model performance metrics and uploading this evaluation to the Vertex Model Registry
+
+## Preparing to run the reference pipeline
+
+The custom components used in this pipeline use a custom package, mainly to be able to access the configuration in `config.py`; this package is baked into a custom container image. Before building this image, we need to set the following environment variables:
+
+```
+export PROJECT_ID=
+export REGION=
+export ARTIFACT_REG_REPO=kfp_reference_pipeline # or choose a different name
+```
+
+First create a Docker repository in Artifact Registry to store the image:
+
+```
+gcloud artifacts repositories create $ARTIFACT_REG_REPO --project=$PROJECT_ID --location=$REGION --repository-format=docker
+```
+
+This custom image is built with the following command, executed from this directory (where the `Dockerfile` is):
+
+```
+gcloud builds submit --tag ${REGION}-docker.pkg.dev/${PROJECT_ID}/${ARTIFACT_REG_REPO}/base:latest
+```
+
+## Launch the pipeline
+
+From the `src` directory:
+
+```
+python pipeline.py
+```
+
+## Functional description of the pipeline
+
+This pipeline trains an XGBoost model and uploads it to the Vertex AI Model Registry. It then proceeds to deploy this model
+to a Vertex Endpoint for online predictions. It also uses the validation dataset to evaluate the model, and it uploads this
+model evaluation to the Model Registry, where it is visualised (see the [Google Cloud Blog](https://cloud.google.com/blog/topics/developers-practitioners/improving-model-quality-scale-vertex-ai-model-evaluation)). We have indicated these different parts of the pipeline process
+in the diagram below.
+
+![A diagram of the training and evaluation pipeline.](train-eval-pipeline.png)
+
+The pipeline starts by retrieving the input data from BigQuery and splitting it into a training dataset, a test dataset, and
+an evaluation dataset. The training and the test data are used by the model training process to train the model. The validation
+data is kept apart (it is not used until the model is completely trained). These datasets are stored as CSV files on Google Cloud Storage,
+so that we have a record of exactly the data that the model was trained and evaluated with.
+
+The **training part of the pipeline** has just the model training and model upload steps, which finish with a new version of a model
+in the Model Registry.
+
+The **deployment section** creates a new Vertex Endpoint and deploys the model to it. In a more realistic scenario, you probably want
+to make the deployment conditional on the model performance.
+
+The **model evaluation** is a bit more complex. We start by uploading the evaluation data to BigQuery, to facilitate the evaluation
+process. We run it through a batch prediction job to generate the model predictions and pass the output to a "model evaluation"
+step that joins the model predictions and the examples from the validation dataset (which contains the ground truth). This
+gives us a dataset with examples that have both the model output and the ground truth, and we can use this to evaluate the model
+performance. We do expect the model output in a certain format though, so to produce the right formatting we have introduced a step
+that formats the model output field according to expectation.
+
+Aside from these processes, we have a few pipeline steps that are not part of any of the groups above; they generate a
+[model card](https://medium.com/google-cloud/build-responsible-models-with-model-cards-and-vertex-ai-pipelines-8cbf451e7632) for our model.
+A model card, in this case, is an HTML document with any information that we may be required to provide about our model. Think about
+what the model can be used for in case someone else might want to use it, the risks associated with using the model, the performance, and so
+on. Generating a model card as part of the pipeline provides the flexibility to meet any documentation requirements that the users or the
+organization may have.
diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/model_card_config.json b/examples/vertex_mlops_enterprise/src/kfp_pipelines/model_card_config.json
new file mode 100644
index 0000000000..b77d5a8eac
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/model_card_config.json
@@ -0,0 +1,24 @@
+{
+  "model_name": "ULB - Credit Card Fraud Public Dataset",
+  "model_overview":
+  "The dataset contains a set of transactions, all of which have been labeled as fraudulent or legit.
+  The fraudulent transactions are by far the smaller class (0.172% of the total), so this is a highly
+  imbalanced dataset. It contains labeled transactions from 2013 from European cardholders and it
+  was published by the Université Libre de Bruxelles (Brussels Free University, ULB).
+
+

The features comprised in this dataset are the result of applying PCA to the original dataset + and therefore cannot be interpreted. There are 31 features, labeled V1 to V31 in this dataset, + plus Amount and Time.", + "model_owners": [{"name": "Paul Balm", "contact": "abcdef@google.com"}], + "model_references": ["https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud"], + "model_ethical_consideration_risks": [ + { + "name": "The dataset contains a small number of fraud cases that makes it difficult to verify that models trained on this data will generalize to unseen data.", + "mitigation_strategy": "Verify that the model generalizes to the data for the use case at hand, before using it for any decision making."}, + { + "name": "Classifying a transaction as fraudulent can be highly impactful for the person responsible for the transaction.", + "mitigation_strategy": "Any classification as fraudulent and implication thereof needs to be treated with caution and requires a very high level of certainty."}], + "model_ethical_limitations": [{"description": "Using this model requires knowledge of the PCA transformation that was applied to the training data."}], + "model_considerations_use_cases": [{"description": "Fraud detection and prevention (ML technology demo only)"}], + "model_considerations_users": [{"description": "Data Scientists"}] +} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/requirements.txt b/examples/vertex_mlops_enterprise/src/kfp_pipelines/requirements.txt new file mode 100644 index 0000000000..d054bdac22 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/requirements.txt @@ -0,0 +1,14 @@ +cloudml-hypertune==0.1.0.dev6 +db-dtypes==1.1.1 +google-cloud-aiplatform==1.27.1 +google-cloud-storage==2.10.0 +google_cloud_pipeline_components==2.0.0b5 +kfp==2.0.1 +pandas==1.3.5 +pandas_gbq +protobuf==3.20.3 +pyarrow==6.0.1 +scikit-learn==0.24.1 +scipy==1.10.1 +seaborn==0.12.2 +xgboost==1.6.2 diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/config.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/config.py new file mode 100644 index 0000000000..308dc496a2 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/config.py @@ -0,0 +1,34 @@ +import os + +PROJECT_ID = os.getenv("PROJECT_ID", "") +REGION = os.getenv("REGION", "") +IMAGE=os.getenv("CICD_IMAGE_URI", f'{REGION}-docker.pkg.dev/{PROJECT_ID}/creditcards-kfp/base:latest') +TRAIN_COMPONENT_IMAGE=f'{REGION}-docker.pkg.dev/{PROJECT_ID}/creditcards-kfp/train-fraud:latest' +IMAGE_MODEL_CARD=os.getenv("CICD_IMAGE_MODEL_CARD", f'{REGION}-docker.pkg.dev/{PROJECT_ID}/creditcards-kfp/model-card:latest') + +CLASS_NAMES = ['OK', 'Fraud'] +TARGET_COLUMN = 'Class' + +PIPELINE_NAME = os.getenv("PIPELINE_NAME", 'xgb-creditcards') +PIPELINE_ROOT = os.getenv("PIPELINES_STORE", f'gs://{PROJECT_ID}/pipeline_root/{PIPELINE_NAME}') +SERVICE_ACCOUNT = os.getenv("SERVICE_ACCOUNT") # returns None is not defined +NETWORK = os.getenv("NETWORK") # returns None is not defined +KEY_ID = os.getenv("CMEK_KEY_ID") # e.g. 
projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key + +BQ_INPUT_DATA=f"{PROJECT_ID}.{os.getenv('BQ_DATASET_NAME')}.{os.getenv('ML_TABLE')}" +PARENT_MODEL='' # f'projects/{PROJECT_ID}/locations/{REGION}/models/YOUR_NUMERIC_MODEL_ID_HERE' + +BQ_OUTPUT_DATASET_ID="creditcards_batch_out" + +MODEL_DISPLAY_NAME = os.getenv("MODEL_DISPLAY_NAME", 'creditcards-kfp') +MODEL_CARD_CONFIG='../model_card_config.json' + +PRED_CONTAINER='europe-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-6:latest' +ENDPOINT_NAME=PIPELINE_NAME + +EMAILS=['abcdef@google.com'] + +# Evaluation pipeline +DATAFLOW_SA = os.getenv("DATAFLOW_SA") +DATAFLOW_NETWORK = os.getenv("DATAFLOW_NETWORK") +DATAFLOW_PUBLIC_IPS = False diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/eval.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/eval.py new file mode 100644 index 0000000000..cdfcbef756 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/eval.py @@ -0,0 +1,111 @@ +from kfp.v2 import dsl +from kfp.v2.dsl import Artifact, Dataset, Input, Model, Output + +from config import IMAGE + + +@dsl.component( + packages_to_install=['shap'], + base_image=IMAGE) +def evaluate_model( + test_data: Input[Dataset], + trained_model: Input[Model], + reports: Output[Artifact], + class_names: list, + target_column: str +): + + import pickle as pkl + import pandas as pd + import xgboost as xgb + import numpy as np + import sklearn + from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay + from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc + from model_card_toolkit.utils.graphics import figure_to_base64str + import logging + + # This is a work-around for a bug in shap + np.int = int + + + def ShapDisplay(model, x, class_names): + """ + This function returns a SHAP summary plot. + Args: + model (sklearn.pipeline.Pipeline): The model pipeline. + x (numpy.ndarray): The data to explain. + column_names (list): The names of the columns. + class_names (list): The names of the classes. + Returns: + A Matplotlib figure. 
+ """ + import matplotlib.pyplot as plt + import shap + + fig = plt.figure() + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(x) + shap.summary_plot(shap_values, x, feature_names=list(x.columns), + class_names=class_names, plot_type='bar', show=False) + fig.suptitle("Mean absolute SHAP Values") + fig.axes[0].set_xlabel('average impact on model output magnitude') + return fig + + # Read data + logging.info("Reading data...") + test = pd.read_csv(test_data.path) + + # Read model + logging.info("Reading model...") + model = xgb.Booster() + model.load_model(trained_model.path) + + # Evaluate model + logging.info("Evaluating model...") + y = test.pop(target_column) + X = test + y_pred_prob = model.predict(xgb.DMatrix(X)) + y_pred = list(map(lambda x: x >= 0.5, y_pred_prob)) + accuracy = accuracy_score(y, y_pred) + pos_label=1 + precision = precision_score(y, y_pred, pos_label=pos_label) + recall = recall_score(y, y_pred, pos_label=pos_label) + f1 = f1_score(y, y_pred, pos_label=pos_label) + logging.info(f"Accuracy: {accuracy:.3g}") + logging.info(f"Precision: {precision:.3g}") + logging.info(f"Recall: {recall:.3g}") + logging.info(f"F1: {f1:.3g}") + + # Save model reports + logging.info("Saving model reports...") + logging.info(f"sklearn {sklearn.__version__}") + + c_m = confusion_matrix(y, y_pred) + cm = ConfusionMatrixDisplay(confusion_matrix=c_m) + cm.plot() + cm.figure_.suptitle('Confusion Matrix') + + precision, recall, _ = precision_recall_curve(y, y_pred) + pr = PrecisionRecallDisplay(precision=precision, recall=recall) + pr.plot() + pr.figure_.suptitle('Precision Recall Curve') + + fpr, tpr, _ = roc_curve(y, y_pred) + roc_auc = auc(fpr, tpr) + roc = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc) + roc.plot() + roc.figure_.suptitle('ROC') + + # Calculate SHAP summary plot + shap_fig = ShapDisplay(model, X, class_names) + shap_fig.suptitle + reports_dict = { + "confusion_matrix": figure_to_base64str(cm.figure_), + "precision_recall": figure_to_base64str(pr.figure_), + "roc_curve": figure_to_base64str(roc.figure_), + "shap_plot": figure_to_base64str(shap_fig) + } + with open(reports.path, "wb") as f: + pkl.dump(reports_dict, f) diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/load.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/load.py new file mode 100644 index 0000000000..5d5b323f74 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/load.py @@ -0,0 +1,115 @@ +from kfp import dsl +from google_cloud_pipeline_components.types.artifact_types import BQTable +from typing import NamedTuple + +from config import IMAGE + +# Load data from BigQuery and save to CSV +@dsl.component(base_image=IMAGE) +def get_dataframe( + project_id: str, + bq_table: str, + train_data: dsl.OutputPath("Dataset"), + test_data: dsl.OutputPath("Dataset"), + val_data: dsl.OutputPath("Dataset"), + stats: dsl.Output[dsl.Artifact], + class_names: list +): + from google.cloud import bigquery + from model_card_toolkit.utils.graphics import figure_to_base64str + from sklearn.model_selection import train_test_split + import pickle + import seaborn as sns + import logging + + bqclient = bigquery.Client(project=project_id) + logging.info(f"Pulling data from {bq_table}") + table = bigquery.TableReference.from_string(bq_table) + rows = bqclient.list_rows(table) + dataframe = rows.to_dataframe(create_bqstorage_client=True) + # Drop the Time column, otherwise the model will just memorize when the fraud cases happened + # Also drop the ml_use column 
- we will split here. ML_use just splits in test+rest, and we need a 3-way split. + dataframe.drop(columns=['Time', 'ML_use'], inplace=True) + logging.info("Data loaded, writing splits") + + # 60 / 20 / 20 + df_train, df_test = train_test_split(dataframe, test_size=0.4) + df_test, df_val = train_test_split(df_test, test_size=0.5) + + df_train.to_csv(train_data, index=False) + df_test.to_csv(test_data, index=False) + df_val.to_csv(val_data, index=False) + + def get_fig(df, title): + n_fraud = (df.Class == '1').sum() + n_ok = len(df) - n_fraud + + logging.info(f"Stats for {title}: {n_ok=} {n_fraud=}") + + ys = [n_ok, n_fraud] + + g = sns.barplot(x=class_names, y=ys) + g.set_yscale('log') + g.set_ylim(1, n_ok*2) + fig = g.get_figure() + fig.suptitle(title) + return fig + + logging.info("Generating stats") + stats_dict = {} + fig = get_fig(df_train, "Training data") + stats_dict['train'] = figure_to_base64str(fig) + fig.clf() + + fig = get_fig(df_test, "Test data") + stats_dict['test'] = figure_to_base64str(fig) + fig.clf() + + fig = get_fig(df_val, "Validation data") + stats_dict['val'] = figure_to_base64str(fig) + fig.clf() + + logging.info(f"Writing stats to {stats.path}") + with open(stats.path, 'wb') as f: + pickle.dump(stats_dict, f) + + +@dsl.component(base_image=IMAGE) +def upload_to_bq( + project: str, + location: str, + dest_dataset_id: str, + dest_table_id: str, + csv_data: dsl.Input[dsl.Dataset], + bq_table: dsl.Output[BQTable]) -> NamedTuple('outputs', [('bq_table_uri', str)]): + + from collections import namedtuple + import logging + import pandas as pd + import numpy as np + + from config import CLASS_NAMES + + bq_table.metadata["projectId"] = project + bq_table.metadata["datasetId"] = dest_dataset_id + bq_table.metadata["tableId"] = dest_table_id + logging.info(f"BQ table: {bq_table}\nmetadata: {bq_table.metadata}") + + logging.info(f"Reading {csv_data.path}") + dest_table = f'{dest_dataset_id}.{dest_table_id}' + logging.info(f"Writing to {dest_table}") + + df = pd.read_csv(csv_data.path) + + # Convert Class column to int and map to CLASS_NAMES label + df_class = df.pop('Class') + df['Class'] = list(map(lambda f: CLASS_NAMES[f], np.rint(df_class).astype(np.int64))) + + df.to_gbq( + destination_table=f"{dest_table}", + project_id=project, + location=location) + + t = namedtuple('outputs', ['bq_table_uri']) + return t(f'bq://{project}.{dest_dataset_id}.{dest_table_id}') + \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_card.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_card.py new file mode 100644 index 0000000000..0a5f66ecb0 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_card.py @@ -0,0 +1,138 @@ +from kfp import dsl +from kfp.dsl import Input, Output, Dataset, Artifact, HTML + +from config import IMAGE_MODEL_CARD + + +@dsl.component(base_image=IMAGE_MODEL_CARD) +def plot_model_card( + project_id: str, + region: str, + model: Input[Artifact], + train_data: Input[Dataset], + test_data: Input[Dataset], + val_data: Input[Dataset], + stats: Input[Artifact], + reports: Input[Artifact], + model_card_config: str, + model_card: Output[HTML] +): + + # Libraries + import json + import pickle as pkl + import model_card_toolkit as mctlib + import pandas as pd + import logging + from google.cloud import aiplatform + + # Load model card config + model_card_cd = json.loads(model_card_config) + + # Read data + print(f"Reading data... 
{train_data.path}") + train = pd.read_csv(train_data.path) + test = pd.read_csv(test_data.path) + val = pd.read_csv(val_data.path) + + # Read stats + logging.info(f"Reading stats... {stats.path}") + with open(stats.path, "rb") as stats_file: + stats = pkl.load(stats_file) + + # Read reports + logging.info(f"Reading reports... {reports.path}") + with open(reports.path, "rb") as f: + reports = pkl.load(f) + + # Compile model card + logging.info(f"Compiling model card... {model_card.path}") + mct = mctlib.ModelCardToolkit(model_card.path) + mc = mct.scaffold_assets() + + ## Model Details section + uri = model.metadata['resourceName'] + if uri.find('@') >= 0: # has version or alias + model_resource_name = uri[:uri.find('@')] # strip version or alias + model_version = uri[uri.find('@')+1:] + else: + model_resource_name = uri + model_version = None + + vertex_models = [m for m in aiplatform.Model.list(project=project_id, location=region) if m.resource_name==model_resource_name] + + if model_version: + models = [m for m in vertex_models if m.version_id == model_version] + if len(models)>1: + logging.warning(f"Found {len(models)} models with for {uri}") + vertex_model = models[0] + else: + if len(vertex_models)>1: + logging.warning(f"Found {len(vertex_models)} models with for {uri}") + vertex_model = vertex_models[0] + + mc.model_details.name = model_card_cd['model_name'] + mc.model_details.overview = model_card_cd['model_overview'] + mc.model_details.owners = [ + mctlib.Owner(name=owner_d['name'], contact=owner_d['contact']) + for owner_d in model_card_cd['model_owners']] + mc.model_details.references = [ + mctlib.Reference(reference=reference) + for reference in model_card_cd['model_references']] + + mc.model_details.version.name = vertex_model.resource_name + mc.model_details.version.date = vertex_model.create_time.strftime("%H:%M:%S (%Z), %-d %b %Y") + + ## Considerations section + mc.considerations.ethical_considerations = [ + mctlib.Risk( + name=risk['name'], + mitigation_strategy=risk['mitigation_strategy']) + for risk in model_card_cd['model_ethical_consideration_risks']] + mc.considerations.limitations = [ + mctlib.Limitation(description=limitation['description']) + for limitation in model_card_cd['model_ethical_limitations'] + ] + mc.considerations.use_cases = [ + mctlib.UseCase(description=use_case['description']) + for use_case in model_card_cd['model_considerations_use_cases']] + mc.considerations.users = [ + mctlib.User(description=user['description']) + for user in model_card_cd['model_considerations_users']] + + ## Datasets section + mc.model_parameters.data.append(mctlib.Dataset( + name="Training dataset", + description=f'{train.shape[0]:,} rows with {train.shape[1]:,} columns (features + target)')) + mc.model_parameters.data[0].graphics.collection = [ + mctlib.Graphic(image=stats['train']) + ] + mc.model_parameters.data.append(mctlib.Dataset( + name="Test dataset", + description=f'{test.shape[0]:,} rows with {test.shape[1]:,} columns (features + target)')) + mc.model_parameters.data[1].graphics.collection = [ + mctlib.Graphic(image=stats['test']) + ] + mc.model_parameters.data.append(mctlib.Dataset( + name="Validation dataset", + description=f'{val.shape[0]:,} rows with {val.shape[1]:,} columns (features + target)')) + mc.model_parameters.data[2].graphics.collection = [ + mctlib.Graphic(image=stats['val']) + ] + + ## Quantative Analysis section + mc.quantitative_analysis.graphics.description = ( + 'This analysis is performed using the validation dataset, which was not used in 
training.') + mc.quantitative_analysis.graphics.collection = [ + mctlib.Graphic(image=reports['roc_curve']), + mctlib.Graphic(image=reports['precision_recall']), + mctlib.Graphic(image=reports['confusion_matrix']), + mctlib.Graphic(image=reports['shap_plot']) + ] + + # Write model card + model_card_file = model_card.path + "/model_card.html" + logging.info(f"Writing model card... {model_card_file}") + mct.update_model_card(mc) + mct.export_format(model_card=mc, output_file=model_card_file) + model_card.path = model_card_file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_monitoring.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_monitoring.py new file mode 100644 index 0000000000..38156c3691 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_monitoring.py @@ -0,0 +1,62 @@ +from kfp import dsl +from google_cloud_pipeline_components.types.artifact_types import VertexEndpoint + +from config import IMAGE + +@dsl.component(base_image=IMAGE) +def model_monitoring( + project_id: str, + region: str, + endpoint: dsl.Input[VertexEndpoint], + pipeline_id: str, + bq_train_data: str, # bq://mco-mm.bqmlga4.train + skew_threshold: float, + sampling_rate: float, + monitoring_interval_hours: int, + user_emails: list +): + + from google.cloud import aiplatform + from google.cloud.aiplatform import model_monitoring + from train import TARGET_COLUMN + import logging + + bq_train_data = "bq://" + bq_train_data + + skew_config = model_monitoring.SkewDetectionConfig( + data_source= bq_train_data, + skew_thresholds=skew_threshold, # pass float to set one value across all featues, or dict to vary threshold by feature. + target_field=TARGET_COLUMN, + ) + + objective_config = model_monitoring.ObjectiveConfig(skew_detection_config=skew_config) + + + # Create sampling configuration + random_sampling = model_monitoring.RandomSampleConfig(sample_rate=sampling_rate) + + # Create schedule configuration + schedule_config = model_monitoring.ScheduleConfig(monitor_interval=monitoring_interval_hours) + + # Create alerting configuration. + alerting_config = model_monitoring.EmailAlertConfig( + user_emails=user_emails, enable_logging=True + ) + + endpoint_uri = endpoint.metadata['resourceName'] + logging.info("Endpoint URI is " + endpoint_uri) + logging.info("Using google_cloud_aiplatform " + aiplatform.__version__) + logging.info("Training data: " + bq_train_data) + + # Create the monitoring job. 
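    # Hedged note: skew_thresholds above is passed as a single float, which (per the
    # inline comment) applies one threshold to every feature. A per-feature dict could
    # be used instead; the feature names and values below are purely illustrative:
    #   skew_thresholds={"Amount": 0.3, "V1": 0.5, "V28": 0.8}
    # The create() call that follows attaches the monitoring job to the endpoint
    # deployed earlier in the pipeline.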
+ job = aiplatform.ModelDeploymentMonitoringJob.create( + display_name=pipeline_id + "-monitoring", + logging_sampling_strategy=random_sampling, + schedule_config=schedule_config, + alert_config=alerting_config, + objective_configs=objective_config, + project=project_id, + location=region, + endpoint=endpoint_uri + ) + logging.info("Job: " + job) \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_upload.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_upload.py new file mode 100644 index 0000000000..b2f0bff801 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/model_upload.py @@ -0,0 +1,39 @@ +from kfp import dsl + +from config import IMAGE + +@dsl.component(base_image=IMAGE) +def upload_model( + project_id: str, + region: str, + model: dsl.Input[dsl.Model], + display_name: str, + serving_image: str, + parent_model: str, + uploaded_model: dsl.Output[dsl.Artifact], + run: str, + run_id: str +): + from google.cloud import aiplatform + import logging + + logging.info(f"Upload model for run {run} and run ID {run_id}") + + model_path = '/'.join(model.uri.split('/')[:-1]) # remove filename after last / - send dir rather than file + + vertex_model = aiplatform.Model.upload( + project=project_id, + location=region, + display_name=display_name, + artifact_uri=model_path, + serving_container_image_uri=serving_image, + parent_model=parent_model, + labels={ + 'run': run, + 'run_id': run_id + } + ) + + uploaded_model.metadata['resourceName'] = f'{vertex_model.resource_name}@{vertex_model.version_id}' + uploaded_model.uri = f'https://{region}-aiplatform.googleapis.com/v1/{vertex_model.resource_name}@{vertex_model.version_id}' + diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/pipeline.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/pipeline.py new file mode 100644 index 0000000000..d5467f3208 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/pipeline.py @@ -0,0 +1,248 @@ +from kfp import dsl +from kfp import compiler +from kfp.dsl import Artifact, Input, Model, Output + +import argparse + +from google.cloud import aiplatform + +from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp +from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp +from google_cloud_pipeline_components.experimental.evaluation import ( + ModelEvaluationClassificationOp, ModelImportEvaluationOp) + +import logging +from datetime import datetime + +from config import (PIPELINE_ROOT, PIPELINE_NAME, BQ_INPUT_DATA, MODEL_CARD_CONFIG, + MODEL_DISPLAY_NAME, PRED_CONTAINER, ENDPOINT_NAME, PARENT_MODEL, + SERVICE_ACCOUNT, NETWORK, KEY_ID, + PROJECT_ID, REGION, IMAGE, CLASS_NAMES, TARGET_COLUMN, + DATAFLOW_NETWORK, DATAFLOW_PUBLIC_IPS, DATAFLOW_SA, + BQ_OUTPUT_DATASET_ID) +from train import xgb_train +from eval import evaluate_model +from load import get_dataframe, upload_to_bq +from model_card import plot_model_card +from model_upload import upload_model +from reformat_preds import reformat_predictions_bq + +caching = True + +TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M") + +parser = argparse.ArgumentParser() + +parser.add_argument( + '--compile-only', + action='store_true' # default: False +) + +args = parser.parse_args() + + +# Import model and convert to Artifact +@dsl.component(base_image=IMAGE) +def get_unmanaged_model(model: Input[Model], unmanaged_model: Output[Artifact]): + unmanaged_model.metadata = model.metadata + 
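    # Hedged note: the metadata copied above is expected to include the serving
    # containerSpec that train.py attaches to its model output, so the unmanaged
    # artifact still records which prediction image it should be served with.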
unmanaged_model.uri = '/'.join(model.uri.split('/')[:-1]) # remove filename after last / - send dir rather than file + + +######################### +### Define pipeline +######################### +@dsl.pipeline( + pipeline_root=PIPELINE_ROOT, + name=PIPELINE_NAME +) +def pipeline( + bq_table: str = "", + xgboost_param_max_depth: int=10, + xgboost_param_learning_rate: float=0.1, + xgboost_param_n_estimators: int=200, + serving_container_image_uri: str = PRED_CONTAINER, +): + + load_data_op = get_dataframe( + bq_table=bq_table, + project_id=PROJECT_ID, + class_names=CLASS_NAMES).set_display_name("Load And Split Data") + + + train_op = xgb_train( + train_data = load_data_op.outputs['train_data'], + test_data = load_data_op.outputs['test_data'], + xgboost_param_max_depth = xgboost_param_max_depth, + xgboost_param_learning_rate = xgboost_param_learning_rate, + xgboost_param_n_estimators = xgboost_param_n_estimators, + serving_container_image_uri=serving_container_image_uri + ).set_display_name("Train Model") + + + evaluate_model_op = evaluate_model( + test_data=load_data_op.outputs['val_data'], + trained_model=train_op.outputs['model'], + class_names=CLASS_NAMES, + target_column=TARGET_COLUMN + ).set_display_name("Model Card Graphics") + + + upload_op = upload_model( + project_id = PROJECT_ID, + region = REGION, + model = train_op.outputs['model'], + display_name = MODEL_DISPLAY_NAME, + serving_image = PRED_CONTAINER, + parent_model = PARENT_MODEL, + run = dsl.PIPELINE_JOB_NAME_PLACEHOLDER, + run_id = dsl.PIPELINE_JOB_ID_PLACEHOLDER + ).set_display_name("Upload Model") + + + with open(MODEL_CARD_CONFIG, 'r') as f: + model_card_config = ' '.join([x.strip() for x in f.readlines()]) + + _ = plot_model_card( + project_id = PROJECT_ID, + region = REGION, + model = upload_op.outputs['uploaded_model'], + train_data = load_data_op.outputs['train_data'], + test_data = load_data_op.outputs['test_data'], + val_data = load_data_op.outputs['val_data'], + stats = load_data_op.outputs['stats'], + reports = evaluate_model_op.outputs['reports'], + model_card_config = model_card_config + ).set_display_name("Generate Model Card") + + # + # Online Endpoint + # + + create_endpoint_op = EndpointCreateOp( + project = PROJECT_ID, + location = REGION, + display_name = ENDPOINT_NAME + ).set_display_name("Create Vertex AI Endpoint") + + _ = ModelDeployOp( + model=upload_op.outputs['uploaded_model'], + endpoint=create_endpoint_op.outputs['endpoint'], + dedicated_resources_machine_type = 'n1-standard-8', + dedicated_resources_min_replica_count = 1, + dedicated_resources_max_replica_count = 1, + enable_access_logging = True + ).set_display_name("Deploy Model To Endpoint") + + # Start Model Monitoring job. + # Fails intermittently. 
Enable after bugfix: https://github.com/googleapis/python-aiplatform/issues/2361 + # _ = model_monitoring( + # project_id=PROJECT_ID, + # region=REGION, + # endpoint=create_endpoint_op.outputs['endpoint'], + # pipeline_id=dsl.PIPELINE_JOB_NAME_PLACEHOLDER, + # bq_train_data=bq_table, + # skew_threshold=0.5, + # sampling_rate=1.0, + # monitoring_interval_hours=1, + # user_emails=EMAILS + # ).set_display_name("Enable Model Montoring") + + # + # Evaluation Pipeline + # + + upload_to_bq_op = upload_to_bq( + project=PROJECT_ID, + location=REGION, + csv_data=load_data_op.outputs['val_data'], + dest_dataset_id=BQ_OUTPUT_DATASET_ID, + dest_table_id=f'{PIPELINE_NAME}-val-{TIMESTAMP}' + ).set_display_name("Upload to BigQuery") + + # Run the batch prediction task + batch_predict_op = ModelBatchPredictOp( + project=PROJECT_ID, + location=REGION, + model=upload_op.outputs['uploaded_model'], + job_display_name=f"bp-{PIPELINE_NAME}-{TIMESTAMP}", + bigquery_source_input_uri=upload_to_bq_op.outputs['bq_table_uri'], + instances_format="bigquery", + predictions_format="bigquery", + bigquery_destination_output_uri=f"bq://{PROJECT_ID}.{BQ_OUTPUT_DATASET_ID}.{PIPELINE_NAME}-bp-{TIMESTAMP}", + excluded_fields=[TARGET_COLUMN], + machine_type="n1-standard-8", + starting_replica_count=2, + max_replica_count=8, + ).set_display_name("Batch Prediction") + + # Format the predictions column from "0.1" that xgboost produces to "[0.9, 0.1]" that sklearn produces + reformat_predictions_op = reformat_predictions_bq( + project=PROJECT_ID, + location=REGION, + input_predictions=batch_predict_op.outputs['bigquery_output_table'] + ).set_display_name("Reformat Predictions") + + # Run the evaluation based on prediction type + eval_task = ModelEvaluationClassificationOp( + project=PROJECT_ID, + location=REGION, + class_labels=CLASS_NAMES, + prediction_score_column= "prediction", + target_field_name=TARGET_COLUMN, + ground_truth_format="bigquery", + ground_truth_bigquery_source=upload_to_bq_op.outputs['bq_table_uri'], + predictions_format="bigquery", + predictions_bigquery_source=reformat_predictions_op.outputs['predictions'], + dataflow_service_account=DATAFLOW_SA, + dataflow_subnetwork=DATAFLOW_NETWORK, + dataflow_use_public_ips=DATAFLOW_PUBLIC_IPS, + force_runner_mode='Dataflow' + ).set_display_name("Model Evaluation") + + # Import the model evaluations to the Vertex AI model in Model Registry + ModelImportEvaluationOp( + classification_metrics=eval_task.outputs["evaluation_metrics"], + model=upload_op.outputs['uploaded_model'], + dataset_type="bigquery", + ).set_display_name("Import Model Evaluation") + +# Compile and run the pipeline +aiplatform.init(project=PROJECT_ID, location=REGION, encryption_spec_key_name=KEY_ID) + +logging.getLogger().setLevel(logging.INFO) +logging.info(f"Init with project {PROJECT_ID} in region {REGION}. 
Pipeline root: {PIPELINE_ROOT}") + +FORMAT = ".json" + +logging.info(f"Compiling pipeline to {PIPELINE_NAME + FORMAT}") +compiler.Compiler().compile( + pipeline_func=pipeline, + package_path=PIPELINE_NAME + FORMAT +) + +if not args.compile_only: + run = aiplatform.PipelineJob( + project=PROJECT_ID, + location=REGION, + display_name=PIPELINE_NAME, + template_path=PIPELINE_NAME + FORMAT, + job_id=f"{PIPELINE_NAME}-{TIMESTAMP}", + pipeline_root=PIPELINE_ROOT, + parameter_values={ + "bq_table": BQ_INPUT_DATA, + "xgboost_param_max_depth": 5, + "xgboost_param_learning_rate": 0.1, + "xgboost_param_n_estimators": 20}, + enable_caching=caching + ) + + run.submit(service_account=SERVICE_ACCOUNT, + network=NETWORK) + +# This can be used to test the online endpoint: +# +# { +# "instances": [ +# [1.18998913145894,-0.563413492993846,0.129352538697985,-0.302175771438239,-0.927677605983222,-0.784678753251055,-0.443713590138326,-0.0956435854887243,-0.648897198590765,0.0499810894390051,0.358011190903553,-0.445067055832097,-0.0982544178676521,-1.28002825726001,0.304411501372465,0.733464325722348,1.71246876228603,-1.78636925309304,0.163898890406551,0.180489467655959,0.0091417811964457,-0.074443134391428,-0.0011569207049818,0.327529344882462,0.332585093864499,-0.298508896918417,0.0256419259293034,0.0496775221663426,80.52] +# ] +# } \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/reformat_preds.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/reformat_preds.py new file mode 100644 index 0000000000..85e3037c22 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/reformat_preds.py @@ -0,0 +1,77 @@ +from kfp import dsl +from typing import NamedTuple +from google_cloud_pipeline_components.types.artifact_types import BQTable + +from config import IMAGE + + +# Format the predictions column from "0.1" that xgboost produces to "[0.9, 0.1]" that sklearn produces +@dsl.component(base_image=IMAGE) +def reformat_predictions_bq( + project: str, + location: str, + input_predictions: dsl.Input[BQTable], + predictions: dsl.Output[BQTable] +): + + from google.cloud.bigquery import Client + import logging + + bq = Client(project=project, location=location) + table_project = input_predictions.metadata['projectId'] + table_dataset = input_predictions.metadata['datasetId'] + table_table = input_predictions.metadata['tableId'] + table_ref = f"{table_project}.{table_dataset}.{table_table}" + + sql = f""" + CREATE OR REPLACE TABLE `{table_ref}_reformat` AS + SELECT * EXCEPT(prediction), + '[' || CAST(1.0-CAST(prediction AS FLOAT64) AS STRING) || ',' || prediction || ']' as prediction + FROM `{table_ref}`""" + + logging.info(f"Processing data in table {table_ref}") + logging.info(f"Query: {sql}") + job = bq.query(sql) + + job.result() # wait for completion + + predictions.metadata['projectId'] = table_project + predictions.metadata['datasetId'] = table_dataset + predictions.metadata['tableId'] = table_table + "_reformat" + + +@dsl.component(base_image=IMAGE) +def reformat_groundtruth_json( + gcs_sources: list, + gcs_groundtruth: dsl.OutputPath("Dataset")) -> NamedTuple('outputs', [('gcs_output_uris', list)]): + + from collections import namedtuple + import pandas as pd + import json + + df = None + for gcs_uri in gcs_sources: + fname = '/gcs' + '/'.join(gcs_uri.split('/')[1:]) # /gcs/bucket/a/b/c + + if df: + df = pd.concat(df, pd.read_csv(fname)) + else: + df = pd.read_csv(fname) + + json_data_raw = df.to_json(orient='records', lines=True) + 
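    # Sketch of the ground-truth format assembled below (values are illustrative only):
    # each CSV row becomes one JSONL line with the feature values under "instance" and
    # the label kept under its original column name, e.g.
    #   {"instance": [-1.36, -0.07, 2.54, ..., 149.62], "Class": 0}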
json_data_raw_lines = [line for line in json_data_raw.split('\n') if len(line) > 0] + + with open(gcs_groundtruth, 'w') as f: + for row_str in json_data_raw_lines: + row = json.loads(row_str) + target_col = 'Class' + instance_data = [row[k] for k in row.keys() if k != target_col] + row_format = {'instance': instance_data, target_col: row[target_col]} + instance = json.dumps(row_format) + f.write(instance + '\n') + + t = namedtuple('outputs', ['gcs_output_uris']) + # transform from /gcs/a/b/c to gs://a/b/c + gcs_path = 'gs://' + '/'.join(gcs_groundtruth.split('/')[2:]) + + return t([gcs_path]) diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/train.py b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/train.py new file mode 100644 index 0000000000..4f6e8e1971 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/kfp_pipelines/src/train.py @@ -0,0 +1,145 @@ +from kfp.v2.dsl import (Dataset, Input, Model, Output, + ClassificationMetrics, Metrics) +from kfp import dsl +from sklearn.metrics import accuracy_score, confusion_matrix, f1_score + +import pandas as pd +import xgboost as xgb +import numpy as np +from hypertune import HyperTune + +import logging +import argparse +import sys +import pickle + +from config import CLASS_NAMES, IMAGE, PRED_CONTAINER + +def load_data(dataset_path: str): + df = pd.read_csv(dataset_path) + labels = list(np.rint(df.pop("Class")).astype(np.int64)) + + data = df.values.tolist() + + # we need to convert it to numpy to avoid + # 'list' object has no attribute 'shape' errors in xgb.fit() + return (np.asarray(data, dtype=object), labels) + +def train( + train_dataset_path: str, + test_dataset_path: str, + xgboost_param_max_depth: int, + xgboost_param_learning_rate: float, + xgboost_param_n_estimators: int, + model_output_path, + serving_container_image_uri, + metrics: Output[Metrics] = None, + metricsc: Output[ClassificationMetrics] = None, + model: Output[Model] = None,): + + x_train, y_train = load_data(train_dataset_path) + x_test, y_test = load_data(test_dataset_path) + + logging.info(f"Train X {type(x_train)} {len(x_train)}") + logging.info(f"Train Y {type(y_train)} {len(y_train)}: {y_train[:5]}") + + classifier = xgb.XGBClassifier( + max_depth=xgboost_param_max_depth, + learning_rate=xgboost_param_learning_rate, + n_estimators=xgboost_param_n_estimators) + logging.info(f"Model {classifier}") + classifier.fit(x_train, y_train) + + # log metrics + print(f"Type y_test {type(y_test[0])}") + print(f"Type pred {type(classifier.predict(x_test)[0])}") + + score = accuracy_score(y_test, classifier.predict(x_test)) + f1 = f1_score(y_test, classifier.predict(x_test)) + logging.info("accuracy is: %s", score) + logging.info("F1 score is: %s", f1) + if metrics: + metrics.log_metric("accuracy",(score * 100.0)) + metrics.log_metric("f1-score", f1) + metrics.log_metric("framework", f"XGBoost {xgb.__version__}") + metrics.log_metric("train_dataset_size", len(x_train)) + metrics.log_metric("test_dataset_size", len(x_test)) + + if metricsc: + # log the confusion matrix + y_pred = classifier.predict(x_test) + logging.info(f"Predictions: {','.join(map(str, y_pred[:10]))}") + + metricsc.log_confusion_matrix( + CLASS_NAMES, + confusion_matrix(y_test, y_pred).tolist() # to convert np array to list. 
+ ) + + # report for hyperparameter tuning + hyper_tune = HyperTune() + hyper_tune.report_hyperparameter_tuning_metric( + hyperparameter_metric_tag='f1', + metric_value=f1) + + + # Write Pickle for convenience: This is what we trained and will have sklearn interface + pickle_output_path = model_output_path + '_sklearn.pkl' + with open(pickle_output_path, 'wb') as f: + pickle.dump(classifier, f) + + # Save using save_model method of XGB Classifier object + # -- this is important if we want to use the prebuilt xgb container for prediction + model_output_path = model_output_path + '.bst' + logging.info(f"Writing model to {model_output_path}") + classifier.save_model(model_output_path) + + if model: + model.metadata = { "containerSpec": { "imageUri": serving_container_image_uri } } + model.path = model_output_path + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument("--train_dataset_path", required=True) + parser.add_argument("--test_dataset_path", required=True) + parser.add_argument("--xgboost_param_max_depth", required=True) + parser.add_argument("--xgboost_param_learning_rate", required=True) + parser.add_argument("--xgboost_param_n_estimators", required=True) + parser.add_argument("--model_output_path", required=True) + + print(f'Got args: {" ".join(sys.argv)}') + args = parser.parse_args() + + train(train_dataset_path=args.train_dataset_path, + test_dataset_path=args.test_dataset_path, + xgboost_param_max_depth=int(args.xgboost_param_max_depth), + xgboost_param_learning_rate=float(args.xgboost_param_learning_rate), + xgboost_param_n_estimators=int(args.xgboost_param_n_estimators), + model_output_path=args.model_output_path, + serving_container_image_uri=PRED_CONTAINER) + +@dsl.component(base_image=f'{IMAGE}') +def xgb_train( + train_data: Input[Dataset], + test_data: Input[Dataset], + metrics: Output[Metrics], + model: Output[Model], + xgboost_param_max_depth: int, + xgboost_param_learning_rate: float, + xgboost_param_n_estimators: int, + metricsc: Output[ClassificationMetrics], + serving_container_image_uri: str +): + from train import train + + train( + train_dataset_path=train_data.path, + test_dataset_path=test_data.path, + xgboost_param_max_depth=xgboost_param_max_depth, + xgboost_param_learning_rate=xgboost_param_learning_rate, + xgboost_param_n_estimators=xgboost_param_n_estimators, + model_output_path=model.path, + metrics=metrics, + metricsc=metricsc, + model=model, + serving_container_image_uri=serving_container_image_uri) diff --git a/examples/vertex_mlops_enterprise/src/kfp_pipelines/train-eval-pipeline.png b/examples/vertex_mlops_enterprise/src/kfp_pipelines/train-eval-pipeline.png new file mode 100644 index 0000000000..1f1f2738ae Binary files /dev/null and b/examples/vertex_mlops_enterprise/src/kfp_pipelines/train-eval-pipeline.png differ diff --git a/examples/vertex_mlops_enterprise/src/model_training/data.py b/examples/vertex_mlops_enterprise/src/model_training/data.py new file mode 100644 index 0000000000..cd90ecc09a --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/data.py @@ -0,0 +1,49 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for reading data as tf.data.Dataset.""" + +import tensorflow as tf + +from src.common import features + + +def _gzip_reader_fn(filenames): + """Small utility returning a record reader that can read gzip'ed files.""" + return tf.data.TFRecordDataset(filenames, compression_type="GZIP") + + +def get_dataset(file_pattern, feature_spec, batch_size, epochs): + """Generates features and label for tuning/training. + Args: + file_pattern: input tfrecord file pattern. + feature_spec: a dictionary of feature specifications. + batch_size: representing the number of consecutive elements of returned + dataset to combine in a single batch + Returns: + A dataset that contains (features, indices) tuple where features is a + dictionary of Tensors, and indices is a single Tensor of label indices. + """ + + dataset = tf.data.experimental.make_batched_features_dataset( + file_pattern=file_pattern, + batch_size=batch_size, + features=feature_spec, + label_key=features.TARGET_FEATURE_NAME, + reader=_gzip_reader_fn, + num_epochs=2*epochs, + drop_final_batch=True, + ) + + + return dataset diff --git a/examples/vertex_mlops_enterprise/src/model_training/defaults.py b/examples/vertex_mlops_enterprise/src/model_training/defaults.py new file mode 100644 index 0000000000..1886ed6909 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/defaults.py @@ -0,0 +1,41 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Defaults for the model. + +These values can be tweaked to affect model training performance. 
+""" + + +HIDDEN_UNITS = [64, 32] +LEARNING_RATE = 0.0001 +BATCH_SIZE = 512 +NUM_EPOCHS = 10 +NUM_EVAL_STEPS = 100 + + +def update_hyperparams(hyperparams: dict) -> dict: + if "hidden_units" not in hyperparams: + hyperparams["hidden_units"] = HIDDEN_UNITS + else: + if not isinstance(hyperparams["hidden_units"], list): + hyperparams["hidden_units"] = [ + int(v) for v in hyperparams["hidden_units"].split(",") + ] + if "learning_rate" not in hyperparams: + hyperparams["learning_rate"] = LEARNING_RATE + if "batch_size" not in hyperparams: + hyperparams["batch_size"] = BATCH_SIZE + if "num_epochs" not in hyperparams: + hyperparams["num_epochs"] = NUM_EPOCHS + return hyperparams diff --git a/examples/vertex_mlops_enterprise/src/model_training/exporter.py b/examples/vertex_mlops_enterprise/src/model_training/exporter.py new file mode 100644 index 0000000000..bce5d31f4b --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/exporter.py @@ -0,0 +1,102 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for exporting the model for serving.""" + +import logging + +import tensorflow as tf +import tensorflow_transform as tft +import tensorflow_data_validation as tfdv +from tensorflow_transform.tf_metadata import schema_utils +import tensorflow.keras as keras + +from src.common import features + + +def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec): + """Returns a function that parses a serialized tf.Example and applies TFT.""" + + classifier.tft_layer = tft_output.transform_features_layer() + + @tf.function + def serve_tf_examples_fn(serialized_tf_examples): + """Returns the output to be used in the serving signature.""" + #for key in list(raw_feature_spec.keys()): + # if key not in features.FEATURE_NAMES: + # raw_feature_spec.pop(key) + if features.TARGET_FEATURE_NAME in raw_feature_spec.keys(): + raw_feature_spec.pop(features.TARGET_FEATURE_NAME) + + parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec) + + transformed_features = classifier.tft_layer(parsed_features) + logits = classifier(transformed_features) + probabilities = keras.activations.sigmoid(logits) + return {"probabilities": probabilities} + + return serve_tf_examples_fn + + +def _get_serve_features_fn(classifier, tft_output): + """Returns a function that accept a dictionary of features and applies TFT.""" + + classifier.tft_layer = tft_output.transform_features_layer() + + @tf.function + def serve_features_fn(raw_features): + """Returns the output to be used in the serving signature.""" + + transformed_features = classifier.tft_layer(raw_features) + logits = classifier(transformed_features) + neg_probabilities = keras.activations.sigmoid(logits) + pos_probabilities = 1 - neg_probabilities + probabilities = tf.concat([neg_probabilities, pos_probabilities], -1) + batch_size = tf.shape(probabilities)[0] + classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0) + return {"classes": classes, "scores": probabilities} + + return 
serve_features_fn + + +def export_serving_model( + classifier, serving_model_dir, raw_schema_location, tft_output_dir +): + + raw_schema = tfdv.load_schema_text(raw_schema_location) + raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec + + tft_output = tft.TFTransformOutput(tft_output_dir) + + features_input_signature = { + feature_name: tf.TensorSpec( + shape=(None, 1), dtype=spec.dtype, name=feature_name + ) + for feature_name, spec in raw_feature_spec.items() + if feature_name != features.TARGET_FEATURE_NAME + } + + signatures = { + "serving_default": _get_serve_features_fn( + classifier, tft_output + ).get_concrete_function(features_input_signature), + "serving_tf_example": _get_serve_tf_examples_fn( + classifier, tft_output, raw_feature_spec + ).get_concrete_function( + tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") + ), + } + + logging.info("Model export started...") + classifier.save(serving_model_dir, signatures=signatures) + logging.info("Model export completed.") diff --git a/examples/vertex_mlops_enterprise/src/model_training/model.py b/examples/vertex_mlops_enterprise/src/model_training/model.py new file mode 100644 index 0000000000..59220cbf0f --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/model.py @@ -0,0 +1,41 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A DNN keras classification model.""" + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers, activations +import logging + +from src.common import features + + +def create_model(feature_keys, hyperparams) -> keras.Model: + + inputs = [layers.Input(shape=(1,), name=f) for f in feature_keys if f != features.TARGET_FEATURE_NAME] + d = layers.concatenate(inputs) + + for units in hyperparams['hidden_units']: + d = layers.Dense(units, activation=activations.relu)(d) + + if 'dropout' in hyperparams: + d = tf.keras.layers.Dropout(hyperparams['dropout'])(d) + + outputs = layers.Dense(1, activation=activations.sigmoid)(d) + + model = keras.Model(inputs=inputs, outputs=outputs) + + model.summary(print_fn=logging.info) + + return model diff --git a/examples/vertex_mlops_enterprise/src/model_training/runner.py b/examples/vertex_mlops_enterprise/src/model_training/runner.py new file mode 100644 index 0000000000..4ff52712c9 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/runner.py @@ -0,0 +1,63 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""A run_fn method called by the TFX Trainer component.""" + +import os +import logging + +from src.model_training import trainer, exporter, defaults + + +# TFX Trainer will call this function. +def run_fn(fn_args): + """Train the model based on given args. + Args: + fn_args: Holds args used to train the model as name/value pairs. + """ + logging.info("Runner started...") + logging.info(f"fn_args: {fn_args}") + logging.info("") + + try: + log_dir = fn_args.model_run_dir + except KeyError: + log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") + + hyperparams = fn_args.hyperparameters + if not hyperparams: + hyperparams = dict() + + hyperparams = defaults.update_hyperparams(hyperparams) + logging.info("Hyperparameter:") + logging.info(hyperparams) + logging.info("") + + logging.info("Runner executing trainer...") + classifier = trainer.train( + train_data_dir=fn_args.train_files, + eval_data_dir=fn_args.eval_files, + tft_output_dir=fn_args.transform_output, + hyperparams=hyperparams, + log_dir=log_dir, + base_model_dir=fn_args.base_model, + ) + + logging.info("Runner executing exporter...") + exporter.export_serving_model( + classifier=classifier, + serving_model_dir=fn_args.serving_model_dir, + raw_schema_location=fn_args.schema_path, + tft_output_dir=fn_args.transform_output, + ) + logging.info("Runner completed.") diff --git a/examples/vertex_mlops_enterprise/src/model_training/trainer.py b/examples/vertex_mlops_enterprise/src/model_training/trainer.py new file mode 100644 index 0000000000..a6ec87aef7 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/model_training/trainer.py @@ -0,0 +1,142 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Train and evaluate the model.""" + +import logging +import tensorflow as tf +import tensorflow_transform as tft +from tensorflow import keras +import keras.backend as K + +from src.model_training import data, model + + +def train( + train_data_dir, + eval_data_dir, + tft_output_dir, + hyperparams, + log_dir, + base_model_dir=None, + run=None +): + + logging.info(f"Loading tft output from {tft_output_dir}") + tft_output = tft.TFTransformOutput(tft_output_dir) + transformed_feature_spec = tft_output.transformed_feature_spec() + + batch_size = int(hyperparams["batch_size"]) + epochs = int(hyperparams["num_epochs"]) + steps_per_epoch = int(hyperparams["steps_per_epoch"]) + + train_dataset = data.get_dataset( + train_data_dir, + transformed_feature_spec, + batch_size, + epochs + ) + + eval_dataset = data.get_dataset( + eval_data_dir, + transformed_feature_spec, + batch_size, + epochs + ) + + optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"]) + loss = keras.losses.BinaryCrossentropy(from_logits=True) + #loss = f1_weighted_loss + + acc_name = f'accuracy_{run}' if run else 'accuracy' + auc_name = f'auc_{run}' if run else 'auc' + metrics = [keras.metrics.BinaryAccuracy(name=acc_name), keras.metrics.AUC(curve='PR', name=auc_name)] + if run: + # we need one just called "accuracy" as well, to have one metric to optimize across runs for HP tuning + metrics.append(keras.metrics.BinaryAccuracy(name='accuracy')) + + + early_stopping = tf.keras.callbacks.EarlyStopping( + monitor="val_loss", patience=5, restore_best_weights=True + ) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir) + + classifier = model.create_model(transformed_feature_spec.keys(), hyperparams) + if base_model_dir: + try: + classifier = keras.load_model(base_model_dir) + except: + pass + + classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + logging.info(f"Model training started... 
steps per epoch = {steps_per_epoch}") + classifier.fit( + train_dataset, + epochs=hyperparams["num_epochs"], + steps_per_epoch=steps_per_epoch, + validation_data=eval_dataset, + callbacks=[early_stopping, tensorboard_callback], + ) + logging.info("Model training completed.") + + return classifier + + +def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams): + logging.info(f"Loading raw schema from {raw_schema_location}") + + logging.info(f"Loading tft output from {tft_output_dir}") + tft_output = tft.TFTransformOutput(tft_output_dir) + transformed_feature_spec = tft_output.transformed_feature_spec() + + logging.info("Model evaluation started...") + eval_dataset = data.get_dataset( + data_dir, + transformed_feature_spec, + int(hyperparams["batch_size"]), + 1 + ) + + evaluation_metrics = model.evaluate(eval_dataset) + logging.info("Model evaluation completed.") + + return evaluation_metrics + + +def f1(y_true, y_pred): + y_pred = K.round(y_pred) + tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0) + fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0) + fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0) + + p = tp / (tp + fp + K.epsilon()) + r = tp / (tp + fn + K.epsilon()) + + f1 = 2*p*r / (p+r+K.epsilon()) + f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1) + return K.mean(f1) + + +def f1_loss(y_true, y_pred): + + tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0) + fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0) + fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0) + + p = tp / (tp + fp + K.epsilon()) + r = tp / (tp + fn + K.epsilon()) + + f1 = 2*p*r / (p+r+K.epsilon()) + f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1) + return 1 - K.mean(f1) diff --git a/examples/vertex_mlops_enterprise/src/pipeline_triggering/main.py b/examples/vertex_mlops_enterprise/src/pipeline_triggering/main.py new file mode 100644 index 0000000000..fbd5da73d2 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/pipeline_triggering/main.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Cloud Function to be triggered by Pub/Sub.""" + +import os +import json +import logging +from google.cloud import aiplatform +from google.cloud import storage +import base64 + + +def trigger_pipeline(event, context): + + project = os.getenv("PROJECT") + region = os.getenv("REGION") + sa = os.getenv("SERVICE_ACCOUNT") + pipeline_name = os.getenv("PIPELINE_NAME") + gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION") + + if not project: + raise ValueError("Environment variable PROJECT is not set.") + if not region: + raise ValueError("Environment variable REGION is not set.") + if not gcs_pipeline_file_location: + raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") + if not sa: + raise ValueError("Environment variable SERVICE_ACCOUNT is not set.") + + storage_client = storage.Client() + + path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/") + bucket_name = path_parts[0] + blob_name = "/".join(path_parts[1:]) + + bucket = storage_client.bucket(bucket_name) + blob = storage.Blob(bucket=bucket, name=blob_name) + + if not blob.exists(storage_client): + raise ValueError(f"{gcs_pipeline_file_location} does not exist.") + + data = base64.b64decode(event["data"]).decode("utf-8") + logging.info(f"Event data: {data}") + + parameter_values = json.loads(data) + job = aiplatform.PipelineJob(display_name = pipeline_name, + template_path = gcs_pipeline_file_location, + parameter_values = parameter_values, + project = project, + location = region) + + response = job.submit(service_account=sa, + network=None) + + + logging.info(response) + + diff --git a/examples/vertex_mlops_enterprise/src/pipeline_triggering/requirements.txt b/examples/vertex_mlops_enterprise/src/pipeline_triggering/requirements.txt new file mode 100644 index 0000000000..1969b67ca1 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/pipeline_triggering/requirements.txt @@ -0,0 +1,3 @@ +kfp==1.8.12 +google-cloud-aiplatform +google-cloud-storage \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/prediction_cf/main.py b/examples/vertex_mlops_enterprise/src/prediction_cf/main.py new file mode 100644 index 0000000000..2a0798896a --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/prediction_cf/main.py @@ -0,0 +1,86 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Cloud Function to serve as the prediction endpoint.""" + +import os +import sys +import json +import logging +from google.cloud import aiplatform as vertex_ai + + +# To enable the Cloud Function to work out the +# next import +workdir = '/workspace' +if not workdir in sys.path: + sys.path.append(workdir) + +from src.feature_store import feature_store as fs + + +# if the trigger is HTTP, the argument is a request +def predict(request): + + def getenv(env_var_name): + v = os.getenv(env_var_name) + if not v: + raise ValueError(f"Environment variable {env_var_name} is not set.") + return v + + project = getenv("PROJECT") + region = getenv("REGION") + endpoint_name = getenv("ENDPOINT_NAME") + entity = getenv("ENTITY") + store_id = getenv("FEATURESTORE_ID") + + default_v27 = 0. + default_v28 = 0. + + raw_features = request.get_json(silent=False) + logging.error(f"Raw features: {raw_features}") + + # Get userid and query Feature Store + fs_features = {} + + if 'userid' in raw_features: + try: + fs_features = fs.read_features(project, region, store_id, entity, ['v27', 'v28'], raw_features['userid']) + except: + logging.warn("Feature store is not available") + if not 'v27' in fs_features: + logging.error(f'User {raw_features["userid"]} not present in Feature Store, using defaults') + fs_features['v27'] = default_v27 + if not 'v28' in fs_features: + fs_features['v28'] = default_v28 + else: + logging.error('No userid provided, using defaults') + return {'error': 'No userid'} + + if 'V1' not in raw_features: + return {'error': 'Missing features. Expect V1 to V26 and userid.'} + + feature_data = raw_features.copy() + del feature_data['userid'] + feature_data['V27'] = [fs_features['v27']] + feature_data['V28'] = [fs_features['v28']] + + # Query model on Vertex AI Endpoint + endpoint = vertex_ai.Endpoint(endpoint_name, project, region) + logging.error(f"Calling endpoint with: {feature_data}") + prediction = endpoint.predict([feature_data]).predictions[0] + logging.error(f"Prediction: {prediction}") + + return json.dumps(prediction) + + diff --git a/examples/vertex_mlops_enterprise/src/preprocessing/etl.py b/examples/vertex_mlops_enterprise/src/preprocessing/etl.py new file mode 100644 index 0000000000..b135a3a43f --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/preprocessing/etl.py @@ -0,0 +1,207 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Data preprocessing pipelines.""" + +import os + +import tensorflow_transform as tft +import tensorflow_data_validation as tfdv +import apache_beam as beam +import tensorflow_transform.beam as tft_beam +from tensorflow_transform.tf_metadata import dataset_metadata +from tensorflow_transform.tf_metadata import schema_utils + + +from src.preprocessing import transformations + +RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt" + + +def parse_bq_record(bq_record): + output = {} + for key in bq_record: + output[key] = [bq_record[key]] + return output + + +def split_dataset(bq_row, num_partitions, ratio): + import json + + assert num_partitions == len(ratio) + bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio) + total = 0 + for i, part in enumerate(ratio): + total += part + if bucket < total: + return i + return len(ratio) - 1 + + +def run_transform_pipeline(args): + + pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) + + raw_data_query = args["raw_data_query"] + write_raw_data = args["write_raw_data"] + exported_data_prefix = args["exported_data_prefix"] + transformed_data_prefix = args["transformed_data_prefix"] + transform_artifact_dir = args["transform_artifact_dir"] + temporary_dir = args["temporary_dir"] + gcs_location = args["gcs_location"] + project = args["project"] + + source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION) + raw_feature_spec = schema_utils.schema_as_feature_spec( + source_raw_schema + ).feature_spec + + raw_metadata = dataset_metadata.DatasetMetadata( + schema_utils.schema_from_feature_spec(raw_feature_spec) + ) + + with beam.Pipeline(options=pipeline_options) as pipeline: + with tft_beam.Context(temporary_dir): + + # Read raw BigQuery data. + raw_train_data, raw_eval_data = ( + pipeline + | "Read Raw Data" + >> beam.io.ReadFromBigQuery( + query=raw_data_query, + project=project, + use_standard_sql=True, + gcs_location=gcs_location, + ) + | "Parse Data" >> beam.Map(parse_bq_record) + | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2]) + ) + + # Create a train_dataset from the data and schema. + raw_train_dataset = (raw_train_data, raw_metadata) + + # Analyze and transform raw_train_dataset to produced transformed_train_dataset and transform_fn. + transformed_train_dataset, transform_fn = ( + raw_train_dataset + | "Analyze & Transform" + >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn) + ) + + # Get data and schema separately from the transformed_dataset. + transformed_train_data, transformed_metadata = transformed_train_dataset + + # write transformed train data. + _ = ( + transformed_train_data + | "Write Transformed Train Data" + >> beam.io.tfrecordio.WriteToTFRecord( + file_path_prefix=os.path.join( + transformed_data_prefix, "train/data" + ), + file_name_suffix=".gz", + coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), + ) + ) + + # Create a eval_dataset from the data and schema. + raw_eval_dataset = (raw_eval_data, raw_metadata) + + # Transform raw_eval_dataset to produced transformed_eval_dataset using transform_fn. + transformed_eval_dataset = ( + raw_eval_dataset, + transform_fn, + ) | "Transform" >> tft_beam.TransformDataset() + + # Get data from the transformed_eval_dataset. + transformed_eval_data, _ = transformed_eval_dataset + + # write transformed train data. 
+ _ = ( + transformed_eval_data + | "Write Transformed Eval Data" + >> beam.io.tfrecordio.WriteToTFRecord( + file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"), + file_name_suffix=".gz", + coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), + ) + ) + + # Write transform_fn. + _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn( + transform_artifact_dir + ) + + if write_raw_data: + # write raw eval data. + _ = ( + raw_eval_data + | "Write Raw Eval Data" + >> beam.io.tfrecordio.WriteToTFRecord( + file_path_prefix=os.path.join(exported_data_prefix, "data"), + file_name_suffix=".tfrecord", + coder=tft.coders.ExampleProtoCoder(raw_metadata.schema), + ) + ) + + +def convert_to_jsonl(bq_record): + import json + + output = {} + for key in bq_record: + output[key] = [bq_record[key]] + return json.dumps(output) + + +def run_extract_pipeline(args): + + pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) + + sql_query = args["sql_query"] + exported_data_prefix = args["exported_data_prefix"] + temporary_dir = args["temporary_dir"] + gcs_location = args["gcs_location"] + project = args["project"] + + with beam.Pipeline(options=pipeline_options) as pipeline: + with tft_beam.Context(temporary_dir): + + # Read BigQuery data. + raw_data = ( + pipeline + | "Read Data" + >> beam.io.ReadFromBigQuery( + query=sql_query, + project=project, + use_standard_sql=True, + gcs_location=gcs_location, + ) + | "Parse Data" >> beam.Map(convert_to_jsonl) + ) + + # Write raw data to GCS as JSONL files. + _ = raw_data | "Write Data" >> beam.io.WriteToText( + file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl" + ) + + +def parse_prediction_results(jsonl): + import uuid + import json + + prediction_results = json.loads(jsonl)["prediction"] + prediction_id = str(uuid.uuid4()) + scores = prediction_results["scores"] + classes = prediction_results["classes"] + + return {"prediction_id": prediction_id, "scores": scores, "classes": classes} diff --git a/examples/vertex_mlops_enterprise/src/preprocessing/transformations.py b/examples/vertex_mlops_enterprise/src/preprocessing/transformations.py new file mode 100644 index 0000000000..a2f2a9d1eb --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/preprocessing/transformations.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TensorFlow Transform preprocessing function.""" + +import tensorflow_transform as tft + + +def preprocessing_fn(inputs): + output_dict = { + 'Amount': tft.scale_to_z_score(inputs['Amount']), + 'Class': inputs['Class'] + } + + # Let's normalize all the columns that start with V + for col in inputs.keys(): + if col.startswith("V"): + output_dict[col] = tft.scale_to_z_score(inputs[col]) + + return output_dict \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/src/raw_schema/schema.pbtxt b/examples/vertex_mlops_enterprise/src/raw_schema/schema.pbtxt new file mode 100644 index 0000000000..9a64790266 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/raw_schema/schema.pbtxt @@ -0,0 +1,390 @@ +feature { + name: "V1" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V2" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V3" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V4" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V5" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V6" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V7" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V8" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V9" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V10" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V11" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V12" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V13" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V14" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V15" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V16" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V17" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V18" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V19" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V20" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V21" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V22" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + 
dim { + size: 1 + } + } +} +feature { + name: "V23" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V24" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V25" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V26" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V27" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "V28" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "Amount" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} +feature { + name: "Class" + type: FLOAT + presence { + min_fraction: 1.0 + min_count: 1 + } + shape { + dim { + size: 1 + } + } +} diff --git a/examples/vertex_mlops_enterprise/src/tests/__init__.py b/examples/vertex_mlops_enterprise/src/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/vertex_mlops_enterprise/src/tests/datasource_utils_tests.py b/examples/vertex_mlops_enterprise/src/tests/datasource_utils_tests.py new file mode 100644 index 0000000000..8536abe744 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tests/datasource_utils_tests.py @@ -0,0 +1,124 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test utilities for generating BigQuery data querying scripts.""" + +import sys +import os +import logging +from google.cloud import bigquery + +from src.common import datasource_utils + +root = logging.getLogger() +root.setLevel(logging.INFO) +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +root.addHandler(handler) + +LIMIT = 100 + +TARGET_COLUMN = "Class" + +EXPECTED_TRAINING_COLUMNS = [ + "V1", + "V2", + "V3", + "V4", + "V5", + "V6", + "V7", + "V8", + "V9", + "V10", + "V11", + "V12", + "V13", + "V14", + "V15", + "V16", + "V17", + "V18", + "V19", + "V20", + "V21", + "V22", + "V23", + "V24", + "V25", + "V26", + "V27", + "V28", + "Amount", + "Class", +] + + +def test_training_query(): + + project = os.getenv("PROJECT") + location = os.getenv("BQ_LOCATION") + bq_dataset_name = os.getenv("BQ_DATASET_NAME") + bq_table_name = os.getenv("ML_TABLE") + + assert project, "Environment variable PROJECT is None!" + assert location, "Environment variable BQ_LOCATION is None!" + assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!" + assert bq_table_name, "Environment variable ML_TABLE is None!" 
+
+    logging.info("BigQuery Source: %s.%s.%s", project, bq_dataset_name,
+                 bq_table_name)
+
+    query = datasource_utils.get_source_query(
+        bq_dataset_name=bq_dataset_name,
+        bq_table_name=bq_table_name,
+        ml_use="UNASSIGNED",
+        limit=LIMIT,
+    )
+
+    bq_client = bigquery.Client(project=project, location=location)
+    df = bq_client.query(query).to_dataframe()
+    columns = set(df.columns)
+    assert columns == set(EXPECTED_TRAINING_COLUMNS)
+    assert df.shape == (LIMIT, len(columns))
+
+
+def test_serving_query():
+
+    project = os.getenv("PROJECT")
+    location = os.getenv("BQ_LOCATION")
+    bq_dataset_name = os.getenv("BQ_DATASET_NAME")
+    bq_table_name = os.getenv("ML_TABLE")
+
+    assert project, "Environment variable PROJECT is None!"
+    assert location, "Environment variable BQ_LOCATION is None!"
+    assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!"
+    assert bq_table_name, "Environment variable ML_TABLE is None!"
+
+    logging.info("BigQuery Source: %s.%s.%s", project, bq_dataset_name,
+                 bq_table_name)
+
+    query = datasource_utils.get_source_query(
+        bq_dataset_name=bq_dataset_name,
+        bq_table_name=bq_table_name,
+        ml_use=None,
+        limit=LIMIT,
+    )
+
+    bq_client = bigquery.Client(project=project, location=location)
+    df = bq_client.query(query).to_dataframe()
+    columns = set(df.columns)
+    # Copy the list so the module-level EXPECTED_TRAINING_COLUMNS is not mutated.
+    expected_serving_columns = list(EXPECTED_TRAINING_COLUMNS)
+    expected_serving_columns.remove(TARGET_COLUMN)
+    assert columns == set(expected_serving_columns)
+    assert df.shape == (LIMIT, len(columns))
diff --git a/examples/vertex_mlops_enterprise/src/tests/model_deployment_tests.py b/examples/vertex_mlops_enterprise/src/tests/model_deployment_tests.py
new file mode 100644
index 0000000000..fe15dc42f2
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/src/tests/model_deployment_tests.py
@@ -0,0 +1,198 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test an uploaded model to Vertex AI.""" + +import os +import logging +import tensorflow as tf +import sys + +from google.cloud import aiplatform as vertex_ai + +# configure logging to print to stdout +root = logging.getLogger() +root.setLevel(logging.DEBUG) +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.DEBUG) +root.addHandler(handler) + +test_instance = { + "V1": [-0.906611], + "V2": [-0.906611], + "V3": [-0.906611], + "V4": [-0.906611], + "V5": [-0.906611], + "V6": [-0.906611], + "V7": [-0.906611], + "V8": [-0.906611], + "V9": [-0.906611], + "V10": [-0.906611], + "V11": [-0.906611], + "V12": [-0.906611], + "V13": [-0.906611], + "V14": [-0.906611], + "V15": [-0.906611], + "V16": [-0.906611], + "V17": [-0.906611], + "V18": [-0.906611], + "V19": [-0.906611], + "V20": [-0.906611], + "V21": [-0.906611], + "V22": [-0.906611], + "V23": [-0.906611], + "V24": [-0.906611], + "V25": [-0.906611], + "V26": [-0.906611], + "V27": [-0.906611], + "V28": [-0.906611], + "Amount": [15.99], +} + +SERVING_DEFAULT_SIGNATURE_NAME = "serving_default" + + +def test_model_artifact(): + + feature_types = { + "V1": tf.dtypes.float32, + "V2": tf.dtypes.float32, + "V3": tf.dtypes.float32, + "V4": tf.dtypes.float32, + "V5": tf.dtypes.float32, + "V6": tf.dtypes.float32, + "V7": tf.dtypes.float32, + "V8": tf.dtypes.float32, + "V9": tf.dtypes.float32, + "V10": tf.dtypes.float32, + "V11": tf.dtypes.float32, + "V12": tf.dtypes.float32, + "V13": tf.dtypes.float32, + "V14": tf.dtypes.float32, + "V15": tf.dtypes.float32, + "V16": tf.dtypes.float32, + "V17": tf.dtypes.float32, + "V18": tf.dtypes.float32, + "V19": tf.dtypes.float32, + "V20": tf.dtypes.float32, + "V21": tf.dtypes.float32, + "V22": tf.dtypes.float32, + "V23": tf.dtypes.float32, + "V24": tf.dtypes.float32, + "V25": tf.dtypes.float32, + "V26": tf.dtypes.float32, + "V27": tf.dtypes.float32, + "V28": tf.dtypes.float32, + "Amount": tf.dtypes.float32, + } + + new_test_instance = {} + for key, instance in test_instance.items(): + new_test_instance[key] = tf.constant(instance, + dtype=feature_types[key]) + + print(new_test_instance) + + project = os.getenv("PROJECT") + region = os.getenv("REGION") + model_display_name = os.getenv("MODEL_DISPLAY_NAME") + + assert project, "Environment variable PROJECT is None!" + assert region, "Environment variable REGION is None!" + assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" + + vertex_ai.init( + project=project, + location=region, + ) + + models = vertex_ai.Model.list(filter=f"display_name={model_display_name}", + order_by="update_time") + + assert models, f"No model with display name {model_display_name} exists!" + + model = models[-1] + artifact_uri = model.gca_resource.artifact_uri + logging.info("Model artifact uri: %s", artifact_uri) + assert tf.io.gfile.exists(artifact_uri), \ + f"Model artifact uri {artifact_uri} does not exist!" + + saved_model = tf.saved_model.load(artifact_uri) + logging.info("Model loaded successfully.") + + assert (SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures + ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!" + + #Disabled until function is fixed +''' + prediction_fn = saved_model.signatures["serving_default"] + + predictions = prediction_fn(**new_test_instance) + logging.info("Model produced predictions.") + + keys = ["classes", "scores"] + for key in keys: + assert key in predictions, f"{key} in prediction outputs!" 
+
+    assert predictions["classes"].shape == (
+        1,
+        2,
+    ), f"Invalid output classes shape: {predictions['classes'].shape}!"
+    assert predictions["scores"].shape == (
+        1,
+        2,
+    ), f"Invalid output scores shape: {predictions['scores'].shape}!"
+    logging.info("Prediction output: %s", predictions)
+'''
+
+
+def test_model_endpoint():
+
+    project = os.getenv("PROJECT")
+    region = os.getenv("REGION")
+    model_display_name = os.getenv("MODEL_DISPLAY_NAME")
+    endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME")
+
+    assert project, "Environment variable PROJECT is None!"
+    assert region, "Environment variable REGION is None!"
+    assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
+    assert endpoint_display_name, \
+        "Environment variable ENDPOINT_DISPLAY_NAME is None!"
+
+    vertex_ai.init(
+        project=project,
+        location=region,
+    )
+
+    endpoints = vertex_ai.Endpoint.list(
+        filter=f"display_name={endpoint_display_name}", order_by="update_time")
+    assert (
+        endpoints
+    ), f"Endpoint with display name {endpoint_display_name} " + \
+        f"does not exist in region {region}!"
+
+    endpoint = endpoints[-1]
+    logging.info("Calling endpoint: %s", endpoint)
+
+    prediction = endpoint.predict([test_instance]).predictions[0]
+
+    keys = ["classes", "scores"]
+    for key in keys:
+        assert key in prediction, f"{key} not in prediction outputs!"
+
+    assert (len(prediction["classes"]) == 2
+            ), f"Invalid number of output classes: {len(prediction['classes'])}!"
+    assert (len(prediction["scores"]) == 2
+            ), f"Invalid number of output scores: {len(prediction['scores'])}!"
+
+    logging.info("Prediction output: %s", prediction)
diff --git a/examples/vertex_mlops_enterprise/src/tests/model_tests.py b/examples/vertex_mlops_enterprise/src/tests/model_tests.py
new file mode 100644
index 0000000000..a13e56e216
--- /dev/null
+++ b/examples/vertex_mlops_enterprise/src/tests/model_tests.py
@@ -0,0 +1,82 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test model functions.""" + +import sys +import logging +import tensorflow as tf + +from src.model_training import model, defaults + +root = logging.getLogger() +root.setLevel(logging.INFO) +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +root.addHandler(handler) + +EXPECTED_HYPERPARAMS_KEYS = [ + "hidden_units", + "learning_rate", + "batch_size", + "num_epochs", +] + + +def test_hyperparams_defaults(): + hyperparams = {"hidden_units": [64, 32]} + + hyperparams = defaults.update_hyperparams(hyperparams) + assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS) + + +def test_create_model(): + + hyperparams = hyperparams = defaults.update_hyperparams({}) + + model_inputs = { + "V1": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V2": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V3": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V4": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V5": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V6": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V7": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V8": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V9": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V10": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V11": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V12": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V13": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V14": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V15": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V16": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V17": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V18": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V19": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V20": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V21": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V22": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V23": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V24": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V25": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V26": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V27": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "V28": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), + "Amount": tf.convert_to_tensor([10, 100, 1000]), + } + + classifier = model.create_model(model_inputs.keys(), hyperparams) + model_outputs = classifier(model_inputs) # .numpy() + assert model_outputs.shape == (3, 1) + assert model_outputs.dtype == "float32" diff --git a/examples/vertex_mlops_enterprise/src/tests/pipeline_deployment_tests.py b/examples/vertex_mlops_enterprise/src/tests/pipeline_deployment_tests.py new file mode 100644 index 0000000000..805b68ef1e --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tests/pipeline_deployment_tests.py @@ -0,0 +1,91 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test training pipeline using local runner.""" + +import sys +import os +from tfx.orchestration.local.local_dag_runner import LocalDagRunner +import tensorflow as tf +from ml_metadata.proto import metadata_store_pb2 +import logging + +from src.tfx_pipelines import config +from src.tfx_pipelines import training_pipeline + +root = logging.getLogger() +root.setLevel(logging.INFO) +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +root.addHandler(handler) + +MLMD_SQLLITE = "mlmd.sqllite" +NUM_EPOCHS = 1 +BATCH_SIZE = 512 +LEARNING_RATE = 0.001 +HIDDEN_UNITS = "128,128" + + +def test_e2e_pipeline(): + + project = os.getenv("PROJECT") + region = os.getenv("REGION") + model_display_name = os.getenv("MODEL_DISPLAY_NAME") + dataset_display_name = os.getenv("VERTEX_DATASET_NAME") + gcs_location = os.getenv("GCS_LOCATION") + model_registry = os.getenv("MODEL_REGISTRY_URI") + upload_model = os.getenv("UPLOAD_MODEL") + + assert project, "Environment variable PROJECT is None!" + assert region, "Environment variable REGION is None!" + assert dataset_display_name, \ + "Environment variable VERTEX_DATASET_NAME is None!" + assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" + assert gcs_location, "Environment variable GCS_LOCATION is None!" + assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!" + + logging.info("upload_model: %s", upload_model) + if tf.io.gfile.exists(gcs_location): + tf.io.gfile.rmtree(gcs_location) + logging.info("Pipeline e2e test artifacts stored in: %s", gcs_location) + + if tf.io.gfile.exists(MLMD_SQLLITE): + tf.io.gfile.remove(MLMD_SQLLITE) + + metadata_connection_config = metadata_store_pb2.ConnectionConfig() + metadata_connection_config.sqlite.filename_uri = MLMD_SQLLITE + metadata_connection_config.sqlite.connection_mode = 3 + logging.info("ML metadata store is ready.") + + pipeline_root = os.path.join( + config.ARTIFACT_STORE_URI, + config.PIPELINE_NAME, + ) + + runner = LocalDagRunner() + + pipeline = training_pipeline.create_pipeline( + pipeline_root=pipeline_root, + num_epochs=NUM_EPOCHS, + batch_size=BATCH_SIZE, + steps_per_epoch=100, + learning_rate=LEARNING_RATE, + hidden_units=HIDDEN_UNITS, + metadata_connection_config=metadata_connection_config, + ) + + runner.run(pipeline) + + logging.info( + "Model output: %s", os.path.join(model_registry, model_display_name)) + assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name)) diff --git a/examples/vertex_mlops_enterprise/src/tfx_pipelines/components.py b/examples/vertex_mlops_enterprise/src/tfx_pipelines/components.py new file mode 100644 index 0000000000..fd68650d48 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tfx_pipelines/components.py @@ -0,0 +1,133 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TFX Custom Python Components.""" + + +import sys +import os +import json +import logging +import tensorflow as tf + +from tfx.types import artifact_utils +from tfx.utils import io_utils +from tfx.components.util import model_utils +from tfx.dsl.component.experimental.decorators import component +from tfx.dsl.component.experimental.annotations import ( + InputArtifact, + OutputArtifact, + Parameter, +) +from tfx.types.standard_artifacts import HyperParameters, ModelBlessing +from tfx.types.experimental.simple_artifacts import File as UploadedModel + +from google.cloud import aiplatform as vertex_ai + + +SCRIPT_DIR = os.path.dirname( + os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) +) +sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) + + +HYPERPARAM_FILENAME = "hyperparameters.json" +SERVING_DATA_PREFIX = "serving-data-" +PREDICTION_RESULTS_PREFIX = "prediction.results-*" + + +@component +def hyperparameters_gen( + num_epochs: Parameter[int], + batch_size: Parameter[int], + steps_per_epoch: Parameter[int], + learning_rate: Parameter[float], + hidden_units: Parameter[str], + hyperparameters: OutputArtifact[HyperParameters], +): + + hp_dict = dict() + hp_dict["num_epochs"] = num_epochs + hp_dict["steps_per_epoch"] = steps_per_epoch + hp_dict["batch_size"] = batch_size + hp_dict["learning_rate"] = learning_rate + hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")] + logging.info(f"Hyperparameters: {hp_dict}") + + hyperparams_uri = os.path.join( + artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME + ) + io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict)) + logging.info(f"Hyperparameters are written to: {hyperparams_uri}") + + +@component +def vertex_model_uploader( + project: Parameter[str], + region: Parameter[str], + model_display_name: Parameter[str], + pushed_model_location: Parameter[str], + serving_image_uri: Parameter[str], + model_blessing: InputArtifact[ModelBlessing], + uploaded_model: OutputArtifact[UploadedModel], + explanation_config: Parameter[str]="", + labels: Parameter[str]="", +): + + vertex_ai.init(project=project, location=region) + + blessing = artifact_utils.get_single_instance([model_blessing]) + if not model_utils.is_model_blessed(blessing): + logging.info(f"Model is not uploaded to Vertex AI because it was not blessed by the evaluator.") + uploaded_model.set_int_custom_property("uploaded", 0) + return + + pushed_model_dir = os.path.join( + pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1] + ) + + logging.info(f"Model registry location: {pushed_model_dir}") + + try: + explanation_config = json.loads(explanation_config) + explanation_metadata = vertex_ai.explain.ExplanationMetadata( + inputs=explanation_config["inputs"], + outputs=explanation_config["outputs"], + ) + explanation_parameters = vertex_ai.explain.ExplanationParameters( + explanation_config["params"] + ) + except: + explanation_metadata = None + explanation_parameters = None + + try: + labels = json.loads(labels) + except: + labels = None + + vertex_model = vertex_ai.Model.upload( 
+ display_name=model_display_name, + artifact_uri=pushed_model_dir, + serving_container_image_uri=serving_image_uri, + parameters_schema_uri=None, + instance_schema_uri=None, + explanation_metadata=explanation_metadata, + explanation_parameters=explanation_parameters, + labels=labels + ) + + model_uri = vertex_model.gca_resource.name + logging.info(f"Model uploaded to Vertex AI: {model_uri}") + uploaded_model.set_string_custom_property("model_uri", model_uri) + uploaded_model.set_int_custom_property("uploaded", 1) diff --git a/examples/vertex_mlops_enterprise/src/tfx_pipelines/config.py b/examples/vertex_mlops_enterprise/src/tfx_pipelines/config.py new file mode 100644 index 0000000000..8b9e5d1f24 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tfx_pipelines/config.py @@ -0,0 +1,130 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TFX pipeline configurations.""" + +import os +from tfx import v1 as tfx + +PROJECT = os.getenv("PROJECT", "") +REGION = os.getenv("REGION", "") +GCS_LOCATION = os.getenv("GCS_LOCATION", "") +DOCKER_REPO_NAME = os.getenv("DOCKER_REPO_NAME", "docker-repo") + +ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts") +MODEL_REGISTRY_URI = os.getenv( + "MODEL_REGISTRY_URI", + os.path.join(GCS_LOCATION, "model_registry"), +) + +VERTEX_DATASET_NAME = os.getenv("VERTEX_DATASET_NAME", "creditcards") +MODEL_DISPLAY_NAME = os.getenv( + "MODEL_DISPLAY_NAME", f"{VERTEX_DATASET_NAME}-classifier" +) +PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline") + +ML_USE_COLUMN = "ml_use" +EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"]) +TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0") +TEST_LIMIT = os.getenv("TEST_LIMIT", "0") +SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0") + +NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4") +NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1") +ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8") + +USE_KFP_SA = os.getenv("USE_KFP_SA", "False") + +TFX_IMAGE_URI = os.getenv( + "TFX_IMAGE_URI", f"{REGION}-docker.pkg.dev/{PROJECT}/{DOCKER_REPO_NAME}/vertex:latest" +) + +DATAFLOW_IMAGE_URI = os.getenv( + "DATAFLOW_IMAGE_URI", f"{REGION}-docker.pkg.dev/{PROJECT}/{DOCKER_REPO_NAME}/dataflow:latest" +) + +BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner") +SERVICE_ACCOUNT = os.getenv("SERVICE_ACCOUNT", "") +SUBNETWORK = os.getenv("SUBNETWORK", "") + +BEAM_DIRECT_PIPELINE_ARGS = [ + f"--project={PROJECT}", + f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", +] +BEAM_DATAFLOW_PIPELINE_ARGS = [ + f"--project={PROJECT}", + f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", + f"--region={REGION}", + f"--runner={BEAM_RUNNER}", + f"--service_account_email={SERVICE_ACCOUNT}", + f"--no_use_public_ips", + f"--subnetwork={SUBNETWORK}", + f"--sdk_container_image={DATAFLOW_IMAGE_URI}" +] + +TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local") +VERTEX_TRAINING_ARGS = { + 'project': PROJECT, + 'worker_pool_specs': [{ + 'machine_spec': { + 'machine_type': 
'n1-standard-4', +# 'accelerator_type': 'NVIDIA_TESLA_K80', +# 'accelerator_count': 1 + }, + 'replica_count': 1, + 'container_spec': { + 'image_uri': TFX_IMAGE_URI, + }, + }], +} +VERTEX_TRAINING_CONFIG = { + tfx.extensions.google_cloud_ai_platform.ENABLE_UCAIP_KEY: True, + tfx.extensions.google_cloud_ai_platform.UCAIP_REGION_KEY: REGION, + tfx.extensions.google_cloud_ai_platform.TRAINING_ARGS_KEY: VERTEX_TRAINING_ARGS, + 'use_gpu': False, +} + +SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-5") +SERVING_IMAGE_URI = f"us-docker.pkg.dev/vertex-ai/prediction/{SERVING_RUNTIME}:latest" + +BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv( + "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us" +) +BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv( + "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep" +) +BATCH_PREDICTION_BEAM_ARGS = { + "runner": f"{BEAM_RUNNER}", + "temporary_dir": os.path.join(GCS_LOCATION, "temp"), + "gcs_location": os.path.join(GCS_LOCATION, "temp"), + "project": PROJECT, + "region": REGION, + "setup_file": "./setup.py", +} +BATCH_PREDICTION_JOB_RESOURCES = { + "machine_type": "n1-standard-2", + #'accelerator_count': 1, + #'accelerator_type': 'NVIDIA_TESLA_T4' + "starting_replica_count": 1, + "max_replica_count": 10, +} +DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions" + +ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0") +UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1") + +os.environ["PROJECT"] = PROJECT +os.environ["PIPELINE_NAME"] = PIPELINE_NAME +os.environ["DATAFLOW_IMAGE_URI"] = DATAFLOW_IMAGE_URI +os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI +os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI diff --git a/examples/vertex_mlops_enterprise/src/tfx_pipelines/runner.py b/examples/vertex_mlops_enterprise/src/tfx_pipelines/runner.py new file mode 100644 index 0000000000..9bfa4fd0e2 --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tfx_pipelines/runner.py @@ -0,0 +1,68 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines.""" + + +import os +from tfx.orchestration import data_types +from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner + + +from src.tfx_pipelines import config, training_pipeline +from src.model_training import defaults + + +def compile_training_pipeline(pipeline_definition_file): + + pipeline_root = os.path.join( + config.ARTIFACT_STORE_URI, + config.PIPELINE_NAME, + ) + managed_pipeline = training_pipeline.create_pipeline( + pipeline_root=pipeline_root, + num_epochs=data_types.RuntimeParameter( + name="num_epochs", + default=defaults.NUM_EPOCHS, + ptype=int, + ), + batch_size=data_types.RuntimeParameter( + name="batch_size", + default=defaults.BATCH_SIZE, + ptype=int, + ), + steps_per_epoch=data_types.RuntimeParameter( + name="steps_per_epoch", + default=int(config.TRAIN_LIMIT) // defaults.BATCH_SIZE, + ptype=int, + ), + learning_rate=data_types.RuntimeParameter( + name="learning_rate", + default=defaults.LEARNING_RATE, + ptype=float, + ), + hidden_units=data_types.RuntimeParameter( + name="hidden_units", + default=",".join(str(u) for u in defaults.HIDDEN_UNITS), + ptype=str, + ), + ) + + runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( + config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( + default_image=config.TFX_IMAGE_URI + ), + output_filename=pipeline_definition_file, + ) + + return runner.run(managed_pipeline, write_out=True) diff --git a/examples/vertex_mlops_enterprise/src/tfx_pipelines/training_pipeline.py b/examples/vertex_mlops_enterprise/src/tfx_pipelines/training_pipeline.py new file mode 100644 index 0000000000..4a3d39f0fa --- /dev/null +++ b/examples/vertex_mlops_enterprise/src/tfx_pipelines/training_pipeline.py @@ -0,0 +1,320 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TFX training pipeline definition.""" + +import os +import sys +import logging +import json + +import tensorflow_model_analysis as tfma + +from ml_metadata.proto import metadata_store_pb2 + +from tfx.proto import example_gen_pb2, transform_pb2, pusher_pb2 +from tfx.types import Channel, standard_artifacts +from tfx.orchestration import pipeline, data_types +from tfx.dsl.components.common.importer import Importer +from tfx.dsl.components.common.resolver import Resolver +from tfx.dsl.experimental import latest_artifacts_resolver +from tfx.dsl.experimental import latest_blessed_model_resolver +from tfx.v1.extensions.google_cloud_big_query import BigQueryExampleGen +from tfx.v1.extensions.google_cloud_ai_platform import Trainer as VertexTrainer +from tfx.v1.components import ( + StatisticsGen, + ExampleValidator, + Transform, + Trainer, + Evaluator, + Pusher +) + + +SCRIPT_DIR = os.path.dirname( + os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) +) +sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) + + +from src.tfx_pipelines import config +from src.tfx_pipelines import components as custom_components +from src.common import features, datasource_utils + + +RAW_SCHEMA_DIR = "src/raw_schema" +TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py" +TRAIN_MODULE_FILE = "src/model_training/runner.py" + + +def create_pipeline( + pipeline_root: str, + num_epochs: data_types.RuntimeParameter, + batch_size: data_types.RuntimeParameter, + steps_per_epoch: data_types.RuntimeParameter, + learning_rate: data_types.RuntimeParameter, + hidden_units: data_types.RuntimeParameter, + metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, +): + + # Hyperparameter generation. + hyperparams_gen = custom_components.hyperparameters_gen( + num_epochs=num_epochs, + batch_size=batch_size, + steps_per_epoch=steps_per_epoch, + learning_rate=learning_rate, + hidden_units=hidden_units + ).with_id("HyperparamsGen") + + # Get train source query. + logging.info(f'Using dataset {config.VERTEX_DATASET_NAME}') + train_sql_query = datasource_utils.get_training_source_query( + config.PROJECT, + config.REGION, + config.VERTEX_DATASET_NAME, + ml_use="UNASSIGNED", + limit=int(config.TRAIN_LIMIT), + ) + + train_output_config = example_gen_pb2.Output( + split_config=example_gen_pb2.SplitConfig( + splits=[ + example_gen_pb2.SplitConfig.Split( + name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS) + ), + example_gen_pb2.SplitConfig.Split( + name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS) + ), + ] + ) + ) + + # Train example generation. + train_example_gen = BigQueryExampleGen( + query=train_sql_query, + output_config=train_output_config, + ).with_id("TrainDataGen") + + # Get test source query. + test_sql_query = datasource_utils.get_training_source_query( + config.PROJECT, + config.REGION, + config.VERTEX_DATASET_NAME, + ml_use="TEST", + limit=int(config.TEST_LIMIT), + ) + + test_output_config = example_gen_pb2.Output( + split_config=example_gen_pb2.SplitConfig( + splits=[ + example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1), + ] + ) + ) + + # Test example generation. + test_example_gen = BigQueryExampleGen( + query=test_sql_query, + output_config=test_output_config, + ).with_id("TestDataGen") + + # Schema importer. + schema_importer = Importer( + source_uri=RAW_SCHEMA_DIR, + artifact_type=standard_artifacts.Schema, + ).with_id("SchemaImporter") + + # Statistics generation. 
+ statistics_gen = StatisticsGen(examples=train_example_gen.outputs["examples"]).with_id( + "StatisticsGen" + ) + + # Example validation. + example_validator = ExampleValidator( + statistics=statistics_gen.outputs["statistics"], + schema=schema_importer.outputs["result"], + ).with_id("ExampleValidator") + + # Data transformation. + transform = Transform( + examples=train_example_gen.outputs["examples"], + schema=schema_importer.outputs["result"], + module_file=TRANSFORM_MODULE_FILE, + # This is a temporary workaround to run on Dataflow. + force_tf_compat_v1=config.BEAM_RUNNER == "DataflowRunner", + splits_config=transform_pb2.SplitsConfig( + analyze=["train"], transform=["train", "eval"] + ), + ).with_id("DataTransformer") + + # Add dependency from example_validator to transform. + transform.add_upstream_node(example_validator) + + # Get the latest model to warmstart + warmstart_model_resolver = Resolver( + strategy_class=latest_artifacts_resolver.LatestArtifactsResolver, + latest_model=Channel(type=standard_artifacts.Model), + ).with_id("WarmstartModelResolver") + + # Model training. + trainer = Trainer( + module_file=TRAIN_MODULE_FILE, + examples=transform.outputs["transformed_examples"], + schema=schema_importer.outputs["result"], + base_model=warmstart_model_resolver.outputs["latest_model"], + transform_graph=transform.outputs["transform_graph"], + hyperparameters=hyperparams_gen.outputs["hyperparameters"], + ).with_id("ModelTrainer") + + if config.TRAINING_RUNNER == "vertex": + trainer = VertexTrainer( + module_file=TRAIN_MODULE_FILE, + examples=transform.outputs["transformed_examples"], + schema=schema_importer.outputs["result"], + base_model=warmstart_model_resolver.outputs["latest_model"], + transform_graph=transform.outputs["transform_graph"], + hyperparameters=hyperparams_gen.outputs["hyperparameters"], + custom_config=config.VERTEX_TRAINING_CONFIG + ).with_id("ModelTrainer") + + + # Get the latest blessed model (baseline) for model validation. + baseline_model_resolver = Resolver( + strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver, + model=Channel(type=standard_artifacts.Model), + model_blessing=Channel(type=standard_artifacts.ModelBlessing), + ).with_id("BaselineModelResolver") + + # Prepare evaluation config. + eval_config = tfma.EvalConfig( + model_specs=[ + tfma.ModelSpec( + signature_name="serving_tf_example", + label_key=features.TARGET_FEATURE_NAME, + prediction_key="probabilities", + ) + ], + slicing_specs=[ + tfma.SlicingSpec(), + ], + metrics_specs=[ + tfma.MetricsSpec( + metrics=[ + tfma.MetricConfig(class_name="ExampleCount"), + tfma.MetricConfig( + class_name="BinaryAccuracy", + threshold=tfma.MetricThreshold( + value_threshold=tfma.GenericValueThreshold( + lower_bound={"value": float(config.ACCURACY_THRESHOLD)} + ), + # Change threshold will be ignored if there is no + # baseline model resolved from MLMD (first run). + change_threshold=tfma.GenericChangeThreshold( + direction=tfma.MetricDirection.HIGHER_IS_BETTER, + absolute={"value": -1e-10}, + ), + ), + ), + ] + ) + ], + ) + + # Model evaluation. 
+ evaluator = Evaluator( + examples=test_example_gen.outputs["examples"], + example_splits=["test"], + model=trainer.outputs["model"], + baseline_model=baseline_model_resolver.outputs["model"], + eval_config=eval_config, + schema=schema_importer.outputs["result"], + ).with_id("ModelEvaluator") + + exported_model_location = os.path.join( + config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME + ) + push_destination = pusher_pb2.PushDestination( + filesystem=pusher_pb2.PushDestination.Filesystem( + base_directory=exported_model_location + ) + ) + + # Push custom model to model registry. + gcs_pusher = Pusher( + model=trainer.outputs["model"], + model_blessing=evaluator.outputs["blessing"], + push_destination=push_destination, + ).with_id("GcsModelPusher") + + # Push to Vertex AI + labels = { + "dataset_name": config.VERTEX_DATASET_NAME[:62], + "pipeline_name": config.PIPELINE_NAME[:62], + "pipeline_root": pipeline_root[:62] + } + + labels = json.dumps(labels) + explanation_config = json.dumps(features.generate_explanation_config()) + + print(f"Labels for model: {labels}") + + # Custom implementation. + vertex_model_uploader = custom_components.vertex_model_uploader( + project=config.PROJECT, + region=config.REGION, + model_display_name=config.MODEL_DISPLAY_NAME, + pushed_model_location=exported_model_location, + serving_image_uri=config.SERVING_IMAGE_URI, + model_blessing=evaluator.outputs["blessing"], + explanation_config=explanation_config #, + #labels=labels # currently labels have values too long or too many keys or something else invalid + ).with_id("VertexUploader") + + pipeline_components = [ + hyperparams_gen, + train_example_gen, + test_example_gen, + statistics_gen, + schema_importer, + example_validator, + transform, + warmstart_model_resolver, + trainer, + baseline_model_resolver, + evaluator, + gcs_pusher, + ] + + if int(config.UPLOAD_MODEL): + pipeline_components.append(vertex_model_uploader) + # Add dependency from pusher to aip_model_uploader. 
+ vertex_model_uploader.add_upstream_node(gcs_pusher) + + logging.info( + f"Pipeline components: {[component.id for component in pipeline_components]}" + ) + + beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS + if config.BEAM_RUNNER == "DataflowRunner": + beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS + + logging.info(f"Beam pipeline args: {beam_pipeline_args}") + print(f"Beam pipeline args: {beam_pipeline_args}") + return pipeline.Pipeline( + pipeline_name=config.PIPELINE_NAME, + pipeline_root=pipeline_root, + components=pipeline_components, + beam_pipeline_args=beam_pipeline_args, + metadata_connection_config=metadata_connection_config, + enable_cache=int(config.ENABLE_CACHE), + ) diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/cidrs.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/cidrs.yaml new file mode 100644 index 0000000000..90dabfb6a7 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/cidrs.yaml @@ -0,0 +1,15 @@ +# skip boilerplate check + +healthchecks: + - 35.191.0.0/16 + - 130.211.0.0/22 + - 209.85.152.0/22 + - 209.85.204.0/22 + +rfc1918: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + +onprem_probes: + - 10.255.255.254/32 \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/hierarchical-policy-rules.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/hierarchical-policy-rules.yaml new file mode 100644 index 0000000000..6a3b313356 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/firewall-policies/hierarchical-policy-rules.yaml @@ -0,0 +1,50 @@ +# skip boilerplate check + +allow-admins: + description: Access from the admin subnet to all subnets + direction: INGRESS + action: allow + priority: 1000 + ranges: + - $rfc1918 + ports: + all: [] + target_resources: null + enable_logging: false + +allow-healthchecks: + description: Enable HTTP and HTTPS healthchecks + direction: INGRESS + action: allow + priority: 1001 + ranges: + - $healthchecks + ports: + tcp: ["80", "443"] + target_resources: null + enable_logging: false + +allow-ssh-from-iap: + description: Enable SSH from IAP + direction: INGRESS + action: allow + priority: 1002 + ranges: + - 35.235.240.0/20 + ports: + tcp: ["22"] + target_resources: null + enable_logging: false + +allow-icmp: + description: Enable ICMP + direction: INGRESS + action: allow + priority: 1003 + ranges: + - 0.0.0.0/0 + ports: + icmp: [] + target_resources: null + enable_logging: false + \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/compute.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/compute.yaml new file mode 100644 index 0000000000..a3f96b1b1c --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/compute.yaml @@ -0,0 +1,92 @@ +# skip boilerplate check +# +# sample subset of useful organization policies, edit to suit requirements + +compute.disableGuestAttributesAccess: + rules: + - enforce: true + +compute.requireOsLogin: + rules: + - enforce: true + +compute.restrictLoadBalancerCreationForTypes: + rules: + - allow: + values: + - in:INTERNAL + +compute.skipDefaultNetworkCreation: + rules: + - enforce: true + +compute.vmExternalIpAccess: + rules: + - deny: + all: true + + +# 
compute.disableInternetNetworkEndpointGroup: +# rules: +# - enforce: true + +# compute.disableNestedVirtualization: +# rules: +# - enforce: true + +# compute.disableSerialPortAccess: +# rules: +# - enforce: true + +# compute.restrictCloudNATUsage: +# rules: +# - deny: +# all: true + +# compute.restrictDedicatedInterconnectUsage: +# rules: +# - deny: +# all: true + +# compute.restrictPartnerInterconnectUsage: +# rules: +# - deny: +# all: true + +# compute.restrictProtocolForwardingCreationForTypes: +# rules: +# - deny: +# all: true + +# compute.restrictSharedVpcHostProjects: +# rules: +# - deny: +# all: true + +# compute.restrictSharedVpcSubnetworks: +# rules: +# - deny: +# all: true + +# compute.restrictVpcPeering: +# rules: +# - deny: +# all: true + +# compute.restrictVpnPeerIPs: +# rules: +# - deny: +# all: true + +# compute.restrictXpnProjectLienRemoval: +# rules: +# - enforce: true + +# compute.setNewProjectDefaultToZonalDNSOnly: +# rules: +# - enforce: true + +# compute.vmCanIpForward: +# rules: +# - deny: +# all: true diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/iam.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/iam.yaml new file mode 100644 index 0000000000..58e0032cb3 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/iam.yaml @@ -0,0 +1,15 @@ +# skip boilerplate check +# +# sample subset of useful organization policies, edit to suit requirements + +iam.automaticIamGrantsForDefaultServiceAccounts: + rules: + - enforce: true + +iam.disableServiceAccountKeyCreation: + rules: + - enforce: true + +iam.disableServiceAccountKeyUpload: + rules: + - enforce: true diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/serverless.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/serverless.yaml new file mode 100644 index 0000000000..3efb23cdee --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/serverless.yaml @@ -0,0 +1,31 @@ +# skip boilerplate check +# +# sample subset of useful organization policies, edit to suit requirements + +run.allowedIngress: + rules: + - allow: + values: + - is:internal + +# run.allowedVPCEgress: +# rules: +# - allow: +# values: +# - is:private-ranges-only + +# cloudfunctions.allowedIngressSettings: +# rules: +# - allow: +# values: +# - is:ALLOW_INTERNAL_ONLY + +# cloudfunctions.allowedVpcConnectorEgressSettings: +# rules: +# - allow: +# values: +# - is:PRIVATE_RANGES_ONLY + +# cloudfunctions.requireVPCConnector: +# rules: +# - enforce: true diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/sql.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/sql.yaml new file mode 100644 index 0000000000..0eee80453a --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/sql.yaml @@ -0,0 +1,11 @@ +# skip boilerplate check +# +# sample subset of useful organization policies, edit to suit requirements + +sql.restrictAuthorizedNetworks: + rules: + - enforce: true + +sql.restrictPublicIp: + rules: + - enforce: true diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/storage.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/storage.yaml new file mode 100644 index 0000000000..448357b8bc --- /dev/null +++ 
b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/org-policies/storage.yaml @@ -0,0 +1,7 @@ +# skip boilerplate check +# +# sample subset of useful organization policies, edit to suit requirements + +storage.uniformBucketLevelAccess: + rules: + - enforce: true diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/accessible-services.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/accessible-services.yaml new file mode 100644 index 0000000000..2107d2ff1e --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/accessible-services.yaml @@ -0,0 +1,119 @@ +# skip boilerplate check + +- accessapproval.googleapis.com +- adsdatahub.googleapis.com +- aiplatform.googleapis.com +- alloydb.googleapis.com +- alpha-documentai.googleapis.com +- analyticshub.googleapis.com +- apigee.googleapis.com +- apigeeconnect.googleapis.com +- artifactregistry.googleapis.com +- assuredworkloads.googleapis.com +- automl.googleapis.com +- baremetalsolution.googleapis.com +- batch.googleapis.com +- beyondcorp.googleapis.com +- bigquery.googleapis.com +- bigquerydatapolicy.googleapis.com +- bigquerydatatransfer.googleapis.com +- bigquerymigration.googleapis.com +- bigqueryreservation.googleapis.com +- bigtable.googleapis.com +- binaryauthorization.googleapis.com +- cloudasset.googleapis.com +- cloudbuild.googleapis.com +- clouddebugger.googleapis.com +- clouderrorreporting.googleapis.com +- cloudfunctions.googleapis.com +- cloudkms.googleapis.com +- cloudprofiler.googleapis.com +- cloudresourcemanager.googleapis.com +- cloudsearch.googleapis.com +- cloudtrace.googleapis.com +- composer.googleapis.com +- compute.googleapis.com +- connectgateway.googleapis.com +- contactcenterinsights.googleapis.com +- container.googleapis.com +- containeranalysis.googleapis.com +- containerfilesystem.googleapis.com +- containerregistry.googleapis.com +- containerthreatdetection.googleapis.com +- contentwarehouse.googleapis.com +- datacatalog.googleapis.com +- dataflow.googleapis.com +- datafusion.googleapis.com +- datalineage.googleapis.com +- datamigration.googleapis.com +- datapipelines.googleapis.com +- dataplex.googleapis.com +- dataproc.googleapis.com +- datastream.googleapis.com +- dialogflow.googleapis.com +- dlp.googleapis.com +- dns.googleapis.com +- documentai.googleapis.com +- domains.googleapis.com +- essentialcontacts.googleapis.com +- eventarc.googleapis.com +- file.googleapis.com +- firebaseappcheck.googleapis.com +- firebaserules.googleapis.com +- firestore.googleapis.com +- gameservices.googleapis.com +- gkebackup.googleapis.com +- gkeconnect.googleapis.com +- gkehub.googleapis.com +- gkemulticloud.googleapis.com +- healthcare.googleapis.com +- iam.googleapis.com +- iamcredentials.googleapis.com +- iaptunnel.googleapis.com +- ids.googleapis.com +- integrations.googleapis.com +- language.googleapis.com +- lifesciences.googleapis.com +- logging.googleapis.com +- managedidentities.googleapis.com +- memcache.googleapis.com +- meshca.googleapis.com +- metastore.googleapis.com +- ml.googleapis.com +- monitoring.googleapis.com +- networkconnectivity.googleapis.com +- networkmanagement.googleapis.com +- networksecurity.googleapis.com +- networkservices.googleapis.com +- notebooks.googleapis.com +- opsconfigmonitoring.googleapis.com +- osconfig.googleapis.com +- oslogin.googleapis.com +- policytroubleshooter.googleapis.com +- privateca.googleapis.com +- pubsub.googleapis.com +- pubsublite.googleapis.com +- 
recaptchaenterprise.googleapis.com +- recommender.googleapis.com +- redis.googleapis.com +- retail.googleapis.com +- run.googleapis.com +- secretmanager.googleapis.com +- servicecontrol.googleapis.com +- servicedirectory.googleapis.com +- spanner.googleapis.com +- speakerid.googleapis.com +- speech.googleapis.com +- sqladmin.googleapis.com +- storage.googleapis.com +- storagetransfer.googleapis.com +- texttospeech.googleapis.com +- tpu.googleapis.com +- trafficdirector.googleapis.com +- transcoder.googleapis.com +- translate.googleapis.com +- videointelligence.googleapis.com +- vision.googleapis.com +- visionai.googleapis.com +- vpcaccess.googleapis.com +- workstations.googleapis.com \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/restricted-services.yaml b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/restricted-services.yaml new file mode 100644 index 0000000000..2107d2ff1e --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/data/vpc-sc/restricted-services.yaml @@ -0,0 +1,119 @@ +# skip boilerplate check + +- accessapproval.googleapis.com +- adsdatahub.googleapis.com +- aiplatform.googleapis.com +- alloydb.googleapis.com +- alpha-documentai.googleapis.com +- analyticshub.googleapis.com +- apigee.googleapis.com +- apigeeconnect.googleapis.com +- artifactregistry.googleapis.com +- assuredworkloads.googleapis.com +- automl.googleapis.com +- baremetalsolution.googleapis.com +- batch.googleapis.com +- beyondcorp.googleapis.com +- bigquery.googleapis.com +- bigquerydatapolicy.googleapis.com +- bigquerydatatransfer.googleapis.com +- bigquerymigration.googleapis.com +- bigqueryreservation.googleapis.com +- bigtable.googleapis.com +- binaryauthorization.googleapis.com +- cloudasset.googleapis.com +- cloudbuild.googleapis.com +- clouddebugger.googleapis.com +- clouderrorreporting.googleapis.com +- cloudfunctions.googleapis.com +- cloudkms.googleapis.com +- cloudprofiler.googleapis.com +- cloudresourcemanager.googleapis.com +- cloudsearch.googleapis.com +- cloudtrace.googleapis.com +- composer.googleapis.com +- compute.googleapis.com +- connectgateway.googleapis.com +- contactcenterinsights.googleapis.com +- container.googleapis.com +- containeranalysis.googleapis.com +- containerfilesystem.googleapis.com +- containerregistry.googleapis.com +- containerthreatdetection.googleapis.com +- contentwarehouse.googleapis.com +- datacatalog.googleapis.com +- dataflow.googleapis.com +- datafusion.googleapis.com +- datalineage.googleapis.com +- datamigration.googleapis.com +- datapipelines.googleapis.com +- dataplex.googleapis.com +- dataproc.googleapis.com +- datastream.googleapis.com +- dialogflow.googleapis.com +- dlp.googleapis.com +- dns.googleapis.com +- documentai.googleapis.com +- domains.googleapis.com +- essentialcontacts.googleapis.com +- eventarc.googleapis.com +- file.googleapis.com +- firebaseappcheck.googleapis.com +- firebaserules.googleapis.com +- firestore.googleapis.com +- gameservices.googleapis.com +- gkebackup.googleapis.com +- gkeconnect.googleapis.com +- gkehub.googleapis.com +- gkemulticloud.googleapis.com +- healthcare.googleapis.com +- iam.googleapis.com +- iamcredentials.googleapis.com +- iaptunnel.googleapis.com +- ids.googleapis.com +- integrations.googleapis.com +- language.googleapis.com +- lifesciences.googleapis.com +- logging.googleapis.com +- managedidentities.googleapis.com +- memcache.googleapis.com +- meshca.googleapis.com +- 
metastore.googleapis.com +- ml.googleapis.com +- monitoring.googleapis.com +- networkconnectivity.googleapis.com +- networkmanagement.googleapis.com +- networksecurity.googleapis.com +- networkservices.googleapis.com +- notebooks.googleapis.com +- opsconfigmonitoring.googleapis.com +- osconfig.googleapis.com +- oslogin.googleapis.com +- policytroubleshooter.googleapis.com +- privateca.googleapis.com +- pubsub.googleapis.com +- pubsublite.googleapis.com +- recaptchaenterprise.googleapis.com +- recommender.googleapis.com +- redis.googleapis.com +- retail.googleapis.com +- run.googleapis.com +- secretmanager.googleapis.com +- servicecontrol.googleapis.com +- servicedirectory.googleapis.com +- spanner.googleapis.com +- speakerid.googleapis.com +- speech.googleapis.com +- sqladmin.googleapis.com +- storage.googleapis.com +- storagetransfer.googleapis.com +- texttospeech.googleapis.com +- tpu.googleapis.com +- trafficdirector.googleapis.com +- transcoder.googleapis.com +- translate.googleapis.com +- videointelligence.googleapis.com +- vision.googleapis.com +- visionai.googleapis.com +- vpcaccess.googleapis.com +- workstations.googleapis.com \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/main.tf b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/main.tf new file mode 100644 index 0000000000..39d39b1382 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/main.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +# tfdoc:file:description Vertex MLOps + +module "shielded-folder" { + source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//blueprints/data-solutions/shielded-folder" + access_policy_config = var.access_policy_config + enable_features = var.enable_features + folder_config = var.folder_config + kms_keys = var.kms_keys + log_locations = var.log_locations + organization = var.organization + prefix = var.prefix + project_config = var.project_config + vpc_sc_access_levels = var.vpc_sc_access_levels +} diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/providers.tf b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/providers.tf new file mode 100644 index 0000000000..4be271944e --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/providers.tf @@ -0,0 +1,13 @@ +terraform { + backend "gcs" { + bucket = "pcorp-iac-core-bucket" + prefix = "ShieldedFolder" + } +} + +provider "google" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} +provider "google-beta" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/terraform.tfvars b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/terraform.tfvars new file mode 100644 index 0000000000..1d38d63b39 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/terraform.tfvars @@ -0,0 +1,42 @@ +access_policy_config = { + access_policy_create = { + parent = "organizations/863496320224" + title = "ShieldedFolder" + } +} + +enable_features = { + encryption = true +} + +folder_config = { + folder_create = { + display_name = "ShieldedFolder" + parent = "folders/443543909887" #Dev + } +} + + +kms_keys = { + compute = { + locations = ["europe-west4"] + }, + +} + +organization = { + domain = "pcorp.joonix.net" + id = "863496320224" +} +prefix = "pcorp" +project_config = { + billing_account_id = "0189FA-E139FD-136A58" +} + +vpc_sc_access_levels = { + users = { + conditions = [ + { members = ["user:jgpuga@google.com"] } + ] + } +} diff --git a/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/variables.tf b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/variables.tf new file mode 100644 index 0000000000..8154e86220 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/00-shielded-folder/variables.tf @@ -0,0 +1,229 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Variables definition. + +variable "access_policy_config" { + description = "Provide 'access_policy_create' values if a folder scoped Access Policy creation is needed, uses existing 'policy_name' otherwise. Parent is in 'organizations/123456' format. Policy will be created scoped to the folder." 
+ type = object({ + policy_name = optional(string, null) + access_policy_create = optional(object({ + parent = string + title = string + }), null) + }) + nullable = false +} + +variable "data_dir" { + description = "Relative path for the folder storing configuration data." + type = string + default = "data" +} + +variable "enable_features" { + description = "Flag to enable features on the solution." + type = object({ + encryption = optional(bool, false) + log_sink = optional(bool, true) + vpc_sc = optional(bool, true) + }) + default = { + encryption = false + log_sink = true + vpc_sc = true + } +} + +variable "folder_config" { + description = "Provide 'folder_create' values if folder creation is needed, uses existing 'folder_id' otherwise. Parent is in 'folders/nnn' or 'organizations/nnn' format." + type = object({ + folder_id = optional(string, null) + folder_create = optional(object({ + display_name = string + parent = string + }), null) + }) + validation { + condition = var.folder_config.folder_id != null || var.folder_config.folder_create != null + error_message = "At least one attribute should be set." + } + nullable = false +} + +variable "groups" { + description = "User groups." + type = object({ + workload-engineers = optional(string, "gcp-data-engineers") + workload-security = optional(string, "gcp-data-security") + }) + default = {} + nullable = false +} + +variable "kms_keys" { + description = "KMS keys to create, keyed by name." + type = map(object({ + iam = optional(map(list(string)), {}) + labels = optional(map(string), {}) + locations = optional(list(string), ["global", "europe", "europe-west1"]) + rotation_period = optional(string, "7776000s") + })) + default = {} +} + +variable "log_locations" { + description = "Optional locations for GCS, BigQuery, and logging buckets created here." + type = object({ + bq = optional(string, "europe") + storage = optional(string, "europe") + logging = optional(string, "global") + pubsub = optional(string, "global") + }) + default = { + bq = "europe" + storage = "europe" + logging = "global" + pubsub = null + } + nullable = false +} + +variable "log_sinks" { + description = "Org-level log sinks, in name => {type, filter} format." + type = map(object({ + filter = string + type = string + })) + default = { + audit-logs = { + filter = "logName:\"/logs/cloudaudit.googleapis.com%2Factivity\" OR logName:\"/logs/cloudaudit.googleapis.com%2Fsystem_event\"" + type = "bigquery" + } + vpc-sc = { + filter = "protoPayload.metadata.@type=\"type.googleapis.com/google.cloud.audit.VpcServiceControlAuditMetadata\"" + type = "bigquery" + } + } + validation { + condition = alltrue([ + for k, v in var.log_sinks : + contains(["bigquery", "logging", "pubsub", "storage"], v.type) + ]) + error_message = "Type must be one of 'bigquery', 'logging', 'pubsub', 'storage'." + } +} + +variable "organization" { + description = "Organization details." + type = object({ + domain = string + id = string + }) +} + +variable "prefix" { + description = "Prefix used for resources that need unique names." + type = string +} + +variable "project_config" { + description = "Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format." 
+ type = object({ + billing_account_id = optional(string, null) + project_ids = optional(object({ + sec-core = string + audit-logs = string + }), { + sec-core = "sec-core" + audit-logs = "audit-logs" + } + ) + }) + nullable = false + validation { + condition = var.project_config.billing_account_id != null || var.project_config.project_ids != null + error_message = "At least one attribute should be set." + } +} + +variable "vpc_sc_access_levels" { + description = "VPC SC access level definitions." + type = map(object({ + combining_function = optional(string) + conditions = optional(list(object({ + device_policy = optional(object({ + allowed_device_management_levels = optional(list(string)) + allowed_encryption_statuses = optional(list(string)) + require_admin_approval = bool + require_corp_owned = bool + require_screen_lock = optional(bool) + os_constraints = optional(list(object({ + os_type = string + minimum_version = optional(string) + require_verified_chrome_os = optional(bool) + }))) + })) + ip_subnetworks = optional(list(string), []) + members = optional(list(string), []) + negate = optional(bool) + regions = optional(list(string), []) + required_access_levels = optional(list(string), []) + })), []) + description = optional(string) + })) + default = {} + nullable = false +} + +variable "vpc_sc_egress_policies" { + description = "VPC SC egress policy definitions." + type = map(object({ + from = object({ + identity_type = optional(string, "ANY_IDENTITY") + identities = optional(list(string)) + }) + to = object({ + operations = optional(list(object({ + method_selectors = optional(list(string)) + service_name = string + })), []) + resources = optional(list(string)) + resource_type_external = optional(bool, false) + }) + })) + default = {} + nullable = false +} + +variable "vpc_sc_ingress_policies" { + description = "VPC SC ingress policy definitions." + type = map(object({ + from = object({ + access_levels = optional(list(string), []) + identity_type = optional(string) + identities = optional(list(string)) + resources = optional(list(string), []) + }) + to = object({ + operations = optional(list(object({ + method_selectors = optional(list(string)) + service_name = string + })), []) + resources = optional(list(string)) + }) + })) + default = {} + nullable = false +} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/01-dev/main.tf b/examples/vertex_mlops_enterprise/terraform/01-dev/main.tf new file mode 100644 index 0000000000..0df2fa0ce6 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/01-dev/main.tf @@ -0,0 +1,52 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +# tfdoc:file:description Vertex MLOps + +locals { + bucket_name = "${var.bucket_name}-${var.environment}" + env_label = { + "env" : "${var.environment}" + } + labels = merge(local.env_label, var.labels) + + github = { + organization = var.github.organization + repo = var.github.repo + branch = var.environment + } + + identity_pool_claims = try("attribute.repository/${var.github.organization}/${var.github.repo}", null) + + project_config = { + billing_account_id = var.project_config.billing_account_id + parent = var.project_config.parent + project_id = "${var.project_config.project_id}-${var.environment}" + } + +} +module "mlops" { + source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//blueprints/data-solutions/vertex-mlops" + project_config = local.project_config + prefix = var.prefix + bucket_name = local.bucket_name + dataset_name = var.dataset_name + groups = var.groups + identity_pool_claims = local.identity_pool_claims + labels = local.labels + notebooks = var.notebooks + service_encryption_keys = var.service_encryption_keys +} diff --git a/examples/vertex_mlops_enterprise/terraform/01-dev/outputs.tf b/examples/vertex_mlops_enterprise/terraform/01-dev/outputs.tf new file mode 100644 index 0000000000..936ab1a63a --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/01-dev/outputs.tf @@ -0,0 +1,208 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + +locals { + mainconfig_yaml = templatefile("${path.module}/../../mainconfig.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK + }) + + gh_containers_yaml = templatefile("${path.module}/../../.github/workflows/containers.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_main_tfx_yaml = templatefile("${path.module}/../../.github/workflows/main.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO, + environment = var.environment, + framework = "tfx" + }) + + gh_main_kfp_yaml = templatefile("${path.module}/../../.github/workflows/main.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO, + environment = var.environment, + framework = "kfp" + }) + + gh_run_tfx_yaml = templatefile("${path.module}/../../.github/workflows/run.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO, + environment = var.environment, + framework = "tfx" + }) + + gh_run_kfp_yaml = templatefile("${path.module}/../../.github/workflows/run.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO, + environment = var.environment, + framework = "kfp" + }) + + gh_deploy_yaml = templatefile("${path.module}/../../.github/workflows/deploy.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO, + environment = var.environment + }) + + pipeline_deploy_tfx = templatefile("${path.module}/../../build/pipeline-deployment-tfx.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK, + bucket_name = "${var.prefix}-${var.bucket_name}-${var.environment}" + }) + + pipeline_deploy_kfp = templatefile("${path.module}/../../build/pipeline-deployment-kfp.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + dataflow_network = "regions/europe-west4/subnetworks/subnet-europe-west4", + subnetwork = 
module.mlops.github.SUBNETWORK, + bucket_name = "${var.prefix}-${var.bucket_name}-${var.environment}" + }) + + pipeline_run_tfx = templatefile("${path.module}/../../build/pipeline-run.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + sa_mlops = module.mlops.github.SA_MLOPS, + bucket_name = "${var.prefix}-${var.bucket_name}-${var.environment}", + pipeline_name = "creditcards-classifier-v02-train-pipeline", + pipeline_params = "\"{\\\"num_epochs\\\": 7, \\\"learning_rate\\\": 0.0015, \\\"batch_size\\\": 512, \\\"steps_per_epoch\\\": 9, \\\"hidden_units\\\": \\\"256,126\\\"}\"" + }) + + pipeline_run_kfp = templatefile("${path.module}/../../build/pipeline-run.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + sa_mlops = module.mlops.github.SA_MLOPS, + bucket_name = "${var.prefix}-${var.bucket_name}-${var.environment}", + pipeline_name = "creditcards-classifier-kfp-train", + pipeline_params = "{\"bq_table\": \"${module.mlops.github.PROJECT_ID}.${var.dataset_name}.creditcards_ml\", \"xgboost_param_max_depth\": 5, \"xgboost_param_learning_rate\": 0.1, \"xgboost_param_n_estimators\": 20}" + }) + + + model_deployment = templatefile("${path.module}/../../build/model-deployment.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + }) +} + + +resource "local_file" "mainconfig_yml" { + filename = "${path.module}/../../mainconfig.yaml" + content = local.mainconfig_yaml +} + +resource "local_file" "containers_yml" { + filename = "${path.module}/../../.github/workflows/containers.yml" + content = local.gh_containers_yaml +} + +resource "local_file" "main_tfx_yml" { + filename = "${path.module}/../../.github/workflows/main-tfx.yml" + content = local.gh_main_tfx_yaml +} + +resource "local_file" "main_kfp_yml" { + filename = "${path.module}/../../.github/workflows/main-kfp.yml" + content = local.gh_main_kfp_yaml +} + +resource "local_file" "run_tfx_yml" { + filename = "${path.module}/../../.github/workflows/run-tfx.yml" + content = local.gh_run_tfx_yaml +} + +resource "local_file" "run_kfp_yml" { + filename = "${path.module}/../../.github/workflows/run-kfp.yml" + content = local.gh_run_kfp_yaml +} + +resource "local_file" "deploy_yml" { + filename = "${path.module}/../../.github/workflows/deploy.yml" + content = local.gh_deploy_yaml +} + +resource "local_file" "deployment_tfx_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-deployment-tfx.yaml" + content = local.pipeline_deploy_tfx +} + +resource "local_file" "deployment_kfp_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-deployment-kfp.yaml" + content = local.pipeline_deploy_kfp +} + +resource "local_file" "pipeline_run_tfx_ml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-run-tfx.yaml" + content = local.pipeline_run_tfx +} + +resource "local_file" "pipeline_run_kfp_ml" { + filename = 
"${path.module}/../../build/${var.environment}/pipeline-run-kfp.yaml" + content = local.pipeline_run_kfp +} + +resource "local_file" "model_deploy_yml" { + filename = "${path.module}/../../build/${var.environment}/model-deployment.yaml" + content = local.model_deployment +} + +output "mlops" { + description = "Created project, service accounts and associates resources." + value = module.mlops +} diff --git a/examples/vertex_mlops_enterprise/terraform/01-dev/terraform.tfvars.sample b/examples/vertex_mlops_enterprise/terraform/01-dev/terraform.tfvars.sample new file mode 100644 index 0000000000..b4a15f40a8 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/01-dev/terraform.tfvars.sample @@ -0,0 +1,43 @@ +bucket_name = "creditcards" # -env will be added as suffix +dataset_name = "creditcards" +environment = "dev" +groups = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null +} + +# env will be added as branch name +github = { + organization = "GITHUB_ORG" + repo = "GITHUB_REPO" +} + +# Additional labels. env label will be added automatically +labels = { + "team" : "ml" +} + +notebooks = { + "nb" : { + type = "USER_MANAGED" + }, +} + +prefix = "myprefix" +project_config = { + billing_account_id = "000000-111111-222222" # Use only billing BA if it is required to create the project + parent = "folders/123456789012" + project_id = "creditcards" # -env will be added as suffix +} +region = "europe-west4" + + +service_encryption_keys = { + aiplatform = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + bq = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/bq" + notebooks = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + secretmanager = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + storage = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" +} + diff --git a/examples/vertex_mlops_enterprise/terraform/01-dev/variables.tf b/examples/vertex_mlops_enterprise/terraform/01-dev/variables.tf new file mode 100644 index 0000000000..4580d223bd --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/01-dev/variables.tf @@ -0,0 +1,114 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "bucket_name" { + description = "Create GCS Bucket." + type = string + default = null +} + +variable "dataset_name" { + description = "Create BigQuery Datasets." + type = string + default = null +} + +variable "environment" { + description = "Environment prefix that will be used when creating some resources and outputs (Github Actions, build files, etc.): dev, stg, prod" + type = string +} + +variable "groups" { + description = "Name of the groups (name@domain.org) to apply IAM permissions." 
+ type = object({ + gcp-ml-ds = string + gcp-ml-eng = string + gcp-ml-viewer = string + }) + default = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null + } + nullable = false +} + +variable "github" { + description = "GitHub organization and repo, e.g. https://github.com/ORGANIZATION/REPO" + type = object({ + organization = string + repo = string + }) + default = null +} + +variable "labels" { + description = "Labels to be assigned at project level." + type = map(string) + default = {} +} + +variable "notebooks" { + description = "Vertex AI Workbench instances to be deployed." + type = any + default = {} +} + +variable "prefix" { + description = "Prefix used for the project id." + type = string + default = null +} + +variable "project_config" { + description = "Provide 'billing_account_id' value if project creation is needed, uses existing 'project_id' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format." + type = object({ + billing_account_id = optional(string) + parent = optional(string) + project_id = string + }) + validation { + condition = var.project_config.project_id != null + error_message = "Project id must be set." + } + nullable = false +} + +variable "region" { + description = "Region used for regional resources." + type = string + default = "europe-west4" +} + +variable "sa_mlops_name" { + description = "Name for the MLOps Service Account." + type = string + default = "sa-mlops" +} + +variable "service_encryption_keys" { + description = "Cloud KMS keys to use to encrypt different services. Key location should match service region." + type = object({ + aiplatform = optional(string) + bq = optional(string) + notebooks = optional(string) + secretmanager = optional(string) + storage = optional(string) + }) + default = {} + nullable = false +} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/02-staging/main.tf b/examples/vertex_mlops_enterprise/terraform/02-staging/main.tf new file mode 100644 index 0000000000..37743f7dce --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/02-staging/main.tf @@ -0,0 +1,52 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +# tfdoc:file:description Vertex MLOps + +locals { + bucket_name = "${var.bucket_name}-${var.environment}" + env_label = { + "env" : "${var.environment}" + } + labels = merge(local.env_label, var.labels) + + github = { + organization = var.github.organization + repo = var.github.repo + branch = var.environment + } + + identity_pool_claims = try("attribute.repository/${var.github.organization}/${var.github.repo}", null) + + project_config = { + billing_account_id = var.project_config.billing_account_id + parent = var.project_config.parent + project_id = "${var.project_config.project_id}-${var.environment}" + } + +} +module "mlops" { + source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//blueprints/data-solutions/vertex-mlops" + project_config = local.project_config + prefix = var.prefix + bucket_name = local.bucket_name + dataset_name = var.dataset_name + groups = var.groups + identity_pool_claims = local.identity_pool_claims + labels = local.labels + notebooks = var.notebooks + service_encryption_keys = var.service_encryption_keys +} diff --git a/examples/vertex_mlops_enterprise/terraform/02-staging/outputs.tf b/examples/vertex_mlops_enterprise/terraform/02-staging/outputs.tf new file mode 100644 index 0000000000..633ea98199 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/02-staging/outputs.tf @@ -0,0 +1,144 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + +locals { + mainconfig_yaml = templatefile("${path.module}/../../mainconfig.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK + }) + + gh_containers_yaml = templatefile("${path.module}/../../.github/workflows/containers.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_main_yaml = templatefile("${path.module}/../../.github/workflows/main.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_run_yaml = templatefile("${path.module}/../../.github/workflows/run.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_deploy_yaml = templatefile("${path.module}/../../.github/workflows/deploy.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + pipeline_deploy = templatefile("${path.module}/../../build/pipeline-deployment.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK + }) + + pipeline_run = templatefile("${path.module}/../../build/pipeline-run.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + sa_mlops = module.mlops.github.SA_MLOPS, + }) + + + model_deployment = templatefile("${path.module}/../../build/model-deployment.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + }) +} + + +resource "local_file" "mainconfig_yml" { + filename = "${path.module}/../../mainconfig.yaml" + content = local.mainconfig_yaml +} + + +resource "local_file" "containers_yml" { + filename = "${path.module}/../../.github/workflows/containers.yml" + content = local.gh_containers_yaml +} + + + +resource "local_file" "main_yml" { + filename = "${path.module}/../../.github/workflows/main.yml" + content = local.gh_main_yaml +} + +resource "local_file" "run_yml" { + filename = "${path.module}/../../.github/workflows/run.yml" + content = local.gh_run_yaml +} + 
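+# Note: each of the locals above is produced with templatefile(), which reads a
+# *.TEMPLATE file from this repo and substitutes values exported by module.mlops
+# (for example the workload identity provider and service account used by the
+# GitHub Actions workflows); the local_file resources in this file then write the
+# rendered documents back into the repo. As a rough sketch of the mechanism only
+# (the template name and placeholders below are illustrative assumptions, not the
+# actual template contents):
+#
+#   locals {
+#     example_rendered = templatefile("${path.module}/example.yml.TEMPLATE", {
+#       wip = module.mlops.github.WORKLOAD_ID_PROVIDER
+#       sa  = module.mlops.github.SERVICE_ACCOUNT
+#     })
+#   }
+#
+#   # where example.yml.TEMPLATE references the variables as ${wip} and ${sa}.
+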
+resource "local_file" "deploy_yml" { + filename = "${path.module}/../../.github/workflows/deploy.yml" + content = local.gh_deploy_yaml +} + +resource "local_file" "deployment_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-deployment.yaml" + content = local.pipeline_deploy +} + + +resource "local_file" "pipeline_run_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-run.yaml" + content = local.pipeline_run +} + +resource "local_file" "model_deploy_yml" { + filename = "${path.module}/../../build/${var.environment}/model-deployment.yaml" + content = local.model_deployment +} + + +output "mlops" { + description = "Created project, service accounts and associates resources." + value = module.mlops +} diff --git a/examples/vertex_mlops_enterprise/terraform/02-staging/providers.tf b/examples/vertex_mlops_enterprise/terraform/02-staging/providers.tf new file mode 100644 index 0000000000..e6511a281b --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/02-staging/providers.tf @@ -0,0 +1,13 @@ +terraform { + backend "gcs" { + bucket = "pcorp-iac-core-bucket" + prefix = "mlops1" + } +} + +provider "google" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} +provider "google-beta" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} diff --git a/examples/vertex_mlops_enterprise/terraform/02-staging/terraform.tfvars.sample b/examples/vertex_mlops_enterprise/terraform/02-staging/terraform.tfvars.sample new file mode 100644 index 0000000000..6e20ecea11 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/02-staging/terraform.tfvars.sample @@ -0,0 +1,37 @@ +bucket_name = "creditcards" +dataset_name = "creditcards" +environment = "stg" +groups = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null +} + +# env will be added as branch name +github = { + organization = "GITHUB_ORG" + repo = "GITHUB_REPO" +} + +# Additional labels. env label will be added automatically +labels = { + "team" : "ml" +} + +prefix = "myprefix" +project_config = { + billing_account_id = "000000-111111-222222" # Use only billing BA if it is required to create the project + parent = "folders/123456789012" + project_id = "creditcards" # -env will be added as suffix +} +region = "europe-west4" + + +service_encryption_keys = { + aiplatform = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + bq = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/bq" + notebooks = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + secretmanager = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + storage = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" +} + diff --git a/examples/vertex_mlops_enterprise/terraform/02-staging/variables.tf b/examples/vertex_mlops_enterprise/terraform/02-staging/variables.tf new file mode 100644 index 0000000000..82b4d85d03 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/02-staging/variables.tf @@ -0,0 +1,114 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "bucket_name" { + description = "Create GCS Bucket." + type = string + default = null +} + +variable "dataset_name" { + description = "Create BigQuery Datasets." + type = string + default = null +} + +variable "environment" { + description = "Environment prefix that will be used when creating some resources and outputs (Github Actions, build files, etc.): dev, stg, prod" + type = string +} + +variable "groups" { + description = "Name of the groups (name@domain.org) to apply IAM permissions." + type = object({ + gcp-ml-ds = string + gcp-ml-eng = string + gcp-ml-viewer = string + }) + default = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null + } + nullable = false +} + +variable "github" { + description = "GitHub organization and repo, e.g. https://github.com/ORGANIZATION/REPO" + type = object({ + organization = string + repo = string + }) + default = null +} + +variable "labels" { + description = "Labels to be assigned at project level." + type = map(string) + default = {} +} + +variable "notebooks" { + description = "Vertex AI Workbench instances to be deployed." + type = any + default = {} +} + +variable "prefix" { + description = "Prefix used for the project id." + type = string + default = null +} + +variable "project_config" { + description = "Provide 'billing_account_id' value if project creation is needed, uses existing 'project_id' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format." + type = object({ + billing_account_id = optional(string) + parent = optional(string) + project_id = string + }) + validation { + condition = var.project_config.project_id != null + error_message = "Project id must be set." + } + nullable = false +} + +variable "region" { + description = "Region used for regional resources." + type = string + default = "europe-west4" +} + +variable "sa_mlops_name" { + description = "Name for the MLOps Service Account." + type = string + default = "sa-mlops" +} + +variable "service_encryption_keys" { + description = "Cloud KMS keys to use to encrypt different services. Key location should match service region." + type = object({ + aiplatform = optional(string) + bq = optional(string) + notebooks = optional(string) + secretmanager = optional(string) + storage = optional(string) + }) + default = {} + nullable = false +} \ No newline at end of file diff --git a/examples/vertex_mlops_enterprise/terraform/03-prod/main.tf b/examples/vertex_mlops_enterprise/terraform/03-prod/main.tf new file mode 100644 index 0000000000..37743f7dce --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/03-prod/main.tf @@ -0,0 +1,52 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# tfdoc:file:description Vertex MLOps + +locals { + bucket_name = "${var.bucket_name}-${var.environment}" + env_label = { + "env" : "${var.environment}" + } + labels = merge(local.env_label, var.labels) + + github = { + organization = var.github.organization + repo = var.github.repo + branch = var.environment + } + + identity_pool_claims = try("attribute.repository/${var.github.organization}/${var.github.repo}", null) + + project_config = { + billing_account_id = var.project_config.billing_account_id + parent = var.project_config.parent + project_id = "${var.project_config.project_id}-${var.environment}" + } + +} +module "mlops" { + source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//blueprints/data-solutions/vertex-mlops" + project_config = local.project_config + prefix = var.prefix + bucket_name = local.bucket_name + dataset_name = var.dataset_name + groups = var.groups + identity_pool_claims = local.identity_pool_claims + labels = local.labels + notebooks = var.notebooks + service_encryption_keys = var.service_encryption_keys +} diff --git a/examples/vertex_mlops_enterprise/terraform/03-prod/outputs.tf b/examples/vertex_mlops_enterprise/terraform/03-prod/outputs.tf new file mode 100644 index 0000000000..633ea98199 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/03-prod/outputs.tf @@ -0,0 +1,144 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + +locals { + mainconfig_yaml = templatefile("${path.module}/../../mainconfig.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK + }) + + gh_containers_yaml = templatefile("${path.module}/../../.github/workflows/containers.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_main_yaml = templatefile("${path.module}/../../.github/workflows/main.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_run_yaml = templatefile("${path.module}/../../.github/workflows/run.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + gh_deploy_yaml = templatefile("${path.module}/../../.github/workflows/deploy.yml.TEMPLATE", { + wip = module.mlops.github.WORKLOAD_ID_PROVIDER, + project_id = module.mlops.github.PROJECT_ID, + sa = module.mlops.github.SERVICE_ACCOUNT, + docker_repo = module.mlops.github.DOCKER_REPO + environment = var.environment + }) + + pipeline_deploy = templatefile("${path.module}/../../build/pipeline-deployment.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + subnetwork = module.mlops.github.SUBNETWORK + }) + + pipeline_run = templatefile("${path.module}/../../build/pipeline-run.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + sa_mlops = module.mlops.github.SA_MLOPS, + }) + + + model_deployment = templatefile("${path.module}/../../build/model-deployment.yaml.TEMPLATE", { + project_id = module.mlops.github.PROJECT_ID, + region = var.region, + github_org = try(var.github.organization, null), + github_repo = try(var.github.repo, null), + github_branch = try(local.github.branch, null), + docker_repo = module.mlops.github.DOCKER_REPO, + sa_mlops = module.mlops.github.SA_MLOPS, + }) +} + + +resource "local_file" "mainconfig_yml" { + filename = "${path.module}/../../mainconfig.yaml" + content = local.mainconfig_yaml +} + + +resource "local_file" "containers_yml" { + filename = "${path.module}/../../.github/workflows/containers.yml" + content = local.gh_containers_yaml +} + + + +resource "local_file" "main_yml" { + filename = "${path.module}/../../.github/workflows/main.yml" + content = local.gh_main_yaml +} + +resource "local_file" "run_yml" { + filename = "${path.module}/../../.github/workflows/run.yml" + content = local.gh_run_yaml +} + 
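+# Note: the rendered workflow and build documents are written with one local_file
+# resource per file. A possible consolidation (a sketch only, assuming every
+# rendered document keeps the repo-relative layout used in this file) would drive
+# them from a single map with for_each, for example:
+#
+#   resource "local_file" "rendered" {
+#     for_each = {
+#       ".github/workflows/deploy.yml"                   = local.gh_deploy_yaml
+#       "build/${var.environment}/pipeline-run.yaml"     = local.pipeline_run
+#       "build/${var.environment}/model-deployment.yaml" = local.model_deployment
+#     }
+#     filename = "${path.module}/../../${each.key}"
+#     content  = each.value
+#   }
+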
+resource "local_file" "deploy_yml" { + filename = "${path.module}/../../.github/workflows/deploy.yml" + content = local.gh_deploy_yaml +} + +resource "local_file" "deployment_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-deployment.yaml" + content = local.pipeline_deploy +} + + +resource "local_file" "pipeline_run_yml" { + filename = "${path.module}/../../build/${var.environment}/pipeline-run.yaml" + content = local.pipeline_run +} + +resource "local_file" "model_deploy_yml" { + filename = "${path.module}/../../build/${var.environment}/model-deployment.yaml" + content = local.model_deployment +} + + +output "mlops" { + description = "Created project, service accounts and associates resources." + value = module.mlops +} diff --git a/examples/vertex_mlops_enterprise/terraform/03-prod/providers.tf b/examples/vertex_mlops_enterprise/terraform/03-prod/providers.tf new file mode 100644 index 0000000000..e6511a281b --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/03-prod/providers.tf @@ -0,0 +1,13 @@ +terraform { + backend "gcs" { + bucket = "pcorp-iac-core-bucket" + prefix = "mlops1" + } +} + +provider "google" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} +provider "google-beta" { + impersonate_service_account = "pcorp-iac-core@pcorp-iac-core.iam.gserviceaccount.com" +} diff --git a/examples/vertex_mlops_enterprise/terraform/03-prod/terraform.tfvars.sample b/examples/vertex_mlops_enterprise/terraform/03-prod/terraform.tfvars.sample new file mode 100644 index 0000000000..fa5dd769db --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/03-prod/terraform.tfvars.sample @@ -0,0 +1,37 @@ +bucket_name = "creditcards-dev" +dataset_name = "creditcards" +environment = "prd" +groups = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null +} + +# env will be added as branch name +github = { + organization = "GITHUB_ORG" + repo = "GITHUB_REPO" +} + +# Additional labels. env label will be added automatically +labels = { + "team" : "ml" +} + +prefix = "myprefix" +project_config = { + billing_account_id = "000000-111111-222222" # Use only billing BA if it is required to create the project + parent = "folders/123456789012" + project_id = "creditcards" # -env will be added as suffix +} +region = "europe-west4" + + +service_encryption_keys = { + aiplatform = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + bq = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/bq" + notebooks = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + secretmanager = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" + storage = "projects/CMEK_PROJECT_ID/locations/europe-west4/keyRings/europe-west4/cryptoKeys/storage" +} + diff --git a/examples/vertex_mlops_enterprise/terraform/03-prod/variables.tf b/examples/vertex_mlops_enterprise/terraform/03-prod/variables.tf new file mode 100644 index 0000000000..82b4d85d03 --- /dev/null +++ b/examples/vertex_mlops_enterprise/terraform/03-prod/variables.tf @@ -0,0 +1,114 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "bucket_name" { + description = "Create GCS Bucket." + type = string + default = null +} + +variable "dataset_name" { + description = "Create BigQuery Datasets." + type = string + default = null +} + +variable "environment" { + description = "Environment prefix that will be used when creating some resources and outputs (Github Actions, build files, etc.): dev, stg, prod" + type = string +} + +variable "groups" { + description = "Name of the groups (name@domain.org) to apply IAM permissions." + type = object({ + gcp-ml-ds = string + gcp-ml-eng = string + gcp-ml-viewer = string + }) + default = { + gcp-ml-ds = null + gcp-ml-eng = null + gcp-ml-viewer = null + } + nullable = false +} + +variable "github" { + description = "GitHub organization and repo, e.g. https://github.com/ORGANIZATION/REPO" + type = object({ + organization = string + repo = string + }) + default = null +} + +variable "labels" { + description = "Labels to be assigned at project level." + type = map(string) + default = {} +} + +variable "notebooks" { + description = "Vertex AI Workbench instances to be deployed." + type = any + default = {} +} + +variable "prefix" { + description = "Prefix used for the project id." + type = string + default = null +} + +variable "project_config" { + description = "Provide 'billing_account_id' value if project creation is needed, uses existing 'project_id' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format." + type = object({ + billing_account_id = optional(string) + parent = optional(string) + project_id = string + }) + validation { + condition = var.project_config.project_id != null + error_message = "Project id must be set." + } + nullable = false +} + +variable "region" { + description = "Region used for regional resources." + type = string + default = "europe-west4" +} + +variable "sa_mlops_name" { + description = "Name for the MLOps Service Account." + type = string + default = "sa-mlops" +} + +variable "service_encryption_keys" { + description = "Cloud KMS keys to use to encrypt different services. Key location should match service region."
+ type = object({ + aiplatform = optional(string) + bq = optional(string) + notebooks = optional(string) + secretmanager = optional(string) + storage = optional(string) + }) + default = {} + nullable = false +} \ No newline at end of file diff --git a/tools/bq-visualizer/package-lock.json b/tools/bq-visualizer/package-lock.json index 326b90eab9..c33ce7d771 100644 --- a/tools/bq-visualizer/package-lock.json +++ b/tools/bq-visualizer/package-lock.json @@ -3318,9 +3318,9 @@ } }, "node_modules/@babel/traverse": { - "version": "7.23.0", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.0.tgz", - "integrity": "sha512-t/QaEvyIoIkwzpiZ7aoSKK8kObQYeF7T2v+dazAYCb8SXtp58zEVkWW7zAnju8FNKNdr4ScAOEDmMItbyOmEYw==", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", "dev": true, "dependencies": { "@babel/code-frame": "^7.22.13", @@ -21424,9 +21424,9 @@ } }, "@babel/traverse": { - "version": "7.23.0", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.0.tgz", - "integrity": "sha512-t/QaEvyIoIkwzpiZ7aoSKK8kObQYeF7T2v+dazAYCb8SXtp58zEVkWW7zAnju8FNKNdr4ScAOEDmMItbyOmEYw==", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", "dev": true, "requires": { "@babel/code-frame": "^7.22.13",