package analytics

import static org.edx.jenkins.dsl.AnalyticsConstants.common_log_rotator
import static org.edx.jenkins.dsl.AnalyticsConstants.common_publishers
import static org.edx.jenkins.dsl.AnalyticsConstants.common_triggers
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm_parameters


class DetectNewDBTManualModelsFields {
    // Defines the "detect-new-dbt-manual-models-fields" Jenkins job. The job scans
    // raw-schema tables for columns that have not yet been added by hand to the
    // safe-schema dbt models, and reports them (via a JIRA webhook, per the build
    // script this job runs).
    public static def job = { dslFactory, allVars ->
        dslFactory.job("detect-new-dbt-manual-models-fields") {
            // Extra vars may set DISABLED=true to switch the job off without deleting it.
            disabled(allVars.get('DISABLED', false))
            description("This job detects new columns in tables in raw schemas that have yet to be manually added to safe schema models.")
            // Log rotation comes from the shared analytics helper, if configured.
            logRotator common_log_rotator(allVars)
            // Standard analytics-secure repo/branch parameters.
            parameters secure_scm_parameters(allVars)
            // Job-specific parameters: where to fetch analytics-tools from, and who to notify.
            parameters {
                stringParam('ANALYTICS_TOOLS_URL', allVars.get('ANALYTICS_TOOLS_URL'), 'URL for the analytics tools repo.')
                stringParam('ANALYTICS_TOOLS_BRANCH', allVars.get('ANALYTICS_TOOLS_BRANCH'), 'Branch of analytics tools repo to use.')
                stringParam('NOTIFY', allVars.get('NOTIFY', '$PAGER_NOTIFY'), 'Space separated list of emails to send notifications to.')
            }
            // Vault KV paths/versions the build script reads to locate its secrets.
            environmentVariables {
                env('JIRA_WEBHOOK_VAULT_KV_PATH', allVars.get('JIRA_WEBHOOK_VAULT_KV_PATH'))
                env('JIRA_WEBHOOK_VAULT_KV_VERSION', allVars.get('JIRA_WEBHOOK_VAULT_KV_VERSION'))
                env('AUTOMATION_TASK_VAULT_KV_PATH', allVars.get('AUTOMATION_TASK_VAULT_KV_PATH'))
                env('AUTOMATION_TASK_VAULT_KV_VERSION', allVars.get('AUTOMATION_TASK_VAULT_KV_VERSION'))
            }
            // Check out analytics-secure (shared helper) alongside analytics-tools.
            multiscm secure_scm(allVars) << {
                git {
                    remote {
                        url('$ANALYTICS_TOOLS_URL')
                        branch('$ANALYTICS_TOOLS_BRANCH')
                        credentials('1')
                    }
                    extensions {
                        relativeTargetDirectory('analytics-tools')
                        pruneBranches()
                        cleanAfterCheckout()
                    }
                }
            }
            wrappers {
                colorizeOutput('xterm')
                timestamps()
                // Bind the Vault AppRole credentials into the build environment.
                credentialsBinding {
                    usernamePassword('ANALYTICS_VAULT_ROLE_ID', 'ANALYTICS_VAULT_SECRET_ID', 'analytics-vault')
                }
            }
            // Cron schedule and failure notifications come from the shared helpers.
            triggers common_triggers(allVars)
            publishers common_publishers(allVars)
            steps {
                shell(dslFactory.readFileFromWorkspace('dataeng/resources/detect-new-dbt-manual-models-fields.sh'))
            }
        }
    }
}
#!/usr/bin/env bash
# Detect raw-schema columns missing from manually-maintained safe-schema dbt
# models, and report them through a JIRA webhook. Runs inside the
# detect-new-dbt-manual-models-fields Jenkins job.
set -ex

# Set up an isolated Python 3.8 virtual environment.
PYTHON38_VENV="py38_venv"
virtualenv --python=python3.8 --clear "${PYTHON38_VENV}"
source "${PYTHON38_VENV}/bin/activate"

# Go into analytics-tools and install the dependencies.
cd "${WORKSPACE}/analytics-tools/snowflake"
make requirements

# Remove the on-disk private-key material no matter how this script exits
# (rm -f, not -rf: these are plain files). Previously cleanup only ran on the
# success path, leaving key files in the workspace when the python step failed.
trap 'rm -f .private_key_file .private_key_passphrase_file' EXIT

# Fetch credentials from vault.
# Do not print commands from here on since they contain secrets.
set +x

# Retrieve a vault token corresponding to the jenkins AppRole. The token is stored
# in VAULT_TOKEN, which is implicitly used by subsequent vault commands in this script.
# Instructions followed: https://learn.hashicorp.com/tutorials/vault/approle#step-4-login-with-roleid-secretid
export VAULT_TOKEN=$(vault write -field=token auth/approle/login \
    role_id=${ANALYTICS_VAULT_ROLE_ID} \
    secret_id=${ANALYTICS_VAULT_SECRET_ID}
)

# JIRA webhook URL and secret string from vault.
WEBHOOK_URL=$(
    vault kv get \
        -version=${JIRA_WEBHOOK_VAULT_KV_VERSION} \
        -field=JIRA_WEBHOOK_URL \
        ${JIRA_WEBHOOK_VAULT_KV_PATH} \
)
WEBHOOK_SECRET=$(
    vault kv get \
        -version=${JIRA_WEBHOOK_VAULT_KV_VERSION} \
        -field=JIRA_WEBHOOK_SECRET \
        ${JIRA_WEBHOOK_VAULT_KV_PATH} \
)

# Snowflake credentials from vault.
SNOWFLAKE_ACCOUNT=$(
    vault kv get \
        -version=${AUTOMATION_TASK_VAULT_KV_VERSION} \
        -field=account \
        ${AUTOMATION_TASK_VAULT_KV_PATH} \
)

SNOWFLAKE_USER=$(
    vault kv get \
        -version=${AUTOMATION_TASK_VAULT_KV_VERSION} \
        -field=user \
        ${AUTOMATION_TASK_VAULT_KV_PATH} \
)

# The detect_new_raw_columns.py script, much like all other scripts that connect
# to Snowflake, expects the private key and the private key passphrase to be in
# files. As a result, they are written to files (removed by the EXIT trap above).
vault kv get \
    -version=${AUTOMATION_TASK_VAULT_KV_VERSION} \
    -field=private_key \
    ${AUTOMATION_TASK_VAULT_KV_PATH} > .private_key_file

vault kv get \
    -version=${AUTOMATION_TASK_VAULT_KV_VERSION} \
    -field=private_key_passphrase \
    ${AUTOMATION_TASK_VAULT_KV_PATH} > .private_key_passphrase_file

# The extra vars file for this job contains both field mappings and the necessary
# credentials for Snowflake and Jenkins. Therefore, the options to the script are
# read from the config file.
CONFIG_PATH="${WORKSPACE}/analytics-secure/job-configs/DETECT_NEW_DBT_MANUAL_MODELS_FIELDS_JOB_MAPPINGS.yaml"

# Command tracing stays disabled here on purpose: the invocation below carries
# the JIRA webhook URL/secret on its command line, and re-enabling `set -x`
# first (as the script previously did) would echo those secrets into the
# Jenkins console log.
python detect_new_raw_columns.py "${CONFIG_PATH}" \
    --user "${SNOWFLAKE_USER}" --account "${SNOWFLAKE_ACCOUNT}" \
    --key-path .private_key_file --passphrase-path .private_key_passphrase_file \
    --jira-webhook-url "${WEBHOOK_URL}" \
    --jira-webhook-secret "${WEBHOOK_SECRET}"

# Key-file cleanup is handled by the EXIT trap; resume tracing for any
# trailing, non-secret output.
set -x