From 449ab037c58cd59f49f7dd5eaa2658096011d106 Mon Sep 17 00:00:00 2001
From: Kegan Maher
Date: Wed, 13 Nov 2024 00:41:28 +0000
Subject: [PATCH 1/5] fix(benefits): use json_extract_column to get single-value data

---

Notes (review):
    Inside the COALESCE that builds event_properties_enrollment_group, both
    arguments switch from the json_extract_flattened_column macro to
    json_extract_column; the macro arguments themselves are unchanged.
    The macro definitions are not part of this series, so how the two
    macros differ cannot be confirmed from this patch alone.

 warehouse/models/mart/benefits/fct_benefits_events.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/warehouse/models/mart/benefits/fct_benefits_events.sql b/warehouse/models/mart/benefits/fct_benefits_events.sql
index 2b903b2bf5..5497f9cf56 100644
--- a/warehouse/models/mart/benefits/fct_benefits_events.sql
+++ b/warehouse/models/mart/benefits/fct_benefits_events.sql
@@ -31,8 +31,8 @@ WITH fct_benefits_events AS (
         -- Historical data existed in `payment_group` but new data is in `enrollment_group`
         -- https://github.com/cal-itp/benefits/pull/2391
         COALESCE(
-            {{ json_extract_flattened_column('event_properties', 'enrollment_group', no_alias = true) }},
-            {{ json_extract_flattened_column('event_properties', 'payment_group', no_alias = true) }}
+            {{ json_extract_column('event_properties', 'enrollment_group', no_alias = true) }},
+            {{ json_extract_column('event_properties', 'payment_group', no_alias = true) }}
         ) AS event_properties_enrollment_group,
         -- New column `enrollment_method`, historical values should be set to "digital"
         -- https://github.com/cal-itp/benefits/pull/2402
From 09033264c476f08194202d16901f5dbe3be086fd Mon Sep 17 00:00:00 2001
From: Kegan Maher
Date: Wed, 13 Nov 2024 01:25:35 +0000
Subject: [PATCH 2/5] refactor(benefits): separate JSON extract, data migration steps

- CTE fct_benefits_events_raw extracts JSON columns and COALESCEs old columns
- CTE fct_benefits_events applies migration / cleanup for data values in fct_benefits_events_raw
- CTE fct_benefits_historic_enrollments converts old-style enrollments in fct_benefits_events to current-style

final table is combination of 2 CTEs: fct_benefits_events + fct_benefits_historic_enrollments
---

Notes (review):
    The single fct_benefits_events CTE becomes three CTEs:
      * fct_benefits_events_raw reads stg_amplitude__benefits_events and
        holds the JSON extraction / COALESCE fallbacks.
      * fct_benefits_events reads the raw CTE and keeps the CASE-based
        rewrites (event_type, start_version, version_name are visible in
        the hunk context).
      * fct_benefits_historic_enrollments is the former final SELECT,
        moved into a CTE with the same column list and WHERE filter.
    The closing query is now SELECT * on both sides of UNION DISTINCT.
    A union pairs columns by position, so the column lists of the last
    two CTEs have to stay in the same order.

 .../mart/benefits/fct_benefits_events.sql | 275 +++++++++++-------
 1 file changed, 172 insertions(+), 103 deletions(-)

diff --git a/warehouse/models/mart/benefits/fct_benefits_events.sql b/warehouse/models/mart/benefits/fct_benefits_events.sql
index 5497f9cf56..be76e9fc5b 100644
--- a/warehouse/models/mart/benefits/fct_benefits_events.sql
+++ b/warehouse/models/mart/benefits/fct_benefits_events.sql
@@ -1,8 +1,12 @@
 {{ config(materialized='table') }}
 
-WITH fct_benefits_events AS (
+WITH fct_benefits_events_raw AS (
+    -- fct_benefits_events_raw extracts JSON columns and
+    -- COALESCEs old columns as they evolve
+    -- this is an intermediate CTE used to build the final table
+
+    -- Keep fields in alphabetical order, list all fields in the final table
     SELECT
-        -- Keep fields in alphabetical order
         amplitude_id,
         app,
         city,
@@ -50,6 +54,80 @@ WITH fct_benefits_events AS (
         {{ json_extract_column('event_properties', 'status') }},
         {{ json_extract_column('event_properties', 'transit_agency') }},
         event_time,
+        event_type,
+        language,
+        library,
+        os_name,
+        os_version,
+        processed_time,
+        region,
+        server_received_time,
+        server_upload_time,
+        session_id,
+        start_version,
+        user_id,
+        {{ json_extract_column('user_properties', 'eligibility_verifier') }},
+        -- Historical data existed in `eligibility_types` but new data is in `enrollment_flows`
+        -- https://github.com/cal-itp/benefits/pull/2379
+        COALESCE(
+            {{ json_extract_flattened_column('user_properties', 'enrollment_flows', no_alias = true) }},
+            {{ json_extract_flattened_column('user_properties', 'eligibility_types', no_alias = true) }}
+        ) AS user_properties_enrollment_flows,
+        -- New column `enrollment_method`, historical values should be set to "digital"
+        -- https://github.com/cal-itp/benefits/pull/2402
+        COALESCE(
+            {{ json_extract_column('user_properties', 'enrollment_method', no_alias = true) }},
+            "digital"
+        ) AS user_properties_enrollment_method,
+        {{ json_extract_column('user_properties', 'initial_referrer') }},
+        {{ json_extract_column('user_properties', 'initial_referring_domain') }},
+        {{ json_extract_column('user_properties', 'referrer') }},
+        {{ json_extract_column('user_properties', 'referring_domain') }},
+        -- Historical data existed in `provider_name` but new data is in `transit_agency`
+        -- https://github.com/cal-itp/benefits/pull/901
+        COALESCE(
+            {{ json_extract_column('user_properties', 'transit_agency', no_alias = true) }},
+            {{ json_extract_column('user_properties', 'provider_name', no_alias = true) }}
+        ) AS user_properties_transit_agency,
+        {{ json_extract_column('user_properties', 'user_agent') }},
+        uuid,
+        version_name
+    FROM {{ ref('stg_amplitude__benefits_events') }}
+),
+fct_benefits_events AS (
+    -- fct_benefits_events applies data cleanup and transformations
+    -- on top of the fct_benefits_events_raw CTE
+    -- this is an intermediate CTE used to build the final table
+
+    -- Keep fields in alphabetical order, list all fields in the final table
+    SELECT
+        amplitude_id,
+        app,
+        city,
+        client_event_time,
+        client_upload_time,
+        country,
+        device_family,
+        device_id,
+        device_type,
+        event_id,
+        event_properties_card_tokenize_func,
+        event_properties_card_tokenize_url,
+        event_properties_claims_provider,
+        event_properties_eligibility_verifier,
+        event_properties_enrollment_flows,
+        event_properties_enrollment_group,
+        event_properties_enrollment_method,
+        event_properties_error_name,
+        event_properties_error_status,
+        event_properties_error_sub,
+        event_properties_href,
+        event_properties_language,
+        event_properties_origin,
+        event_properties_path,
+        event_properties_status,
+        event_properties_transit_agency,
+        event_time,
         CASE
             WHEN event_type = "selected eligibility verifier"
                 THEN "selected enrollment flow"
@@ -86,30 +164,15 @@ WITH fct_benefits_events AS (
             ELSE start_version
         END AS start_version,
         user_id,
-        {{ json_extract_column('user_properties', 'eligibility_verifier') }},
-        -- Historical data existed in `eligibility_types` but new data is in `enrollment_flows`
-        -- https://github.com/cal-itp/benefits/pull/2379
-        COALESCE(
-            {{ json_extract_flattened_column('user_properties', 'enrollment_flows', no_alias = true) }},
-            {{ json_extract_flattened_column('user_properties', 'eligibility_types', no_alias = true) }}
-        ) AS user_properties_enrollment_flows,
-        -- New column `enrollment_method`, historical values should be set to "digital"
-        -- https://github.com/cal-itp/benefits/pull/2402
-        COALESCE(
-            {{ json_extract_column('user_properties', 'enrollment_method', no_alias = true) }},
-            "digital"
-        ) AS user_properties_enrollment_method,
-        {{ json_extract_column('user_properties', 'initial_referrer') }},
-        {{ json_extract_column('user_properties', 'initial_referring_domain') }},
-        {{ json_extract_column('user_properties', 'referrer') }},
-        {{ json_extract_column('user_properties', 'referring_domain') }},
-        -- Historical data existed in `provider_name` but new data is in `transit_agency`
-        -- https://github.com/cal-itp/benefits/pull/901
-        COALESCE(
-            {{ json_extract_column('user_properties', 'transit_agency', no_alias = true) }},
-            {{ json_extract_column('user_properties', 'provider_name', no_alias = true) }}
-        ) AS user_properties_transit_agency,
-        {{ json_extract_column('user_properties', 'user_agent') }},
+        user_properties_eligibility_verifier,
+        user_properties_enrollment_flows,
+        user_properties_enrollment_method,
+        user_properties_initial_referrer,
+        user_properties_initial_referring_domain,
+        user_properties_referrer,
+        user_properties_referring_domain,
+        user_properties_transit_agency,
+        user_properties_user_agent,
         uuid,
         -- Fix bug in Docker build process resulting in incorrect version strings
         -- https://github.com/cal-itp/benefits/pull/2392
@@ -128,85 +191,91 @@ WITH fct_benefits_events AS (
                 THEN "2024.10.1"
             ELSE version_name
         END AS version_name
+    FROM fct_benefits_events_raw
+),
+fct_benefits_historic_enrollments AS (
+    -- fct_benefits_historic_enrollments transforms old enrollment events
+    -- from the fct_benefits_events CTE into the newer style
+    -- this is an intermediate CTE used to build the final table
 
-    FROM {{ ref('stg_amplitude__benefits_events') }}
+    -- Keep fields in alphabetical order, list all fields in the final table
+    SELECT
+        amplitude_id,
+        app,
+        city,
+        client_event_time,
+        client_upload_time,
+        country,
+        device_family,
+        device_id,
+        device_type,
+        event_id,
+        event_properties_card_tokenize_func,
+        event_properties_card_tokenize_url,
+        CASE
+            WHEN client_event_time < '2022-08-12T07:00:00Z'
+                THEN "ca-dmv"
+            WHEN client_event_time >= '2022-08-12T07:00:00Z'
+                THEN "cdt-logingov"
+        END AS event_properties_claims_provider,
+        CASE
+            WHEN client_event_time < '2022-08-12T07:00:00Z'
+                THEN "ca-dmv"
+            WHEN client_event_time >= '2022-08-12T07:00:00Z'
+                THEN "cdt-logingov"
+        END AS event_properties_eligibility_verifier,
+        "senior" AS event_properties_enrollment_flows,
+        "5170d37b-43d5-4049-899c-b4d850e14990" AS event_properties_enrollment_group,
+        event_properties_enrollment_method,
+        event_properties_error_name,
+        event_properties_error_status,
+        event_properties_error_sub,
+        event_properties_href,
+        event_properties_language,
+        event_properties_origin,
+        event_properties_path,
+        "success" AS event_properties_status,
+        "Monterey-Salinas Transit" AS event_properties_transit_agency,
+        event_time,
+        "returned enrollment" AS event_type,
+        language,
+        library,
+        os_name,
+        os_version,
+        processed_time,
+        region,
+        server_received_time,
+        server_upload_time,
+        session_id,
+        start_version,
+        user_id,
+        CASE
+            WHEN client_event_time < '2022-08-12T07:00:00Z'
+                THEN "ca-dmv"
+            WHEN client_event_time >= '2022-08-12T07:00:00Z'
+                THEN "cdt-logingov"
+        END AS user_properties_eligibility_verifier,
+        "senior" AS user_properties_enrollment_flows,
+        user_properties_enrollment_method,
+        user_properties_initial_referrer,
+        user_properties_initial_referring_domain,
+        user_properties_referrer,
+        user_properties_referring_domain,
+        "Monterey-Salinas Transit" AS user_properties_transit_agency,
+        user_properties_user_agent,
+        uuid,
+        version_name
+    FROM fct_benefits_events
+    WHERE client_event_time >= '2021-12-08T08:00:00Z'
+        and client_event_time < '2022-08-29T07:00:00Z'
+        and (region = 'California' or region is null)
+        and (city != 'Los Angeles' or city is null)
+        and event_type = 'viewed page'
+        and event_properties_path = '/enrollment/success'
 )
 
-SELECT
-    -- Keep fields in alphabetical order, match fields from fct_benefits_events CTE
-    amplitude_id,
-    app,
-    city,
-    client_event_time,
-    client_upload_time,
-    country,
-    device_family,
-    device_id,
-    device_type,
-    event_id,
-    event_properties_card_tokenize_func,
-    event_properties_card_tokenize_url,
-    CASE
-        WHEN client_event_time < '2022-08-12T07:00:00Z'
-            THEN "ca-dmv"
-        WHEN client_event_time >= '2022-08-12T07:00:00Z'
-            THEN "cdt-logingov"
-    END AS event_properties_claims_provider,
-    CASE
-        WHEN client_event_time < '2022-08-12T07:00:00Z'
-            THEN "ca-dmv"
-        WHEN client_event_time >= '2022-08-12T07:00:00Z'
-            THEN "cdt-logingov"
-    END AS event_properties_eligibility_verifier,
-    "senior" AS event_properties_enrollment_flows,
-    "5170d37b-43d5-4049-899c-b4d850e14990" AS event_properties_enrollment_group,
-    event_properties_enrollment_method,
-    event_properties_error_name,
-    event_properties_error_status,
-    event_properties_error_sub,
-    event_properties_href,
-    event_properties_language,
-    event_properties_origin,
-    event_properties_path,
-    "success" AS event_properties_status,
-    "Monterey-Salinas Transit" AS event_properties_transit_agency,
-    event_time,
-    "returned enrollment" AS event_type,
-    language,
-    library,
-    os_name,
-    os_version,
-    processed_time,
-    region,
-    server_received_time,
-    server_upload_time,
-    session_id,
-    start_version,
-    user_id,
-    CASE
-        WHEN client_event_time < '2022-08-12T07:00:00Z'
-            THEN "ca-dmv"
-        WHEN client_event_time >= '2022-08-12T07:00:00Z'
-            THEN "cdt-logingov"
-    END AS user_properties_eligibility_verifier,
-    "senior" AS user_properties_enrollment_flows,
-    user_properties_enrollment_method,
-    user_properties_initial_referrer,
-    user_properties_initial_referring_domain,
-    user_properties_referrer,
-    user_properties_referring_domain,
-    "Monterey-Salinas Transit" AS user_properties_transit_agency,
-    user_properties_user_agent,
-    uuid,
-    version_name
-FROM fct_benefits_events
-WHERE client_event_time >= '2021-12-08T08:00:00Z'
-    and client_event_time < '2022-08-29T07:00:00Z'
-    and (region = 'California' or region is null)
-    and (city != 'Los Angeles' or city is null)
-    and event_type = 'viewed page'
-    and event_properties_path = '/enrollment/success'
-
+-- the final table is the combination of
+-- fct_benefits_historic_enrollments + fct_benefits_events
+SELECT * FROM fct_benefits_historic_enrollments
 UNION DISTINCT
-
 SELECT * FROM fct_benefits_events
From f5c7a42c29661fc2225d24a09e3078eb6aca20c5 Mon Sep 17 00:00:00 2001
From: Kegan Maher
Date: Wed, 13 Nov 2024 01:39:43 +0000
Subject: [PATCH 3/5] chore(benefits): update old event_properties_claims_provider values

---

Notes (review):
    In the fct_benefits_events CTE, event_properties_claims_provider is
    now produced by a CASE: the value "cdt-logingov-ial2" is rewritten to
    "cdt-logingov"; every other value (including NULL) passes through the
    ELSE branch unchanged.

 warehouse/models/mart/benefits/fct_benefits_events.sql | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/warehouse/models/mart/benefits/fct_benefits_events.sql b/warehouse/models/mart/benefits/fct_benefits_events.sql
index be76e9fc5b..87f3c34e11 100644
--- a/warehouse/models/mart/benefits/fct_benefits_events.sql
+++ b/warehouse/models/mart/benefits/fct_benefits_events.sql
@@ -113,7 +113,11 @@ fct_benefits_events AS (
         event_id,
         event_properties_card_tokenize_func,
         event_properties_card_tokenize_url,
-        event_properties_claims_provider,
+        CASE
+            WHEN event_properties_claims_provider = "cdt-logingov-ial2"
+                THEN "cdt-logingov"
+            ELSE event_properties_claims_provider
+        END AS event_properties_claims_provider,
         event_properties_eligibility_verifier,
         event_properties_enrollment_flows,
         event_properties_enrollment_group,
From 0115098f370aa158791b6e205551d1fe9999abb1 Mon Sep 17 00:00:00 2001
From: Kegan Maher
Date: Wed, 13 Nov 2024 01:56:56 +0000
Subject: [PATCH 4/5] chore(benefits): update old event_properties_eligibility_verifier values

---

Notes (review):
    In the fct_benefits_events CTE, event_properties_eligibility_verifier
    is now produced by a CASE over four IN lists of historic labels. The
    lists map to "cdt-logingov", "cdt-vagov", and two eligibility-server
    "/verify" URLs respectively; any value not listed falls through the
    ELSE branch unchanged. Matching is by exact string equality.

 .../mart/benefits/fct_benefits_events.sql | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/warehouse/models/mart/benefits/fct_benefits_events.sql b/warehouse/models/mart/benefits/fct_benefits_events.sql
index 87f3c34e11..b4cf428a9c 100644
--- a/warehouse/models/mart/benefits/fct_benefits_events.sql
+++ b/warehouse/models/mart/benefits/fct_benefits_events.sql
@@ -118,7 +118,34 @@ fct_benefits_events AS (
                 THEN "cdt-logingov"
             ELSE event_properties_claims_provider
         END AS event_properties_claims_provider,
-        event_properties_eligibility_verifier,
+        -- Normalize historic data into current format
+        -- https://github.com/cal-itp/benefits/issues/2521
+        CASE
+            WHEN event_properties_eligibility_verifier IN (
+                '(MST) CDT claims via Login.gov',
+                '(SBMTD) CDT claims via Login.gov',
+                'CDT claims via Login.gov (MST)',
+                'CDT claims via Login.gov (SBMTD)',
+                'OAuth claims via Login.gov',
+                'senior'
+            ) THEN "cdt-logingov"
+            WHEN event_properties_eligibility_verifier IN (
+                '(MST) VA.gov - Veteran',
+                'VA.gov - Veteran (MST)',
+                'veteran'
+            ) THEN "cdt-vagov"
+            WHEN event_properties_eligibility_verifier IN (
+                '(MST) Courtesy Card Eligibility Server Verifier (prod)',
+                'MST Courtesy Card Eligibility Server Verifier',
+                'courtesy_card'
+            ) THEN "https://mst-courtesy-cards-eligibility-server-prod-azcscsbmembwcugk.z01.azurefd.net/verify"
+            WHEN event_properties_eligibility_verifier IN (
+                '(SBMTD) Mobility Pass Eligibility Server Verifier (prod)',
+                'SBMTD Mobility Pass Eligibility Server Verifier (prod)',
+                'mobility_pass'
+            ) THEN "https://sbmtd-mobility-pass-eligibility-server-prod-h3d3djedb7ahfqeg.z01.azurefd.net/verify"
+            ELSE event_properties_eligibility_verifier
+        END AS event_properties_eligibility_verifier,
         event_properties_enrollment_flows,
         event_properties_enrollment_group,
         event_properties_enrollment_method,
From 4645bc86f792fd608a3e7fbdc000be8ba8e62d23 Mon Sep 17 00:00:00 2001
From: Kegan Maher
Date: Wed, 13 Nov 2024 01:58:19 +0000
Subject: [PATCH 5/5] chore(benefits): update old user_properties_eligibility_verifier values

---

Notes (review):
    Same CASE / IN-list mapping as PATCH 4/5, applied to
    user_properties_eligibility_verifier. The label lists and target
    values are duplicated verbatim between the two columns, so a future
    change to one list needs the matching change in the other.

 .../mart/benefits/fct_benefits_events.sql | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/warehouse/models/mart/benefits/fct_benefits_events.sql b/warehouse/models/mart/benefits/fct_benefits_events.sql
index b4cf428a9c..764c081d82 100644
--- a/warehouse/models/mart/benefits/fct_benefits_events.sql
+++ b/warehouse/models/mart/benefits/fct_benefits_events.sql
@@ -195,7 +195,34 @@ fct_benefits_events AS (
             ELSE start_version
         END AS start_version,
         user_id,
-        user_properties_eligibility_verifier,
+        -- Normalize historic data into current format
+        -- https://github.com/cal-itp/benefits/issues/2521
+        CASE
+            WHEN user_properties_eligibility_verifier IN (
+                '(MST) CDT claims via Login.gov',
+                '(SBMTD) CDT claims via Login.gov',
+                'CDT claims via Login.gov (MST)',
+                'CDT claims via Login.gov (SBMTD)',
+                'OAuth claims via Login.gov',
+                'senior'
+            ) THEN "cdt-logingov"
+            WHEN user_properties_eligibility_verifier IN (
+                '(MST) VA.gov - Veteran',
+                'VA.gov - Veteran (MST)',
+                'veteran'
+            ) THEN "cdt-vagov"
+            WHEN user_properties_eligibility_verifier IN (
+                '(MST) Courtesy Card Eligibility Server Verifier (prod)',
+                'MST Courtesy Card Eligibility Server Verifier',
+                'courtesy_card'
+            ) THEN "https://mst-courtesy-cards-eligibility-server-prod-azcscsbmembwcugk.z01.azurefd.net/verify"
+            WHEN user_properties_eligibility_verifier IN (
+                '(SBMTD) Mobility Pass Eligibility Server Verifier (prod)',
+                'SBMTD Mobility Pass Eligibility Server Verifier (prod)',
+                'mobility_pass'
+            ) THEN "https://sbmtd-mobility-pass-eligibility-server-prod-h3d3djedb7ahfqeg.z01.azurefd.net/verify"
+            ELSE user_properties_eligibility_verifier
+        END AS user_properties_eligibility_verifier,
         user_properties_enrollment_flows,
         user_properties_enrollment_method,
         user_properties_initial_referrer,