From 7a33cb76c558590c369a18c67eb673d128aa3cc3 Mon Sep 17 00:00:00 2001 From: Luke Couzens Date: Tue, 5 Dec 2023 17:23:11 +0000 Subject: [PATCH] [COST-4333] - Adding dtypes for Azure (#4817) * Update Azure Dtypes for Trino --- ...porting_ocpinfrastructure_provider_map.sql | 2 +- ...g_azurecostentrylineitem_daily_summary.sql | 14 +- ...ing_ocpazurecostlineitem_daily_summary.sql | 36 ++--- .../util/azure/test_azure_post_processor.py | 6 +- koku/masu/util/azure/azure_post_processor.py | 13 +- koku/reporting/provider/azure/models.py | 131 +++++++++--------- 6 files changed, 102 insertions(+), 100 deletions(-) diff --git a/koku/masu/database/trino_sql/azure/reporting_ocpinfrastructure_provider_map.sql b/koku/masu/database/trino_sql/azure/reporting_ocpinfrastructure_provider_map.sql index 1f84d03833..fd5c2179bf 100644 --- a/koku/masu/database/trino_sql/azure/reporting_ocpinfrastructure_provider_map.sql +++ b/koku/masu/database/trino_sql/azure/reporting_ocpinfrastructure_provider_map.sql @@ -1,6 +1,6 @@ WITH cte_azure_instances AS ( - SELECT DISTINCT split_part(coalesce(azure.resourceid, azure.instanceid), '/', 9) as instance, + SELECT DISTINCT split_part(coalesce(nullif(azure.resourceid, ''), azure.instanceid), '/', 9) as instance, azure.source FROM hive.{{schema | sqlsafe}}.azure_line_items AS azure WHERE coalesce(azure.date, azure.usagedatetime) >= {{start_date}} diff --git a/koku/masu/database/trino_sql/reporting_azurecostentrylineitem_daily_summary.sql b/koku/masu/database/trino_sql/reporting_azurecostentrylineitem_daily_summary.sql index b4a23b379e..a247d158cc 100644 --- a/koku/masu/database/trino_sql/reporting_azurecostentrylineitem_daily_summary.sql +++ b/koku/masu/database/trino_sql/reporting_azurecostentrylineitem_daily_summary.sql @@ -21,17 +21,17 @@ INSERT INTO postgres.{{schema | sqlsafe}}.reporting_azurecostentrylineitem_daily WITH cte_line_items AS ( SELECT date(coalesce(date, usagedatetime)) as usage_date, INTEGER '{{bill_id | sqlsafe}}' as cost_entry_bill_id, - coalesce(subscriptionid, subscriptionguid) as subscription_guid, + coalesce(nullif(subscriptionid, ''), subscriptionguid) as subscription_guid, resourcelocation as resource_location, - coalesce(servicename, metercategory) as service_name, + coalesce(nullif(servicename, ''), metercategory) as service_name, json_extract_scalar(json_parse(additionalinfo), '$.ServiceType') as instance_type, - cast(coalesce(quantity, usagequantity) as DECIMAL(24,9)) as usage_quantity, - cast(coalesce(costinbillingcurrency, pretaxcost) as DECIMAL(24,9)) as pretax_cost, - coalesce(billingcurrencycode, currency, billingcurrency) as currency, + cast(coalesce(nullif(quantity, 0), usagequantity) as DECIMAL(24,9)) as usage_quantity, + cast(coalesce(nullif(costinbillingcurrency, 0), pretaxcost) as DECIMAL(24,9)) as pretax_cost, + coalesce(nullif(billingcurrencycode, ''), nullif(currency, ''), billingcurrency) as currency, json_parse(tags) as tags, - coalesce(resourceid, instanceid) as instance_id, + coalesce(nullif(resourceid, ''), instanceid) as instance_id, cast(source as UUID) as source_uuid, - coalesce(subscriptionname, subscriptionid, subscriptionguid) as subscription_name, + coalesce(nullif(subscriptionname, ''), nullif(subscriptionid, ''), subscriptionguid) as subscription_name, CASE WHEN regexp_like(split_part(unitofmeasure, ' ', 1), '^\d+(\.\d+)?$') AND NOT (unitofmeasure = '100 Hours' AND metercategory='Virtual Machines') AND NOT split_part(unitofmeasure, ' ', 2) = '' THEN cast(split_part(unitofmeasure, ' ', 1) as INTEGER) diff --git a/koku/masu/database/trino_sql/reporting_ocpazurecostlineitem_daily_summary.sql b/koku/masu/database/trino_sql/reporting_ocpazurecostlineitem_daily_summary.sql index f78a878271..ec393ec7fd 100644 --- a/koku/masu/database/trino_sql/reporting_ocpazurecostlineitem_daily_summary.sql +++ b/koku/masu/database/trino_sql/reporting_ocpazurecostlineitem_daily_summary.sql @@ -149,10 +149,10 @@ INSERT INTO hive.{{schema | sqlsafe}}.azure_openshift_daily_resource_matched_tem ) SELECT cast(uuid() as varchar) as uuid, coalesce(azure.date, azure.usagedatetime) as usage_start, - split_part(coalesce(resourceid, instanceid), '/', 9) as resource_id, - coalesce(servicename, metercategory) as service_name, + split_part(coalesce(nullif(resourceid, ''), instanceid), '/', 9) as resource_id, + coalesce(nullif(servicename, ''), metercategory) as service_name, max(json_extract_scalar(json_parse(azure.additionalinfo), '$.ServiceType')) as instance_type, - coalesce(azure.subscriptionid, azure.subscriptionguid) as subscription_guid, + coalesce(nullif(azure.subscriptionid, ''), azure.subscriptionguid) as subscription_guid, azure.resourcelocation as resource_location, max(CASE WHEN split_part(unitofmeasure, ' ', 2) = 'Hours' @@ -163,9 +163,9 @@ SELECT cast(uuid() as varchar) as uuid, THEN split_part(unitofmeasure, ' ', 2) ELSE unitofmeasure END) as unit_of_measure, - sum(coalesce(azure.quantity, azure.usagequantity)) as usage_quantity, - coalesce(azure.billingcurrencycode, azure.currency) as currency, - sum(coalesce(azure.costinbillingcurrency, azure.pretaxcost)) as pretax_cost, + sum(coalesce(nullif(azure.quantity, 0), azure.usagequantity)) as usage_quantity, + coalesce(nullif(azure.billingcurrencycode, ''), azure.currency) as currency, + sum(coalesce(nullif(azure.costinbillingcurrency, 0), azure.pretaxcost)) as pretax_cost, azure.tags, max(azure.resource_id_matched) as resource_id_matched, {{ocp_source_uuid}} as ocp_source, @@ -179,11 +179,11 @@ WHERE azure.source = {{azure_source_uuid}} AND coalesce(azure.date, azure.usagedatetime) < date_add('day', 1, {{end_date}}) AND azure.resource_id_matched = TRUE GROUP BY coalesce(azure.date, azure.usagedatetime), - split_part(coalesce(resourceid, instanceid), '/', 9), - coalesce(servicename, metercategory), - coalesce(subscriptionid, subscriptionguid), + split_part(coalesce(nullif(resourceid, ''), instanceid), '/', 9), + coalesce(nullif(servicename, ''), metercategory), + coalesce(nullif(subscriptionid, ''), subscriptionguid), azure.resourcelocation, - coalesce(azure.billingcurrencycode, azure.currency), + coalesce(nullif(azure.billingcurrencycode, ''), azure.currency), azure.tags ; @@ -219,9 +219,9 @@ WITH cte_enabled_tag_keys AS ( SELECT cast(uuid() as varchar) as uuid, coalesce(azure.date, azure.usagedatetime) as usage_start, split_part(coalesce(resourceid, instanceid), '/', 9) as resource_id, - coalesce(servicename, metercategory) as service_name, + coalesce(nullif(servicename, ''), metercategory) as service_name, max(json_extract_scalar(json_parse(azure.additionalinfo), '$.ServiceType')) as instance_type, - coalesce(azure.subscriptionid, azure.subscriptionguid) as subscription_guid, + coalesce(nullif(azure.subscriptionid, ''), azure.subscriptionguid) as subscription_guid, azure.resourcelocation as resource_location, max(CASE WHEN split_part(unitofmeasure, ' ', 2) = 'Hours' @@ -232,9 +232,9 @@ SELECT cast(uuid() as varchar) as uuid, THEN split_part(unitofmeasure, ' ', 2) ELSE unitofmeasure END) as unit_of_measure, - sum(coalesce(azure.quantity, azure.usagequantity)) as usage_quantity, - coalesce(azure.billingcurrencycode, azure.currency) as currency, - sum(coalesce(azure.costinbillingcurrency, azure.pretaxcost)) as pretax_cost, + sum(coalesce(nullif(azure.quantity, 0), azure.usagequantity)) as usage_quantity, + coalesce(nullif(azure.billingcurrencycode, ''), azure.currency) as currency, + sum(coalesce(nullif(azure.costinbillingcurrency, 0), azure.pretaxcost)) as pretax_cost, json_format( cast( map_filter( @@ -257,10 +257,10 @@ WHERE azure.source = {{azure_source_uuid}} AND (azure.resource_id_matched = FALSE OR azure.resource_id_matched IS NULL) GROUP BY coalesce(azure.date, azure.usagedatetime), split_part(coalesce(resourceid, instanceid), '/', 9), - coalesce(servicename, metercategory), - coalesce(subscriptionid, subscriptionguid), + coalesce(nullif(servicename, ''), metercategory), + coalesce(nullif(subscriptionid, ''), subscriptionguid), azure.resourcelocation, - coalesce(azure.billingcurrencycode, azure.currency), + coalesce(nullif(azure.billingcurrencycode, ''), azure.currency), 12, -- tags azure.matched_tag ; diff --git a/koku/masu/test/util/azure/test_azure_post_processor.py b/koku/masu/test/util/azure/test_azure_post_processor.py index 200a2d7cc1..b6a23b8e09 100644 --- a/koku/masu/test/util/azure/test_azure_post_processor.py +++ b/koku/masu/test/util/azure/test_azure_post_processor.py @@ -16,7 +16,7 @@ from masu.util.azure.azure_post_processor import AzurePostProcessor from masu.util.azure.common import INGRESS_REQUIRED_COLUMNS from reporting.provider.all.models import EnabledTagKeys -from reporting.provider.azure.models import TRINO_COLUMNS +from reporting.provider.azure.models import TRINO_REQUIRED_COLUMNS class TestAzurePostProcessor(MasuTestCase): @@ -44,9 +44,9 @@ def test_azure_process_dataframe(self): result, _ = self.post_processor.process_dataframe(df) columns = list(result) expected_columns = sorted( - col.replace("-", "_").replace("/", "_").replace(":", "_").lower() for col in TRINO_COLUMNS + col.replace("-", "_").replace("/", "_").replace(":", "_").lower() for col in TRINO_REQUIRED_COLUMNS ) - self.assertEqual(columns, expected_columns) + self.assertEqual(sorted(columns), sorted(expected_columns)) def test_azure_date_converter(self): """Test that we convert the new Azure date format.""" diff --git a/koku/masu/util/azure/azure_post_processor.py b/koku/masu/util/azure/azure_post_processor.py index 8323f7f5ee..ecc8ddf441 100644 --- a/koku/masu/util/azure/azure_post_processor.py +++ b/koku/masu/util/azure/azure_post_processor.py @@ -1,4 +1,5 @@ import json +import logging import ciso8601 import pandas @@ -10,7 +11,9 @@ from masu.util.common import populate_enabled_tag_rows_with_limit from masu.util.common import safe_float from masu.util.common import strip_characters_from_column_name -from reporting.provider.azure.models import TRINO_COLUMNS +from reporting.provider.azure.models import TRINO_REQUIRED_COLUMNS + +LOG = logging.getLogger(__name__) def azure_json_converter(tag_str): @@ -101,11 +104,9 @@ def process_dataframe(self, data_frame): data_frame = data_frame.rename(columns=column_name_map) - columns = set(data_frame) - columns = set(TRINO_COLUMNS).union(columns) - columns = sorted(columns) - - data_frame = data_frame.reindex(columns=columns) + missing = set(TRINO_REQUIRED_COLUMNS).difference(data_frame) + to_add = {k: TRINO_REQUIRED_COLUMNS[k] for k in missing} + data_frame = data_frame.assign(**to_add) unique_tags = set() for tags_json in data_frame["tags"].values: diff --git a/koku/reporting/provider/azure/models.py b/koku/reporting/provider/azure/models.py index 55444493dc..8099ab302b 100644 --- a/koku/reporting/provider/azure/models.py +++ b/koku/reporting/provider/azure/models.py @@ -5,6 +5,7 @@ """Models for Azure cost and usage entry tables.""" from uuid import uuid4 +import pandas as pd from django.contrib.postgres.fields import ArrayField from django.db import models from django.db.models import JSONField @@ -14,71 +15,71 @@ TRINO_LINE_ITEM_DAILY_TABLE = TRINO_LINE_ITEM_TABLE TRINO_OCP_ON_AZURE_DAILY_TABLE = "azure_openshift_daily" -TRINO_COLUMNS = [ - "billingperiodstartdate", - "billingperiodenddate", - "usagedatetime", - "date", - "accountname", - "accountownerid", - "additionalinfo", - "availabilityzone", - "billingaccountid", - "billingaccountname", - "billingcurrencycode", - "billingcurrency", - "billingprofileid", - "billingprofilename", - "chargetype", - "consumedservice", - "costcenter", - "costinbillingcurrency", - "currency", - "effectiveprice", - "frequency", - "instanceid", - "invoicesectionid", - "invoicesectionname", - "isazurecrediteligible", - "metercategory", - "meterid", - "metername", - "meterregion", - "metersubcategory", - "offerid", - "partnumber", - "paygprice", - "planname", - "pretaxcost", - "pricingmodel", - "productname", - "productorderid", - "productordername", - "publishername", - "publishertype", - "quantity", - "reservationid", - "reservationname", - "resourcegroup", - "resourceid", - "resourcelocation", - "resourcename", - "resourcerate", - "resourcetype", - "servicefamily", - "serviceinfo1", - "serviceinfo2", - "servicename", - "servicetier", - "subscriptionguid", - "subscriptionid", - "subscriptionname", - "tags", - "term", - "unitofmeasure", - "unitprice", - "usagequantity", -] +TRINO_REQUIRED_COLUMNS = { + "billingperiodstartdate": pd.NaT, + "billingperiodenddate": pd.NaT, + "usagedatetime": pd.NaT, + "date": pd.NaT, + "accountname": "", + "accountownerid": "", + "additionalinfo": "", + "availabilityzone": "", + "billingaccountid": "", + "billingaccountname": "", + "billingcurrencycode": "", + "billingcurrency": "", + "billingprofileid": "", + "billingprofilename": "", + "chargetype": "", + "consumedservice": "", + "costcenter": "", + "costinbillingcurrency": 0.0, + "currency": "", + "effectiveprice": 0.0, + "frequency": "", + "instanceid": "", + "invoicesectionid": "", + "invoicesectionname": "", + "isazurecrediteligible": "", + "metercategory": "", + "meterid": "", + "metername": "", + "meterregion": "", + "metersubcategory": "", + "offerid": "", + "partnumber": "", + "paygprice": 0.0, + "planname": "", + "pretaxcost": 0.0, + "pricingmodel": "", + "productname": "", + "productorderid": "", + "productordername": "", + "publishername": "", + "publishertype": "", + "quantity": 0.0, + "reservationid": "", + "reservationname": "", + "resourcegroup": "", + "resourceid": "", + "resourcelocation": "", + "resourcename": "", + "resourcerate": 0.0, + "resourcetype": "", + "servicefamily": "", + "serviceinfo1": "", + "serviceinfo2": "", + "servicename": "", + "servicetier": "", + "subscriptionguid": "", + "subscriptionid": "", + "subscriptionname": "", + "tags": "", + "term": "", + "unitofmeasure": "", + "unitprice": 0.0, + "usagequantity": 0.0, +} UI_SUMMARY_TABLES = ( "reporting_azure_compute_summary_p",