Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test chart data endpoint w/ duckdb #148

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ test = [
"responses",
]
dev = [
"duckdb",
"pre-commit",
"ruff < 0.9",
"sqlfluff >= 3.2.5"
Expand Down
5 changes: 4 additions & 1 deletion src/dashboard/get_chart_data/get_chart_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,14 @@
"sameYearOrBefore",
"sameDayOrAfter",
"sameWeekOrAfter",
"sameMonthOrAfter",
"sameYearOrAfter",
"beforeDay",
"beforeWeek",
"beforeMonth",
"beforeYear",
"afterDay",
"afterWeek",
"afterMonth",
"afterYear",
# Boolean filters (one param only)
Expand Down Expand Up @@ -140,7 +142,6 @@ def _build_query(query_params: dict, filter_groups: list, path_params: dict) ->
"""
dp_id = path_params["data_package_id"]
columns = _get_table_cols(dp_id)

inline_configs = []
none_configs = []
for filter_group in filter_groups:
Expand Down Expand Up @@ -183,6 +184,7 @@ def _build_query(query_params: dict, filter_groups: list, path_params: dict) ->
columns.remove(query_params["column"])
if query_params.get("stratifier") in columns:
columns.remove(query_params["stratifier"])

with open(pathlib.Path(__file__).parent / "templates/get_chart_data.sql.jinja") as file:
template = file.read()
loader = jinja2.FileSystemLoader(pathlib.Path(__file__).parent / "templates/")
Expand All @@ -196,6 +198,7 @@ def _build_query(query_params: dict, filter_groups: list, path_params: dict) ->
coalesce_columns=columns,
inline_configs=inline_configs,
none_configs=none_configs,
extra_filter_configs=[], # extra_filter_configs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's this line about? Did you mean to leave that comment like that?

Copy link
Contributor Author

@dogversioning dogversioning Feb 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was from an in-process attempt to fix the underlying jinja template, an approach I've since moved away from; will delete.

)
return query_str, count_col

Expand Down
10 changes: 5 additions & 5 deletions src/dashboard/get_chart_data/templates/filter_inline.sql.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,18 @@ date_trunc('month',from_iso8601_timestamp("{{ data }}")) > date_trunc('month',fr
date_trunc('year',from_iso8601_timestamp("{{ data }}")) > date_trunc('year',from_iso8601_timestamp('{{ bound }}'))
{#- Boolean filters -#}
{%- elif filter_type == 'isTrue' -%}
"{{ data }}" IS TRUE
"{{ data }}" IS TRUE AND "{{ data }}" IS NOT NULL
{%- elif filter_type == 'isNotTrue' -%}
"{{ data }}" IS NOT TRUE
"{{ data }}" IS NOT TRUE AND "{{ data }}" IS NOT NULL
{%- elif filter_type == 'isFalse' -%}
"{{ data }}" IS FALSE
"{{ data }}" IS FALSE AND "{{ data }}" IS NOT NULL
{%- elif filter_type == 'isNotFalse' -%}
"{{ data }}" IS NOT FALSE
"{{ data }}" IS NOT FALSE AND "{{ data }}" IS NOT NULL
{#- Null filters -#}
{%- elif filter_type == 'isNull' -%}
"{{ data }}" IS NULL
{%- elif filter_type == 'isNotNull' -%}
"{{ data }}" IS NOT NULL
"{{ data }}" IS NOT NULL
{#- Numeric filters -#}
{%- elif filter_type == 'eq'-%}
"{{ data }}" = {{ bound }}
Expand Down
6 changes: 0 additions & 6 deletions src/shared/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,6 @@ def update_metadata(
if extra_items is None:
extra_items = {}
check_meta_type(meta_type)
logger.info(f"### Updating metadata {meta_type}")
logger.info(f"{study} {data_package} {version}")
logger.info(f"Key: {target} Value: {value}")
logger.info(f"Pre-update size: {len(metadata.keys())}")

match meta_type:
case enums.JsonFilename.TRANSACTIONS.value:
Expand Down Expand Up @@ -152,8 +148,6 @@ def update_metadata(
case _:
raise ValueError(f"{meta_type} does not have a handler for updates.")
data_version_metadata.update(extra_items)
logger.info(f"Post-update size: {len(metadata.keys())}")
logger.info(f"### Updated metadata {meta_type}")
return metadata


Expand Down
12 changes: 9 additions & 3 deletions src/site_upload/cache_api/cache_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,19 @@ def cache_api_data(s3_client, s3_bucket_name: str, db: str, target: str) -> None
"study": dp.split("__")[0],
"name": dp.split("__")[1],
}
versions = column_types[dp_detail["study"]][dp_detail["name"]]
studies = column_types.get(dp_detail["study"], {"name": None})
versions = studies.get(dp_detail["name"], None)
if versions is None:
print(f"{dp} not found in column_types")
continue
for version in versions:
if version != dp:
continue
dp_dict = {
**dp_detail,
**versions[version],
"version": version,
"id": f"{dp_detail['study']}__{dp_detail['name']}__{version}",
"version": version.split("__")[2],
"id": f"{dp_detail['study']}__{dp_detail['name']}__{version.split('__')[2]}",
Comment on lines +51 to +52
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Is version worth splitting separately into a variable, for clarity?

Like _version_package, version_number = version.split("__", 2) or something. (for example, I don't know what the first part of the version is, so I just made up a variable name)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I buy it

}
if "__flat" in dp:
dp_dict["type"] = "flat"
Expand Down
35 changes: 34 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@
aggregator
"""

import datetime
import os
import re
from unittest import mock

import boto3
import duckdb
import pytest
from moto import mock_athena, mock_s3, mock_sns

Expand Down Expand Up @@ -155,7 +158,7 @@ def mock_notification():


@pytest.fixture
def mock_db():
def mock_athena_db():
"""Leaving this unused here for now - there are some low level inconsistencies
between moto and AWS wrangler w.r.t. how workgroups are mocked out, but we might
be able to use this in the future/mock AWS wrangler below the entrypoint if we are
Expand All @@ -176,6 +179,36 @@ def mock_db():
athena.stop()


@pytest.fixture
def mock_db(tmp_path):
    """Yield a DuckDB connection that stands in for Athena in query tests.

    Registers UDF shims for Athena/Presto SQL functions that DuckDB either
    names differently or lacks, so queries built for Athena run unmodified.
    """

    def _compat_regexp_like(string: str | None, pattern: str | None) -> bool:
        # Athena's regexp_like(); DuckDB's native equivalent is regexp_matches.
        # A NULL operand makes the SQL predicate non-true, so filter the row out
        # rather than letting re.search() raise TypeError on None.
        if string is None or pattern is None:
            return False
        return re.search(pattern, string) is not None

    def _compat_from_iso8601_timestamp(
        value: str | datetime.datetime,
    ) -> datetime.datetime:
        # Athena's from_iso8601_timestamp(); pass through values that are
        # already timestamps.
        if isinstance(value, str):
            return datetime.datetime.fromisoformat(value)
        return value

    db = duckdb.connect(tmp_path / "duck.db")
    db.create_function(
        "regexp_like",
        _compat_regexp_like,
        None,
        duckdb.typing.BOOLEAN,
    )
    db.create_function(
        "from_iso8601_timestamp",
        _compat_from_iso8601_timestamp,
        None,
        duckdb.typing.TIMESTAMP,
    )
    yield db


def test_mock_bucket():
s3_client = boto3.client("s3", region_name="us-east-1")
item = s3_client.list_objects_v2(Bucket=os.environ["TEST_BUCKET"])
Expand Down
8 changes: 4 additions & 4 deletions tests/dashboard/test_filter_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,28 +194,28 @@
["col:isTrue"],
"""
(
"col" IS TRUE
"col" IS TRUE AND "col" IS NOT NULL
)""",
),
(
["col:isNotTrue"],
"""
(
"col" IS NOT TRUE
"col" IS NOT TRUE AND "col" IS NOT NULL
)""",
),
(
["col:isFalse"],
"""
(
"col" IS FALSE
"col" IS FALSE AND "col" IS NOT NULL
)""",
),
(
["col:isNotFalse"],
"""
(
"col" IS NOT FALSE
"col" IS NOT FALSE AND "col" IS NOT NULL
)""",
),
(
Expand Down
115 changes: 115 additions & 0 deletions tests/dashboard/test_get_chart_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,118 @@ def test_handler():
'"rowCount": 2, "totalCount": 20, "data": [{"rows": [["male", 10], '
'["female", 10]]}]}'
)


def mock_get_table_cols_results(name):
    """Stand-in for _get_table_cols(); the table name argument is ignored."""
    columns = ["cnt", "nato", "greek", "numeric", "timestamp", "bool"]
    return columns


@pytest.mark.parametrize(
    "query_params,filter_groups,expected",
    [
        # filtering on display column
        ({"column": "nato"}, ["nato:strEq:alfa"], [("alfa", 50)]),
        # General check on joins with non-included columns
        ({"column": "nato"}, ["nato:strEq:alfa,greek:strEq:alpha"], [("alfa", 40)]),
        ({"column": "nato"}, ["nato:strEq:alfa,greek:strEq:beta"], [("alfa", 10)]),
        # filtering on non-included columns only
        ({"column": "nato"}, ["greek:strEq:beta"], [("alfa", 10)]),
        # checking joins on AND/OR
        (
            {"column": "nato"},
            ["greek:strEq:alpha,numeric:eq:2.2", "greek:strEq:beta,numeric:eq:1.1"],
            [("alfa", 10), ("alfa", 10)],
        ),
        # validating all potential filter types
        ## strings
        ({"column": "nato"}, ["nato:strEq:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strContains:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strStartsWith:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strEndsWith:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:matches:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strEqCI:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strContainsCI:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strStartsWithCI:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strEndsWithCI:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:matchesCI:bravo"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotEq:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotContains:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotStartsWith:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotEndsWith:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:notMatches:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotEqCI:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotContainsCI:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotStartsWithCI:alfa"], [("bravo", 10)]),
        ({"column": "nato"}, ["nato:strNotEndsWithCI:alfa"], [("bravo", 10)]),
        # was listed twice; one case covers it
        ({"column": "nato"}, ["nato:notMatchesCI:alfa"], [("bravo", 10)]),
        # Date handling
        ({"column": "nato"}, ["timestamp:sameDay:2022-02-02"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameWeek:2022-02-03"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameMonth:2022-02-21"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameYear:2022-03-03"], [("alfa", 10)]),
        (
            {"column": "nato"},
            ["timestamp:sameDayOrBefore:2022-02-02"],
            [("alfa", 40), ("alfa", 10), ("bravo", 10)],
        ),
        (
            {"column": "nato"},
            ["timestamp:sameWeekOrBefore:2022-02-03"],
            [("alfa", 40), ("alfa", 10), ("bravo", 10)],
        ),
        (
            {"column": "nato"},
            ["timestamp:sameMonthOrBefore:2022-02-21"],
            [("alfa", 40), ("alfa", 10), ("bravo", 10)],
        ),
        (
            {"column": "nato"},
            ["timestamp:sameYearOrBefore:2022-03-03"],
            [("alfa", 40), ("alfa", 10), ("bravo", 10)],
        ),
        ({"column": "nato"}, ["timestamp:sameDayOrAfter:2022-02-02"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameWeekOrAfter:2022-02-03"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameMonthOrAfter:2022-02-21"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:sameYearOrAfter:2022-03-03"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:beforeDay:2022-02-02"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["timestamp:beforeWeek:2022-02-03"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["timestamp:beforeMonth:2022-02-21"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["timestamp:beforeYear:2022-03-03"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["timestamp:afterDay:2022-02-01"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:afterWeek:2022-01-20"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:afterMonth:2022-01-01"], [("alfa", 10)]),
        ({"column": "nato"}, ["timestamp:afterYear:2021-03-03"], [("alfa", 10)]),
        # numeric
        ({"column": "nato"}, ["numeric:eq:2.2"], [("alfa", 10)]),
        ({"column": "nato"}, ["numeric:ne:1.1"], [("alfa", 10)]),
        ({"column": "nato"}, ["numeric:gt:2.1"], [("alfa", 10)]),
        ({"column": "nato"}, ["numeric:gte:2.2"], [("alfa", 10)]),
        ({"column": "nato"}, ["numeric:lt:2.2"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["numeric:lte:2.2"], [("alfa", 40), ("alfa", 10), ("bravo", 10)]),
        # Boolean
        ({"column": "nato"}, ["bool:isTrue"], [("alfa", 10)]),
        ({"column": "nato"}, ["bool:isNotTrue"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["bool:isNotFalse"], [("alfa", 10)]),
        ({"column": "nato"}, ["bool:isFalse"], [("alfa", 40), ("bravo", 10)]),
        ({"column": "nato"}, ["bool:isNull"], [("alfa", 50), ("bravo", 10)]),
        ({"column": "nato"}, ["bool:isNotNull"], [("alfa", 40), ("alfa", 10), ("bravo", 10)]),
    ],
)
@mock.patch(
    "src.dashboard.get_chart_data.get_chart_data._get_table_cols", mock_get_table_cols_results
)
def test_query_results(mock_db, mock_bucket, query_params, filter_groups, expected):
    """Build a chart-data query and run it against a DuckDB copy of the test cube.

    Loads the mock cube parquet into a table named like the data package,
    builds the SQL via _build_query, and checks the exact rows (order included).
    """
    mock_db.execute(f'CREATE SCHEMA "{TEST_GLUE_DB}"')
    mock_db.execute(
        'CREATE TABLE "cumulus-aggregator-test-db"."test__cube__001" AS SELECT * FROM '
        'read_parquet("./tests/test_data/mock_cube_col_types.parquet")'
    )
    query, count_col = get_chart_data._build_query(
        query_params, filter_groups, {"data_package_id": "test__cube__001"}
    )
    res = mock_db.execute(query).fetchall()

    # Direct list comparison asserts both length and element order at once.
    assert res == expected
Loading
Loading