From 3bc55cfcd025fd3b0250368b603921a8485fcb4e Mon Sep 17 00:00:00 2001 From: AbdulRehman Date: Mon, 8 Jun 2026 10:43:14 +0500 Subject: [PATCH 1/4] perf(filters): cache distinct column values to avoid re-running virtual dataset query per filter --- superset/datasource/api.py | 80 +++++++++++- .../integration_tests/datasource/api_tests.py | 120 ++++++++++++++++++ 2 files changed, 194 insertions(+), 6 deletions(-) diff --git a/superset/datasource/api.py b/superset/datasource/api.py index fb3f21c6926b..2b245111b783 100644 --- a/superset/datasource/api.py +++ b/superset/datasource/api.py @@ -31,7 +31,13 @@ from superset.extensions import cache_manager from superset.superset_typing import FlaskResponse from superset.utils import json -from superset.utils.core import apply_max_row_limit, DatasourceType, SqlExpressionType +from superset.utils.core import ( + apply_max_row_limit, + DatasourceType, + get_user_id, + parse_boolean_string, + SqlExpressionType, +) from superset.views.base_api import BaseSupersetApi, statsd_metrics logger = logging.getLogger(__name__) @@ -51,8 +57,9 @@ class DatasourceRestApi(BaseSupersetApi): @safe @statsd_metrics @event_logger.log_this_with_context( - action=lambda self, *args, **kwargs: f"{self.__class__.__name__}" - f".get_column_values", + action=lambda self, *args, **kwargs: ( + f"{self.__class__.__name__}.get_column_values" + ), log_to_statsd=False, ) def get_column_values( @@ -124,13 +131,48 @@ def get_column_values( row_limit = apply_max_row_limit(app.config["FILTER_SELECT_ROW_LIMIT"]) denormalize_column = not datasource.normalize_columns + + # Cache distinct column-value results so a dashboard with many filters + # backed by the same (often heavy) virtual dataset doesn't re-execute + # the wrapping query per filter (#39342). The cache key includes the + # user id so RLS-filtered datasources can't leak values across users, + # and the dataset's ``changed_on`` so an edit to the underlying SQL + # busts cached entries on the next request. + force = parse_boolean_string(request.args.get("force")) + cache_key = ( + "col_values:" + + hashlib.sha256( + json.dumps( + { + "uid": datasource.uid, + "col": column_name, + "limit": row_limit, + "denorm": denormalize_column, + "user": get_user_id(), + "changed_on": str(getattr(datasource, "changed_on", "")), + }, + sort_keys=True, + ).encode() + ).hexdigest() + ) + + if ( + not force + and (cached := cache_manager.data_cache.get(cache_key)) is not None + ): + logger.debug( + "column-values cache HIT: uid=%s col=%s", datasource.uid, column_name + ) + response = self.response(200, result=cached) + response.headers["X-Cache-Status"] = "HIT" + return response + try: payload = datasource.values_for_column( column_name=column_name, limit=row_limit, denormalize_column=denormalize_column, ) - return self.response(200, result=payload) except KeyError: return self.response( 400, message=f"Column name {column_name} does not exist" @@ -144,6 +186,31 @@ def get_column_values( ), ) + # Warn before caching very large payloads (high-cardinality columns) + # so operators can spot cache-memory pressure before Redis OOMs. + # Threshold is operator-tunable; defaults to 100k rows. + warn_threshold = app.config.get("FILTER_VALUES_CACHE_WARN_THRESHOLD", 100_000) + if (payload_size := len(payload)) > warn_threshold: + logger.warning( + "column-values payload exceeds cache-warn threshold: " + "uid=%s col=%s rows=%d threshold=%d", + datasource.uid, + column_name, + payload_size, + warn_threshold, + ) + + timeout = datasource.cache_timeout or app.config.get( + "CACHE_DEFAULT_TIMEOUT", 300 + ) + cache_manager.data_cache.set(cache_key, payload, timeout=timeout) + logger.debug( + "column-values cache MISS: uid=%s col=%s", datasource.uid, column_name + ) + response = self.response(200, result=payload) + response.headers["X-Cache-Status"] = "MISS" + return response + @expose( "///validate_expression/", methods=("POST",), @@ -152,8 +219,9 @@ def get_column_values( @safe @statsd_metrics @event_logger.log_this_with_context( - action=lambda self, *args, **kwargs: f"{self.__class__.__name__}" - f".validate_expression", + action=lambda self, *args, **kwargs: ( + f"{self.__class__.__name__}.validate_expression" + ), log_to_statsd=False, ) def validate_expression( diff --git a/tests/integration_tests/datasource/api_tests.py b/tests/integration_tests/datasource/api_tests.py index 0dda05a41ac2..b241cfd92c78 100644 --- a/tests/integration_tests/datasource/api_tests.py +++ b/tests/integration_tests/datasource/api_tests.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +from datetime import datetime from unittest.mock import ANY, patch import pytest @@ -23,6 +24,7 @@ from superset import db, security_manager from superset.connectors.sqla.models import SqlaTable from superset.daos.exceptions import DatasourceTypeNotSupportedError +from superset.extensions import cache_manager from superset.utils import json from tests.integration_tests.base_tests import SupersetTestCase from tests.integration_tests.constants import ADMIN_USERNAME, GAMMA_USERNAME @@ -205,6 +207,124 @@ def test_get_column_values_with_rls_no_values(self): response = json.loads(rv.data.decode("utf-8")) assert response["result"] == [] + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_cache_hit_skips_query(self, values_for_column_mock): + """Regression test for #39342. + + Two identical requests for the same column on the same datasource + should hit ``values_for_column`` exactly once — the second request + returns the cached payload. + """ + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["a", "b", "c"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + rv1 = self.client.get(url) + rv2 = self.client.get(url) + + assert rv1.status_code == 200 + assert rv2.status_code == 200 + assert json.loads(rv2.data.decode("utf-8"))["result"] == ["a", "b", "c"] + assert values_for_column_mock.call_count == 1 + + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_force_bypasses_cache(self, values_for_column_mock): + """``?force=true`` should bypass the cache and re-query the source.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["a", "b"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + self.client.get(url) + self.client.get(f"{url}?force=true") + + assert values_for_column_mock.call_count == 2 + + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_cache_isolated_per_column(self, values_for_column_mock): + """Different columns on the same datasource must not share a cache + entry — otherwise filter values would be silently swapped.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["x"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + + self.client.get(f"api/v1/datasource/table/{table.id}/column/col1/values/") + self.client.get(f"api/v1/datasource/table/{table.id}/column/col2/values/") + + assert values_for_column_mock.call_count == 2 + + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_cache_isolated_per_user(self, values_for_column_mock): + """RLS safety: two users hitting the same column must each populate + their own cache entry. Without this isolation, user A's + RLS-restricted values could leak to user B.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["v"] + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + self.login(ADMIN_USERNAME) + self.client.get(url) + self.logout() + self.login(GAMMA_USERNAME) + # Grant gamma the permission so the request reaches the cache path. + perm = security_manager.find_permission_view_menu( + "can_get_column_values", "Datasource" + ) + security_manager.add_permission_role(security_manager.find_role("Gamma"), perm) + self.client.get(url) + + assert values_for_column_mock.call_count == 2 + + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_cache_busts_on_changed_on(self, values_for_column_mock): + """Editing the underlying virtual dataset SQL bumps ``changed_on``, + which is part of the cache key — so the next request must miss the + cache and re-run the query.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["v"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + self.client.get(url) + # Simulate an edit to the dataset; ``changed_on`` is what the cache + # key reads, so any new value forces a miss. + table.changed_on = datetime(2030, 1, 1) + db.session.flush() + self.client.get(url) + + assert values_for_column_mock.call_count == 2 + + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_response_advertises_cache_status( + self, values_for_column_mock + ): + """The ``X-Cache-Status`` response header should advertise MISS on + the populating request and HIT on the next identical request, so + operators can debug cache behavior from logs or browser devtools.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["v"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + rv_miss = self.client.get(url) + rv_hit = self.client.get(url) + + assert rv_miss.headers.get("X-Cache-Status") == "MISS" + assert rv_hit.headers.get("X-Cache-Status") == "HIT" + @patch("superset.datasource.api.security_manager.can_access") @patch("superset.datasource.api.GetCombinedDatasourceListCommand.run") def test_combined_list_invalid_order_column( From 92654da298b8ccb419fc5c826fbd11e82e9c601a Mon Sep 17 00:00:00 2001 From: AbdulRehman Date: Mon, 8 Jun 2026 11:17:49 +0500 Subject: [PATCH 2/4] fix(filters): include RLS fingerprint in cache key so embedded/guest sessions don't share entries --- superset/datasource/api.py | 19 +++++++++++---- .../integration_tests/datasource/api_tests.py | 23 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/superset/datasource/api.py b/superset/datasource/api.py index 2b245111b783..d8d462f01b91 100644 --- a/superset/datasource/api.py +++ b/superset/datasource/api.py @@ -134,10 +134,20 @@ def get_column_values( # Cache distinct column-value results so a dashboard with many filters # backed by the same (often heavy) virtual dataset doesn't re-execute - # the wrapping query per filter (#39342). The cache key includes the - # user id so RLS-filtered datasources can't leak values across users, - # and the dataset's ``changed_on`` so an edit to the underlying SQL - # busts cached entries on the next request. + # the wrapping query per filter (#39342). + # + # Key fields: + # - ``user`` — baseline per-user isolation. ``get_user_id()`` returns + # ``None`` for guest/anonymous sessions, so this field alone is not + # sufficient for RLS safety. + # - ``rls`` — full RLS fingerprint via + # ``security_manager.get_rls_cache_key`` (the canonical helper used + # by viz.py and query_context_processor.py). Covers both regular + # RLS policy changes and guest-token RLS, so embedded sessions + # with different guest tokens never share cached values even when + # ``get_user_id()`` is ``None`` for both. + # - ``changed_on`` — auto-busts cached entries when the dataset's + # underlying SQL is edited. force = parse_boolean_string(request.args.get("force")) cache_key = ( "col_values:" @@ -149,6 +159,7 @@ def get_column_values( "limit": row_limit, "denorm": denormalize_column, "user": get_user_id(), + "rls": security_manager.get_rls_cache_key(datasource), "changed_on": str(getattr(datasource, "changed_on", "")), }, sort_keys=True, diff --git a/tests/integration_tests/datasource/api_tests.py b/tests/integration_tests/datasource/api_tests.py index b241cfd92c78..c6352bf41b48 100644 --- a/tests/integration_tests/datasource/api_tests.py +++ b/tests/integration_tests/datasource/api_tests.py @@ -305,6 +305,29 @@ def test_get_column_values_cache_busts_on_changed_on(self, values_for_column_moc assert values_for_column_mock.call_count == 2 + @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.datasource.api.security_manager.get_rls_cache_key") + @patch("superset.models.helpers.ExploreMixin.values_for_column") + def test_get_column_values_cache_isolated_per_rls_context( + self, values_for_column_mock, get_rls_cache_key_mock + ): + """RLS safety for guest/embedded sessions. ``get_user_id()`` returns + ``None`` for guest users, so two embedded dashboards with different + guest-token RLS would otherwise collide on ``user=None``. Including + the RLS fingerprint in the cache key keeps them separate.""" + cache_manager.data_cache.clear() + values_for_column_mock.return_value = ["v"] + self.login(ADMIN_USERNAME) + table = self.get_virtual_dataset() + url = f"api/v1/datasource/table/{table.id}/column/col2/values/" + + get_rls_cache_key_mock.return_value = ["dept='A'"] + self.client.get(url) + get_rls_cache_key_mock.return_value = ["dept='B'"] + self.client.get(url) + + assert values_for_column_mock.call_count == 2 + @pytest.mark.usefixtures("app_context", "virtual_dataset") @patch("superset.models.helpers.ExploreMixin.values_for_column") def test_get_column_values_response_advertises_cache_status( From 09317ef9ab1e7de0268258c56611ba51134ea5fb Mon Sep 17 00:00:00 2001 From: AbdulRehman Date: Mon, 8 Jun 2026 11:46:48 +0500 Subject: [PATCH 3/4] test(filters): mock get_user_id directly for per-user cache isolation test --- .../integration_tests/datasource/api_tests.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/integration_tests/datasource/api_tests.py b/tests/integration_tests/datasource/api_tests.py index c6352bf41b48..744638804698 100644 --- a/tests/integration_tests/datasource/api_tests.py +++ b/tests/integration_tests/datasource/api_tests.py @@ -261,25 +261,25 @@ def test_get_column_values_cache_isolated_per_column(self, values_for_column_moc assert values_for_column_mock.call_count == 2 @pytest.mark.usefixtures("app_context", "virtual_dataset") + @patch("superset.datasource.api.get_user_id") @patch("superset.models.helpers.ExploreMixin.values_for_column") - def test_get_column_values_cache_isolated_per_user(self, values_for_column_mock): - """RLS safety: two users hitting the same column must each populate - their own cache entry. Without this isolation, user A's - RLS-restricted values could leak to user B.""" + def test_get_column_values_cache_isolated_per_user( + self, values_for_column_mock, get_user_id_mock + ): + """Per-user cache isolation: two distinct users hitting the same + column must each populate their own cache entry. Patching + ``get_user_id`` directly avoids the login-flow side effects (which + would also trip the datasource-access check before the cache code + is reached).""" cache_manager.data_cache.clear() values_for_column_mock.return_value = ["v"] + self.login(ADMIN_USERNAME) table = self.get_virtual_dataset() url = f"api/v1/datasource/table/{table.id}/column/col2/values/" - self.login(ADMIN_USERNAME) + get_user_id_mock.return_value = 1 self.client.get(url) - self.logout() - self.login(GAMMA_USERNAME) - # Grant gamma the permission so the request reaches the cache path. - perm = security_manager.find_permission_view_menu( - "can_get_column_values", "Datasource" - ) - security_manager.add_permission_role(security_manager.find_role("Gamma"), perm) + get_user_id_mock.return_value = 2 self.client.get(url) assert values_for_column_mock.call_count == 2 From 3111a4e3e25b68322c0a58a056410fbcff03558d Mon Sep 17 00:00:00 2001 From: AbdulRehman Date: Wed, 10 Jun 2026 13:12:27 +0500 Subject: [PATCH 4/4] perf(filters): address review (drop user_id, RLS-only key, setUp cache clear) --- superset/datasource/api.py | 21 ++++++------ .../integration_tests/datasource/api_tests.py | 32 +++++-------------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/superset/datasource/api.py b/superset/datasource/api.py index d8d462f01b91..69ad1e341810 100644 --- a/superset/datasource/api.py +++ b/superset/datasource/api.py @@ -34,7 +34,6 @@ from superset.utils.core import ( apply_max_row_limit, DatasourceType, - get_user_id, parse_boolean_string, SqlExpressionType, ) @@ -137,17 +136,22 @@ def get_column_values( # the wrapping query per filter (#39342). # # Key fields: - # - ``user`` — baseline per-user isolation. ``get_user_id()`` returns - # ``None`` for guest/anonymous sessions, so this field alone is not - # sufficient for RLS safety. # - ``rls`` — full RLS fingerprint via # ``security_manager.get_rls_cache_key`` (the canonical helper used - # by viz.py and query_context_processor.py). Covers both regular - # RLS policy changes and guest-token RLS, so embedded sessions - # with different guest tokens never share cached values even when - # ``get_user_id()`` is ``None`` for both. + # by viz.py and query_context_processor.py). This is the sole + # security-isolation field — two users with identical effective + # RLS share a cache entry (intentional: they would see identical + # filtered values anyway), while users with different RLS, guest + # sessions with different guest-token RLS, and anonymous sessions + # with no RLS each get their own partition. We deliberately do + # NOT include the raw user id; doing so would defeat the + # intended cross-user cache sharing without adding any real + # security boundary beyond what the RLS fingerprint already + # provides. # - ``changed_on`` — auto-busts cached entries when the dataset's # underlying SQL is edited. + # - ``uid`` / ``col`` / ``limit`` / ``denorm`` — basic query-shape + # isolation so different inputs never collide. force = parse_boolean_string(request.args.get("force")) cache_key = ( "col_values:" @@ -158,7 +162,6 @@ def get_column_values( "col": column_name, "limit": row_limit, "denorm": denormalize_column, - "user": get_user_id(), "rls": security_manager.get_rls_cache_key(datasource), "changed_on": str(getattr(datasource, "changed_on", "")), }, diff --git a/tests/integration_tests/datasource/api_tests.py b/tests/integration_tests/datasource/api_tests.py index 744638804698..c767cbfaca2e 100644 --- a/tests/integration_tests/datasource/api_tests.py +++ b/tests/integration_tests/datasource/api_tests.py @@ -31,6 +31,14 @@ class TestDatasourceApi(SupersetTestCase): + def setUp(self): + # Clear the column-values cache before every test so that + # ``get_column_values`` always re-runs ``values_for_column`` rather + # than returning a payload populated by a previous test. Prevents + # order-dependent flakes now that the endpoint caches its result. + super().setUp() + cache_manager.data_cache.clear() + def get_virtual_dataset(self): return ( db.session.query(SqlaTable) @@ -260,30 +268,6 @@ def test_get_column_values_cache_isolated_per_column(self, values_for_column_moc assert values_for_column_mock.call_count == 2 - @pytest.mark.usefixtures("app_context", "virtual_dataset") - @patch("superset.datasource.api.get_user_id") - @patch("superset.models.helpers.ExploreMixin.values_for_column") - def test_get_column_values_cache_isolated_per_user( - self, values_for_column_mock, get_user_id_mock - ): - """Per-user cache isolation: two distinct users hitting the same - column must each populate their own cache entry. Patching - ``get_user_id`` directly avoids the login-flow side effects (which - would also trip the datasource-access check before the cache code - is reached).""" - cache_manager.data_cache.clear() - values_for_column_mock.return_value = ["v"] - self.login(ADMIN_USERNAME) - table = self.get_virtual_dataset() - url = f"api/v1/datasource/table/{table.id}/column/col2/values/" - - get_user_id_mock.return_value = 1 - self.client.get(url) - get_user_id_mock.return_value = 2 - self.client.get(url) - - assert values_for_column_mock.call_count == 2 - @pytest.mark.usefixtures("app_context", "virtual_dataset") @patch("superset.models.helpers.ExploreMixin.values_for_column") def test_get_column_values_cache_busts_on_changed_on(self, values_for_column_mock):