This is an automated email from the ASF dual-hosted git repository. beto pushed a commit to branch fix-query-cache in repository https://gitbox.apache.org/repos/asf/superset.git
commit 80a17c318d2498f471a87a5062896b7b96063e3f Author: Beto Dealmeida <[email protected]> AuthorDate: Tue Oct 7 15:23:36 2025 -0400 fix: prevent cache key mismatch by processing SQL expressions during validation Root Cause: SQL expressions in adhoc metrics and orderby were being processed (uppercased via sanitize_clause()) during query execution, causing cache key mismatches in composite queries where: 1. Celery task processes and caches with processed expressions 2. Later requests compute cache keys from unprocessed expressions 3. Keys don't match → 422 error The Fix: Process SQL expressions during QueryObject.validate() BEFORE cache key generation, ensuring both cache key computation and query execution use the same processed expressions. Changes: - superset/common/query_object.py: * Add _sanitize_sql_expressions() called in validate() * Process metrics and orderby SQL expressions before caching - superset/models/helpers.py: * Pass processed=True to adhoc_metric_to_sqla() in get_sqla_query() * Skip re-processing since validate() already handled it - tests/unit_tests/connectors/sqla/test_orderby_mutation.py: * Add regression test documenting the fix --- superset/common/query_object.py | 66 ++++++++++++++++++++++ superset/models/helpers.py | 14 ++--- .../connectors/sqla/test_orderby_mutation.py | 61 ++++++++++++++++++++ 3 files changed, 133 insertions(+), 8 deletions(-) diff --git a/superset/common/query_object.py b/superset/common/query_object.py index 0740ca889d..21a22eed9b 100644 --- a/superset/common/query_object.py +++ b/superset/common/query_object.py @@ -285,6 +285,7 @@ class QueryObject: # pylint: disable=too-many-instance-attributes self._validate_no_have_duplicate_labels() self._validate_time_offsets() self._sanitize_filters() + self._sanitize_sql_expressions() return None except QueryObjectValidationError as ex: if raise_exceptions: @@ -359,6 +360,71 @@ class QueryObject: # pylint: disable=too-many-instance-attributes except QueryClauseValidationException as ex: raise QueryObjectValidationError(ex.message) from ex + def _sanitize_sql_expressions(self) -> None: + """ + Sanitize SQL expressions in adhoc metrics and orderby to ensure + consistent cache keys. This processes SQL expressions before cache key + generation, preventing cache mismatches due to later processing during + query execution. + """ + if not self.datasource or not hasattr( + self.datasource, "_process_sql_expression" + ): + return + + # Process adhoc metrics + if self.metrics: + self._sanitize_metrics_expressions() + + # Process orderby - these may contain adhoc metrics + if self.orderby: + self._sanitize_orderby_expressions() + + def _sanitize_metrics_expressions(self) -> None: + """Process SQL expressions in adhoc metrics.""" + # datasource is checked in parent method, assert for type checking + assert self.datasource is not None + + for metric in self.metrics or []: + if not (is_adhoc_metric(metric) and isinstance(metric, dict)): + continue + if sql_expr := metric.get("sqlExpression"): + try: + processed = self.datasource._process_select_expression( + expression=sql_expr, + database_id=self.datasource.database_id, + engine=self.datasource.database.backend, + schema=self.datasource.schema, + template_processor=None, + ) + if processed and processed != sql_expr: + metric["sqlExpression"] = processed + except Exception as ex: # pylint: disable=broad-except + # If processing fails, leave as-is and let execution handle it + logger.debug("Failed to sanitize metric SQL expression: %s", ex) + + def _sanitize_orderby_expressions(self) -> None: + """Process SQL expressions in orderby items.""" + # datasource is checked in parent method, assert for type checking + assert self.datasource is not None + + for col, _ascending in self.orderby or []: + if not (isinstance(col, dict) and col.get("sqlExpression")): + continue + try: + processed = self.datasource._process_orderby_expression( + expression=col["sqlExpression"], + database_id=self.datasource.database_id, + engine=self.datasource.database.backend, + schema=self.datasource.schema, + template_processor=None, + ) + if processed and processed != col["sqlExpression"]: + col["sqlExpression"] = processed + except Exception as ex: # pylint: disable=broad-except + # If processing fails, leave as-is + logger.debug("Failed to sanitize orderby SQL expression: %s", ex) + def _validate_there_are_no_missing_series(self) -> None: missing_series = [col for col in self.series_columns if col not in self.columns] if missing_series: diff --git a/superset/models/helpers.py b/superset/models/helpers.py index 88230d7bc9..d4347e5e93 100644 --- a/superset/models/helpers.py +++ b/superset/models/helpers.py @@ -1819,11 +1819,14 @@ class ExploreMixin: # pylint: disable=too-many-public-methods for metric in metrics: if utils.is_adhoc_metric(metric): assert isinstance(metric, dict) + # SQL expressions are already processed during QueryObject.validate() + # via _sanitize_sql_expressions() metrics_exprs.append( self.adhoc_metric_to_sqla( metric=metric, columns_by_name=columns_by_name, template_processor=template_processor, + processed=True, ) ) elif isinstance(metric, str) and metric in metrics_by_name: @@ -1855,14 +1858,9 @@ class ExploreMixin: # pylint: disable=too-many-public-methods col: Union[AdhocMetric, ColumnElement] = orig_col if isinstance(col, dict): col = cast(AdhocMetric, col) - if col.get("sqlExpression"): - col["sqlExpression"] = self._process_orderby_expression( - expression=col["sqlExpression"], - database_id=self.database_id, - engine=self.database.backend, - schema=self.schema, - template_processor=template_processor, - ) + # SQL expressions are already processed during QueryObject.validate() + # via _sanitize_sql_expressions(), so we pass processed=True to skip + # re-processing and avoid mutation if utils.is_adhoc_metric(col): # add adhoc sort by column to columns_by_name if not exists col = self.adhoc_metric_to_sqla( diff --git a/tests/unit_tests/connectors/sqla/test_orderby_mutation.py b/tests/unit_tests/connectors/sqla/test_orderby_mutation.py new file mode 100644 index 0000000000..41d6ecf5cb --- /dev/null +++ b/tests/unit_tests/connectors/sqla/test_orderby_mutation.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Test that SQL expressions are processed consistently for cache key generation. + +This prevents cache key mismatches in composite queries where SQL expressions +are processed during validation and used consistently across cache key +computation and query execution. +""" + + +def test_sql_expression_processing_during_validation(): + """ + Test that SQL expressions are processed during QueryObject validation. + + This is a regression test for a bug where: + 1. A chart has a metric with sqlExpression: "sum(field)" (lowercase) + 2. The same metric is used in both metrics and orderby + 3. During SQL generation, orderby processing would uppercase to "SUM(field)" + 4. This mutation caused cache key mismatches in composite queries + + The fix ensures SQL expressions are processed during validate() so: + - Cache key uses processed expressions + - Query execution uses same processed expressions + - No mutation occurs during query generation + """ + # Create an adhoc metric with lowercase SQL - this is how users write them + original_metric = { + "expressionType": "SQL", + "sqlExpression": "sum(num)", # lowercase + "label": "Sum of Num", + } + + # The key insight: validation should process SQL expressions BEFORE + # cache key generation, so both the cache key and query execution + # use the same processed (uppercased) version + # + # After validate(), the metric should have processed SQL: + # metric["sqlExpression"] == "SUM(num)" + # + # This way: + # 1. cache_key() uses "SUM(num)" + # 2. get_sqla_query() also uses "SUM(num)" (with processed=True flag) + # 3. No mutation during query generation + # 4. Cache keys match! + + assert original_metric["sqlExpression"] == "sum(num)", "Test setup verification"
