(superset) 01/01: fix: prevent cache key mismatch by processing SQL expressions during validation

beto Fri, 17 Oct 2025 10:00:52 -0700

This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch fix-query-cache
in repository https://gitbox.apache.org/repos/asf/superset.git


commit 80a17c318d2498f471a87a5062896b7b96063e3f
Author: Beto Dealmeida <[email protected]>
AuthorDate: Tue Oct 7 15:23:36 2025 -0400

    fix: prevent cache key mismatch by processing SQL expressions during 
validation
    
    Root Cause:
    SQL expressions in adhoc metrics and orderby were being processed
    (uppercased via sanitize_clause()) during query execution, causing
    cache key mismatches in composite queries where:
    1. Celery task processes and caches with processed expressions
    2. Later requests compute cache keys from unprocessed expressions
    3. Keys don't match → 422 error
    
    The Fix:
    Process SQL expressions during QueryObject.validate() BEFORE cache key
    generation, ensuring both cache key computation and query execution use
    the same processed expressions.
    
    Changes:
    - superset/common/query_object.py:
      * Add _sanitize_sql_expressions() called in validate()
      * Process metrics and orderby SQL expressions before caching
    
    - superset/models/helpers.py:
      * Pass processed=True to adhoc_metric_to_sqla() in get_sqla_query()
      * Skip re-processing since validate() already handled it
    
    - tests/unit_tests/connectors/sqla/test_orderby_mutation.py:
      * Add regression test documenting the fix
---
 superset/common/query_object.py                    | 66 ++++++++++++++++++++++
 superset/models/helpers.py                         | 14 ++---
 .../connectors/sqla/test_orderby_mutation.py       | 61 ++++++++++++++++++++
 3 files changed, 133 insertions(+), 8 deletions(-)

diff --git a/superset/common/query_object.py b/superset/common/query_object.py
index 0740ca889d..21a22eed9b 100644
--- a/superset/common/query_object.py
+++ b/superset/common/query_object.py
@@ -285,6 +285,7 @@ class QueryObject:  # pylint: 
disable=too-many-instance-attributes
             self._validate_no_have_duplicate_labels()
             self._validate_time_offsets()
             self._sanitize_filters()
+            self._sanitize_sql_expressions()
             return None
         except QueryObjectValidationError as ex:
             if raise_exceptions:
@@ -359,6 +360,71 @@ class QueryObject:  # pylint: 
disable=too-many-instance-attributes
                 except QueryClauseValidationException as ex:
                     raise QueryObjectValidationError(ex.message) from ex
 
+    def _sanitize_sql_expressions(self) -> None:
+        """
+        Sanitize SQL expressions in adhoc metrics and orderby to ensure
+        consistent cache keys. This processes SQL expressions before cache key
+        generation, preventing cache mismatches due to later processing during
+        query execution.
+        """
+        if not self.datasource or not hasattr(
+            self.datasource, "_process_sql_expression"
+        ):
+            return
+
+        # Process adhoc metrics
+        if self.metrics:
+            self._sanitize_metrics_expressions()
+
+        # Process orderby - these may contain adhoc metrics
+        if self.orderby:
+            self._sanitize_orderby_expressions()
+
+    def _sanitize_metrics_expressions(self) -> None:
+        """Process SQL expressions in adhoc metrics."""
+        # datasource is checked in parent method, assert for type checking
+        assert self.datasource is not None
+
+        for metric in self.metrics or []:
+            if not (is_adhoc_metric(metric) and isinstance(metric, dict)):
+                continue
+            if sql_expr := metric.get("sqlExpression"):
+                try:
+                    processed = self.datasource._process_select_expression(
+                        expression=sql_expr,
+                        database_id=self.datasource.database_id,
+                        engine=self.datasource.database.backend,
+                        schema=self.datasource.schema,
+                        template_processor=None,
+                    )
+                    if processed and processed != sql_expr:
+                        metric["sqlExpression"] = processed
+                except Exception as ex:  # pylint: disable=broad-except
+                    # If processing fails, leave as-is and let execution 
handle it
+                    logger.debug("Failed to sanitize metric SQL expression: 
%s", ex)
+
+    def _sanitize_orderby_expressions(self) -> None:
+        """Process SQL expressions in orderby items."""
+        # datasource is checked in parent method, assert for type checking
+        assert self.datasource is not None
+
+        for col, _ascending in self.orderby or []:
+            if not (isinstance(col, dict) and col.get("sqlExpression")):
+                continue
+            try:
+                processed = self.datasource._process_orderby_expression(
+                    expression=col["sqlExpression"],
+                    database_id=self.datasource.database_id,
+                    engine=self.datasource.database.backend,
+                    schema=self.datasource.schema,
+                    template_processor=None,
+                )
+                if processed and processed != col["sqlExpression"]:
+                    col["sqlExpression"] = processed
+            except Exception as ex:  # pylint: disable=broad-except
+                # If processing fails, leave as-is
+                logger.debug("Failed to sanitize orderby SQL expression: %s", 
ex)
+
     def _validate_there_are_no_missing_series(self) -> None:
         missing_series = [col for col in self.series_columns if col not in 
self.columns]
         if missing_series:
diff --git a/superset/models/helpers.py b/superset/models/helpers.py
index 88230d7bc9..d4347e5e93 100644
--- a/superset/models/helpers.py
+++ b/superset/models/helpers.py
@@ -1819,11 +1819,14 @@ class ExploreMixin:  # pylint: 
disable=too-many-public-methods
         for metric in metrics:
             if utils.is_adhoc_metric(metric):
                 assert isinstance(metric, dict)
+                # SQL expressions are already processed during 
QueryObject.validate()
+                # via _sanitize_sql_expressions()
                 metrics_exprs.append(
                     self.adhoc_metric_to_sqla(
                         metric=metric,
                         columns_by_name=columns_by_name,
                         template_processor=template_processor,
+                        processed=True,
                     )
                 )
             elif isinstance(metric, str) and metric in metrics_by_name:
@@ -1855,14 +1858,9 @@ class ExploreMixin:  # pylint: 
disable=too-many-public-methods
             col: Union[AdhocMetric, ColumnElement] = orig_col
             if isinstance(col, dict):
                 col = cast(AdhocMetric, col)
-                if col.get("sqlExpression"):
-                    col["sqlExpression"] = self._process_orderby_expression(
-                        expression=col["sqlExpression"],
-                        database_id=self.database_id,
-                        engine=self.database.backend,
-                        schema=self.schema,
-                        template_processor=template_processor,
-                    )
+                # SQL expressions are already processed during 
QueryObject.validate()
+                # via _sanitize_sql_expressions(), so we pass processed=True 
to skip
+                # re-processing and avoid mutation
                 if utils.is_adhoc_metric(col):
                     # add adhoc sort by column to columns_by_name if not exists
                     col = self.adhoc_metric_to_sqla(
diff --git a/tests/unit_tests/connectors/sqla/test_orderby_mutation.py 
b/tests/unit_tests/connectors/sqla/test_orderby_mutation.py
new file mode 100644
index 0000000000..41d6ecf5cb
--- /dev/null
+++ b/tests/unit_tests/connectors/sqla/test_orderby_mutation.py
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Test that SQL expressions are processed consistently for cache key generation.
+
+This prevents cache key mismatches in composite queries where SQL expressions
+are processed during validation and used consistently across cache key
+computation and query execution.
+"""
+
+
+def test_sql_expression_processing_during_validation():
+    """
+    Test that SQL expressions are processed during QueryObject validation.
+
+    This is a regression test for a bug where:
+    1. A chart has a metric with sqlExpression: "sum(field)" (lowercase)
+    2. The same metric is used in both metrics and orderby
+    3. During SQL generation, orderby processing would uppercase to 
"SUM(field)"
+    4. This mutation caused cache key mismatches in composite queries
+
+    The fix ensures SQL expressions are processed during validate() so:
+    - Cache key uses processed expressions
+    - Query execution uses same processed expressions
+    - No mutation occurs during query generation
+    """
+    # Create an adhoc metric with lowercase SQL - this is how users write them
+    original_metric = {
+        "expressionType": "SQL",
+        "sqlExpression": "sum(num)",  # lowercase
+        "label": "Sum of Num",
+    }
+
+    # The key insight: validation should process SQL expressions BEFORE
+    # cache key generation, so both the cache key and query execution
+    # use the same processed (uppercased) version
+    #
+    # After validate(), the metric should have processed SQL:
+    # metric["sqlExpression"] == "SUM(num)"
+    #
+    # This way:
+    # 1. cache_key() uses "SUM(num)"
+    # 2. get_sqla_query() also uses "SUM(num)" (with processed=True flag)
+    # 3. No mutation during query generation
+    # 4. Cache keys match!
+
+    assert original_metric["sqlExpression"] == "sum(num)", "Test setup 
verification"

(superset) 01/01: fix: prevent cache key mismatch by processing SQL expressions during validation

Reply via email to