Re: [PR] feat: semantic layer extension [superset]

via GitHub Mon, 23 Mar 2026 23:05:13 -0700


shivamgoel commented on code in PR #37815:
URL: https://github.com/apache/superset/pull/37815#discussion_r2979257160



##########
superset/semantic_layers/mapper.py:
##########
@@ -0,0 +1,912 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Functions for mapping `QueryObject` to semantic layers.
+
+These functions validate and convert a `QueryObject` into one or more 
`SemanticQuery`,
+which are then passed to semantic layer implementations for execution, 
returning a
+single dataframe.
+
+"""
+
+from datetime import datetime, timedelta
+from time import time
+from typing import Any, cast, Sequence, TypeGuard
+
+import isodate
+import numpy as np
+import pyarrow as pa
+from superset_core.semantic_layers.types import (
+    AdhocExpression,
+    Dimension,
+    Filter,
+    FilterValues,
+    Grain,
+    Grains,
+    GroupLimit,
+    Metric,
+    Operator,
+    OrderDirection,
+    OrderTuple,
+    PredicateType,
+    SemanticQuery,
+    SemanticResult,
+)
+from superset_core.semantic_layers.view import SemanticViewFeature
+
+from superset.common.db_query_status import QueryStatus
+from superset.common.query_object import QueryObject
+from superset.common.utils.time_range_utils import 
get_since_until_from_query_object
+from superset.connectors.sqla.models import BaseDatasource
+from superset.constants import NO_TIME_RANGE
+from superset.models.helpers import QueryResult
+from superset.superset_typing import AdhocColumn
+from superset.utils.core import (
+    FilterOperator,
+    QueryObjectFilterClause,
+    TIME_COMPARISON,
+)
+from superset.utils.date_parser import get_past_or_future
+
+
+class ValidatedQueryObjectFilterClause(QueryObjectFilterClause):
+    """
+    A validated QueryObject filter clause with a string column name.
+
+    The `col` in a `QueryObjectFilterClause` can be either a string (column 
name) or an
+    adhoc column, but we only support the former in semantic layers.
+    """
+
+    # overwrite to narrow type; mypy complains about more restrictive typed 
dicts,
+    # but the alternative would be to redefine the object
+    col: str  # type: ignore[misc]
+    op: str  # type: ignore[misc]
+
+
+class ValidatedQueryObject(QueryObject):
+    """
+    A query object that has a datasource defined.
+    """
+
+    datasource: BaseDatasource
+
+    # overwrite to narrow type; mypy complains about the assignment since the 
base type
+    # allows adhoc filters, but we only support validated filters here
+    filter: list[ValidatedQueryObjectFilterClause]  # type: ignore[assignment]
+    series_columns: Sequence[str]  # type: ignore[assignment]
+    series_limit_metric: str | None
+
+
+def get_results(query_object: QueryObject) -> QueryResult:
+    """
+    Run 1+ queries based on `QueryObject` and return the results.
+
+    :param query_object: The QueryObject containing query specifications
+    :return: QueryResult compatible with Superset's query interface
+    """
+    if not validate_query_object(query_object):
+        raise ValueError("QueryObject must have a datasource defined.")
+
+    # Track execution time
+    start_time = time()
+
+    semantic_view = query_object.datasource.implementation
+    dispatcher = (
+        semantic_view.get_row_count
+        if query_object.is_rowcount
+        else semantic_view.get_table
+    )
+
+    # Step 1: Convert QueryObject to list of SemanticQuery objects
+    # The first query is the main query, subsequent queries are for time 
offsets
+    queries = map_query_object(query_object)
+
+    # Step 2: Execute the main query (first in the list)
+    main_query = queries[0]
+    main_result = dispatcher(main_query)
+
+    main_df = main_result.results.to_pandas()
+
+    # Collect all requests (SQL queries, HTTP requests, etc.) for 
troubleshooting
+    all_requests = list(main_result.requests)
+
+    # If no time offsets, return the main result as-is
+    if not query_object.time_offsets or len(queries) <= 1:
+        duration = timedelta(seconds=time() - start_time)
+        return map_semantic_result_to_query_result(
+            main_result,
+            query_object,
+            duration,
+        )
+
+    # Get metric names from the main query
+    # These are the columns that will be renamed with offset suffixes
+    metric_names = [metric.name for metric in main_query.metrics]
+
+    # Join keys are all columns except metrics
+    # These will be used to match rows between main and offset DataFrames
+    join_keys = [col for col in main_df.columns if col not in metric_names]
+
+    # Step 3 & 4: Execute each time offset query and join results
+    for offset_query, time_offset in zip(
+        queries[1:],
+        query_object.time_offsets,
+        strict=False,
+    ):
+        # Execute the offset query
+        result = dispatcher(offset_query)
+
+        # Add this query's requests to the collection
+        all_requests.extend(result.requests)
+
+        offset_df = result.results.to_pandas()
+
+        # Handle empty results - add NaN columns directly instead of merging
+        # This avoids dtype mismatch issues with empty DataFrames
+        if offset_df.empty:
+            # Add offset metric columns with NaN values directly to main_df
+            for metric in metric_names:
+                offset_col_name = TIME_COMPARISON.join([metric, time_offset])
+                main_df[offset_col_name] = np.nan
+        else:
+            # Rename metric columns with time offset suffix
+            # Format: "{metric_name}__{time_offset}"
+            # Example: "revenue" -> "revenue__1 week ago"
+            offset_df = offset_df.rename(
+                columns={
+                    metric: TIME_COMPARISON.join([metric, time_offset])
+                    for metric in metric_names
+                }
+            )
+
+            # Step 5: Perform left join on dimension columns
+            # This preserves all rows from main_df and adds offset metrics
+            # where they match
+            main_df = main_df.merge(
+                offset_df,
+                on=join_keys,
+                how="left",
+                suffixes=("", "__duplicate"),
+            )
+
+            # Clean up any duplicate columns that might have been created
+            # (shouldn't happen with proper join keys, but defensive 
programming)
+            duplicate_cols = [
+                col for col in main_df.columns if col.endswith("__duplicate")
+            ]
+            if duplicate_cols:
+                main_df = main_df.drop(columns=duplicate_cols)
+
+    # Convert final result to QueryResult
+    semantic_result = SemanticResult(
+        requests=all_requests,
+        results=pa.Table.from_pandas(main_df),
+    )
+    duration = timedelta(seconds=time() - start_time)
+    return map_semantic_result_to_query_result(
+        semantic_result,
+        query_object,
+        duration,
+    )
+
+
+def map_semantic_result_to_query_result(
+    semantic_result: SemanticResult,
+    query_object: ValidatedQueryObject,
+    duration: timedelta,
+) -> QueryResult:
+    """
+    Convert a SemanticResult to a QueryResult.
+
+    :param semantic_result: Result from the semantic layer
+    :param query_object: Original QueryObject (for passthrough attributes)
+    :param duration: Time taken to execute the query
+    :return: QueryResult compatible with Superset's query interface
+    """
+    # Get the query string from requests (typically one or more SQL queries)
+    query_str = ""
+    if semantic_result.requests:
+        # Join all requests for display (could be multiple for time 
comparisons)
+        query_str = "\n\n".join(
+            f"-- {req.type}\n{req.definition}" for req in 
semantic_result.requests
+        )
+
+    return QueryResult(
+        # Core data
+        df=semantic_result.results.to_pandas(),
+        query=query_str,
+        duration=duration,
+        # Template filters - not applicable to semantic layers
+        # (semantic layers don't use Jinja templates)
+        applied_template_filters=None,
+        # Filter columns - not applicable to semantic layers
+        # (semantic layers handle filter validation internally)
+        applied_filter_columns=None,
+        rejected_filter_columns=None,
+        # Status - always success if we got here
+        # (errors would raise exceptions before reaching this point)
+        status=QueryStatus.SUCCESS,
+        error_message=None,
+        errors=None,
+        # Time range - pass through from original query_object
+        from_dttm=query_object.from_dttm,
+        to_dttm=query_object.to_dttm,
+    )
+
+
+def _normalize_column(column: str | AdhocColumn, dimension_names: set[str]) -> 
str:
+    """
+    Normalize a column to its dimension name.
+
+    Columns can be either:
+    - A string (dimension name directly)
+    - An AdhocColumn with isColumnReference=True and sqlExpression containing 
the
+      dimension name
+    """
+    if isinstance(column, str):
+        return column
+
+    # Handle column references (e.g., from time-series charts)
+    if column.get("isColumnReference") and (sql_expr := 
column.get("sqlExpression")):
+        if sql_expr in dimension_names:
+            return sql_expr
+
+    raise ValueError("Adhoc dimensions are not supported in Semantic Views.")
+
+
+def map_query_object(query_object: ValidatedQueryObject) -> 
list[SemanticQuery]:

Review Comment:
   The mapper does QueryObject -> SemanticQuery conversion. For SIP-199, I'll 
need the inverse: SemanticQuery -> QueryObject (so that a simplified headless 
payload can be executed against traditional SQL datasets via 
QueryObjectFactory).
   
   Would it make sense to add a map_semantic_query_to_query_object() function 
here (or in a sibling module) as a follow-up? This would let both datasets and 
semantic views be queried with the same simplified payload -- semantic views go
     forward through the mapper, datasets go backward through it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: semantic layer extension [superset]

Reply via email to