This is an automated email from the ASF dual-hosted git repository. aminghadersohi pushed a commit to branch mcp-rls-plugins-99978 in repository https://gitbox.apache.org/repos/asf/superset.git
commit e14a92f881d178ff8b939299816d5ec2dd238083 Author: Beto Dealmeida <[email protected]> AuthorDate: Thu May 21 09:25:27 2026 -0400 fix(semantic layers): coerce filter types (#40222) --- superset/semantic_layers/mapper.py | 192 ++++++++++-- tests/unit_tests/semantic_layers/mapper_test.py | 372 +++++++++++++++++++++++- 2 files changed, 543 insertions(+), 21 deletions(-) diff --git a/superset/semantic_layers/mapper.py b/superset/semantic_layers/mapper.py index 23dea06124d..b71dfeb44db 100644 --- a/superset/semantic_layers/mapper.py +++ b/superset/semantic_layers/mapper.py @@ -24,9 +24,10 @@ single dataframe. """ -from datetime import datetime, timedelta -from time import time +from datetime import date, datetime, time, timedelta, tzinfo +from time import time as current_time from typing import Any, cast, Sequence, TypeGuard +from zoneinfo import ZoneInfo import isodate import numpy as np @@ -103,7 +104,7 @@ def get_results(query_object: QueryObject) -> QueryResult: raise ValueError("QueryObject must have a datasource defined.") # Track execution time - start_time = time() + start_time = current_time() semantic_view = query_object.datasource.implementation dispatcher = ( @@ -127,7 +128,7 @@ def get_results(query_object: QueryObject) -> QueryResult: # If no time offsets, return the main result as-is if not query_object.time_offsets or len(queries) <= 1: - duration = timedelta(seconds=time() - start_time) + duration = timedelta(seconds=current_time() - start_time) return map_semantic_result_to_query_result( main_result, query_object, @@ -197,7 +198,7 @@ def get_results(query_object: QueryObject) -> QueryResult: requests=all_requests, results=pa.Table.from_pandas(main_df), ) - duration = timedelta(seconds=time() - start_time) + duration = timedelta(seconds=current_time() - start_time) return map_semantic_result_to_query_result( semantic_result, query_object, @@ -541,21 +542,29 @@ def _convert_query_object_filter( if operator_str == FilterOperator.TEMPORAL_RANGE.value: if not isinstance(value, str) or value == NO_TIME_RANGE: return None - start, end = value.split(" : ") - return { - Filter( - type=PredicateType.WHERE, - column=dimension, - operator=Operator.GREATER_THAN_OR_EQUAL, - value=start, - ), - Filter( - type=PredicateType.WHERE, - column=dimension, - operator=Operator.LESS_THAN, - value=end, - ), - } + start, end = (side.strip() for side in value.split(" : ")) + filters: set[Filter] = set() + if start: + filters.add( + Filter( + type=PredicateType.WHERE, + column=dimension, + operator=Operator.GREATER_THAN_OR_EQUAL, + value=_coerce_scalar_filter_value(start, dimension), + ) + ) + if end: + filters.add( + Filter( + type=PredicateType.WHERE, + column=dimension, + operator=Operator.LESS_THAN, + value=_coerce_scalar_filter_value(end, dimension), + ) + ) + return filters or None + + value = _coerce_filter_value(value, dimension) # Map QueryObject operators to semantic layer operators operator_mapping = { @@ -588,6 +597,149 @@ def _convert_query_object_filter( } +def _coerce_filter_value( + value: FilterValues | frozenset[FilterValues], + dimension: Dimension, +) -> FilterValues | frozenset[FilterValues]: + if isinstance(value, frozenset): + return frozenset(_coerce_scalar_filter_value(v, dimension) for v in value) + return _coerce_scalar_filter_value(value, dimension) + + +def _timestamp_target_tz(dtype: pa.DataType) -> tzinfo | None: + tz_name = getattr(dtype, "tz", None) + return ZoneInfo(tz_name) if tz_name else None + + +def _align_tz(dt: datetime, target_tz: tzinfo | None) -> datetime: + if target_tz is None: + return dt + if dt.tzinfo is None: + return dt.replace(tzinfo=target_tz) + return dt.astimezone(target_tz) + + +def _coerce_scalar_filter_value( # noqa: C901 — type dispatch, complexity is inherent + value: FilterValues, dimension: Dimension +) -> FilterValues: + if value is None: + return None + + dtype = dimension.type + + if pa.types.is_boolean(dtype): + if isinstance(value, bool): + return value + if isinstance(value, (int, float)) and value in (0, 1): + return bool(value) + if isinstance(value, str): + parsed = value.strip().lower() + if parsed in {"true", "t", "1", "yes", "y", "on"}: + return True + if parsed in {"false", "f", "0", "no", "n", "off"}: + return False + raise ValueError( + f"Invalid boolean value {value!r} for filter column {dimension.name}" + ) + + if pa.types.is_integer(dtype): + if isinstance(value, bool): + raise ValueError( + f"Invalid integer value {value!r} for filter column {dimension.name}" + ) + if isinstance(value, int): + return value + if isinstance(value, float) and value.is_integer(): + return int(value) + if isinstance(value, str): + try: + return int(value.strip()) + except ValueError as ex: + raise ValueError( + f"Invalid integer value {value!r} for filter column " + f"{dimension.name}" + ) from ex + raise ValueError( + f"Invalid integer value {value!r} for filter column {dimension.name}" + ) + + if pa.types.is_floating(dtype) or pa.types.is_decimal(dtype): + # Decimal dimensions are coerced through ``float`` because ``FilterValues`` + # does not include ``Decimal``. That is lossless for the common case + # (≤ ~15 significant digits) and matches how downstream semantic-view + # implementations consume numeric filters; high-precision decimals would + # need a wider ``FilterValues`` union and propagation through the cache's + # comparability checks. + if isinstance(value, bool): + raise ValueError( + f"Invalid numeric value {value!r} for filter column {dimension.name}" + ) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value.strip()) + except ValueError as ex: + raise ValueError( + f"Invalid numeric value {value!r} for filter column " + f"{dimension.name}" + ) from ex + raise ValueError( + f"Invalid numeric value {value!r} for filter column {dimension.name}" + ) + + if pa.types.is_date(dtype): + if isinstance(value, datetime): + return value.date() + if isinstance(value, date): + return value + if isinstance(value, str): + try: + return datetime.fromisoformat(value.strip()).date() + except ValueError as ex: + raise ValueError( + f"Invalid date value {value!r} for filter column {dimension.name}" + ) from ex + raise ValueError( + f"Invalid date value {value!r} for filter column {dimension.name}" + ) + + if pa.types.is_timestamp(dtype): + target_tz = _timestamp_target_tz(dtype) + if isinstance(value, datetime): + return _align_tz(value, target_tz) + if isinstance(value, date): + return _align_tz(datetime.combine(value, time.min), target_tz) + if isinstance(value, str): + normalized = value.strip().replace("Z", "+00:00") + try: + return _align_tz(datetime.fromisoformat(normalized), target_tz) + except ValueError as ex: + raise ValueError( + f"Invalid timestamp value {value!r} for filter column " + f"{dimension.name}" + ) from ex + raise ValueError( + f"Invalid timestamp value {value!r} for filter column {dimension.name}" + ) + + if pa.types.is_time(dtype): + if isinstance(value, time): + return value + if isinstance(value, str): + try: + return time.fromisoformat(value.strip()) + except ValueError as ex: + raise ValueError( + f"Invalid time value {value!r} for filter column {dimension.name}" + ) from ex + raise ValueError( + f"Invalid time value {value!r} for filter column {dimension.name}" + ) + + return value + + def _get_order_from_query_object( query_object: ValidatedQueryObject, all_metrics: dict[str, Metric], diff --git a/tests/unit_tests/semantic_layers/mapper_test.py b/tests/unit_tests/semantic_layers/mapper_test.py index 7abdce91a6e..0cbb59c8430 100644 --- a/tests/unit_tests/semantic_layers/mapper_test.py +++ b/tests/unit_tests/semantic_layers/mapper_test.py @@ -15,8 +15,10 @@ # specific language governing permissions and limitations # under the License. -from datetime import datetime +from datetime import date, datetime, time, timezone +from typing import Any from unittest.mock import MagicMock +from zoneinfo import ZoneInfo import pandas as pd import pyarrow as pa @@ -40,6 +42,7 @@ from superset_core.semantic_layers.types import ( from superset_core.semantic_layers.view import SemanticViewFeature from superset.semantic_layers.mapper import ( + _coerce_scalar_filter_value, _convert_query_object_filter, _convert_time_grain, _get_filters_from_extras, @@ -1204,6 +1207,91 @@ def test_convert_query_object_filter_like() -> None: } +def test_convert_query_object_filter_coerces_integer_string_value() -> None: + """Test scalar filter values are coerced to dimension type.""" + all_dimensions = { + "birthyear": Dimension( + "birthyear", + "birthyear", + pa.int64(), + "birthyear", + "Birthyear", + ) + } + + filter_: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.GREATER_THAN_OR_EQUALS.value, + "col": "birthyear", + "val": "1982", + } + + result = _convert_query_object_filter(filter_, all_dimensions) + + assert result == { + Filter( + type=PredicateType.WHERE, + column=all_dimensions["birthyear"], + operator=Operator.GREATER_THAN_OR_EQUAL, + value=1982, + ) + } + + +def test_convert_query_object_filter_coerces_in_integer_values() -> None: + """Test IN filter list values are coerced element-wise.""" + all_dimensions = { + "order_id__amount": Dimension( + "order_id__amount", + "order_id__amount", + pa.int64(), + "order_id__amount", + "Order amount", + ) + } + + filter_: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.IN.value, + "col": "order_id__amount", + "val": ["58", "61"], + } + + result = _convert_query_object_filter(filter_, all_dimensions) + + assert result == { + Filter( + type=PredicateType.WHERE, + column=all_dimensions["order_id__amount"], + operator=Operator.IN, + value=frozenset({58, 61}), + ) + } + + +def test_convert_query_object_filter_invalid_integer_value_raises() -> None: + """Test invalid integer value raises a clear error.""" + all_dimensions = { + "birthyear": Dimension( + "birthyear", + "birthyear", + pa.int64(), + "birthyear", + "Birthyear", + ) + } + + filter_: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.GREATER_THAN_OR_EQUALS.value, + "col": "birthyear", + "val": "nineteen-eighty-two", + } + + with pytest.raises( + ValueError, + match="Invalid integer value 'nineteen-eighty-two' for filter column birthyear", + ): + _convert_query_object_filter(filter_, all_dimensions) + + def test_get_results_without_time_offsets( mock_datasource: MagicMock, mocker: MockerFixture, @@ -1923,6 +2011,86 @@ def test_convert_query_object_filter_temporal_range_with_value() -> None: } +def test_convert_query_object_filter_temporal_range_coerces_date_bounds() -> None: + """ + TEMPORAL_RANGE bounds should be coerced against the dimension's dtype so + date/timestamp columns are not compared against raw strings. + """ + all_dimensions = { + "order_date": Dimension( + "order_date", "order_date", pa.date32(), "order_date", "Order date" + ) + } + filter_: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.TEMPORAL_RANGE.value, + "col": "order_date", + "val": "2025-01-01 : 2025-12-31", + } + + result = _convert_query_object_filter(filter_, all_dimensions) + + assert result == { + Filter( + type=PredicateType.WHERE, + column=all_dimensions["order_date"], + operator=Operator.GREATER_THAN_OR_EQUAL, + value=date(2025, 1, 1), + ), + Filter( + type=PredicateType.WHERE, + column=all_dimensions["order_date"], + operator=Operator.LESS_THAN, + value=date(2025, 12, 31), + ), + } + + +def test_convert_query_object_filter_temporal_range_open_ended() -> None: + """ + Open-ended TEMPORAL_RANGE bounds should emit only the bounded predicate. + """ + all_dimensions = { + "order_date": Dimension( + "order_date", "order_date", pa.date32(), "order_date", "Order date" + ) + } + + only_start: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.TEMPORAL_RANGE.value, + "col": "order_date", + "val": "2025-01-01 : ", + } + assert _convert_query_object_filter(only_start, all_dimensions) == { + Filter( + type=PredicateType.WHERE, + column=all_dimensions["order_date"], + operator=Operator.GREATER_THAN_OR_EQUAL, + value=date(2025, 1, 1), + ), + } + + only_end: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.TEMPORAL_RANGE.value, + "col": "order_date", + "val": " : 2025-12-31", + } + assert _convert_query_object_filter(only_end, all_dimensions) == { + Filter( + type=PredicateType.WHERE, + column=all_dimensions["order_date"], + operator=Operator.LESS_THAN, + value=date(2025, 12, 31), + ), + } + + empty: ValidatedQueryObjectFilterClause = { + "op": FilterOperator.TEMPORAL_RANGE.value, + "col": "order_date", + "val": " : ", + } + assert _convert_query_object_filter(empty, all_dimensions) is None + + def test_get_order_adhoc_with_none_sql_expression(mock_datasource: MagicMock) -> None: """ Test order extraction skips adhoc expression with None sqlExpression. @@ -2741,3 +2909,205 @@ def test_get_group_limit_filters_no_granularity( # Should return None - no granularity means no time filters added assert result is None + + +# --------------------------------------------------------------------------- +# _coerce_scalar_filter_value: per-dtype branches +# --------------------------------------------------------------------------- + + +def _dim(dtype: pa.DataType, name: str = "d") -> Dimension: + return Dimension(name, name, dtype, name, name.capitalize()) + + +def test_coerce_none_returns_none() -> None: + assert _coerce_scalar_filter_value(None, _dim(pa.int64())) is None + + +def test_coerce_unsupported_dtype_passes_through() -> None: + # utf8 (and any dtype not branched in the function) returns the value as-is. + assert _coerce_scalar_filter_value("abc", _dim(pa.utf8())) == "abc" + + [email protected]( + "raw,expected", + [ + (True, True), + (False, False), + (1, True), + (0, False), + (1.0, True), + (0.0, False), + ("true", True), + ("T", True), + (" 1 ", True), + ("yes", True), + ("Y", True), + ("on", True), + ("false", False), + ("F", False), + ("0", False), + ("no", False), + ("N", False), + ("off", False), + ], +) +def test_coerce_boolean(raw: Any, expected: bool) -> None: + assert _coerce_scalar_filter_value(raw, _dim(pa.bool_())) is expected + + [email protected]("raw", ["maybe", 2, 0.5, -1]) +def test_coerce_boolean_invalid_raises(raw: Any) -> None: + with pytest.raises(ValueError, match="Invalid boolean value"): + _coerce_scalar_filter_value(raw, _dim(pa.bool_())) + + +def test_coerce_integer_passthrough() -> None: + assert _coerce_scalar_filter_value(42, _dim(pa.int64())) == 42 + + +def test_coerce_integer_accepts_integer_valued_float() -> None: + # JSON round-trips can turn an int into ``42.0``; accept losslessly. + assert _coerce_scalar_filter_value(42.0, _dim(pa.int64())) == 42 + + +def test_coerce_integer_rejects_bool() -> None: + # bool is a subclass of int; we explicitly reject it. + with pytest.raises(ValueError, match="Invalid integer value"): + _coerce_scalar_filter_value(True, _dim(pa.int64())) + + +def test_coerce_integer_rejects_non_integer_float() -> None: + with pytest.raises(ValueError, match="Invalid integer value"): + _coerce_scalar_filter_value(1.5, _dim(pa.int64())) + + +def test_coerce_integer_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Invalid integer value"): + _coerce_scalar_filter_value([1], _dim(pa.int64())) + + [email protected]( + "dtype", + [pa.float64(), pa.decimal128(10, 2)], +) +def test_coerce_floating_or_decimal(dtype: pa.DataType) -> None: + assert _coerce_scalar_filter_value(1, _dim(dtype)) == 1.0 + assert _coerce_scalar_filter_value(1.5, _dim(dtype)) == 1.5 + assert _coerce_scalar_filter_value(" 2.5 ", _dim(dtype)) == 2.5 + + +def test_coerce_floating_rejects_bool() -> None: + with pytest.raises(ValueError, match="Invalid numeric value"): + _coerce_scalar_filter_value(True, _dim(pa.float64())) + + +def test_coerce_floating_invalid_string_raises() -> None: + with pytest.raises(ValueError, match="Invalid numeric value"): + _coerce_scalar_filter_value("not-a-number", _dim(pa.float64())) + + +def test_coerce_floating_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Invalid numeric value"): + _coerce_scalar_filter_value([1.0], _dim(pa.float64())) + + +def test_coerce_date_from_datetime() -> None: + out = _coerce_scalar_filter_value(datetime(2025, 1, 2, 12, 0), _dim(pa.date32())) + assert out == date(2025, 1, 2) + + +def test_coerce_date_passthrough() -> None: + out = _coerce_scalar_filter_value(date(2025, 1, 2), _dim(pa.date32())) + assert out == date(2025, 1, 2) + + +def test_coerce_date_from_iso_string() -> None: + out = _coerce_scalar_filter_value(" 2025-01-02 ", _dim(pa.date32())) + assert out == date(2025, 1, 2) + + +def test_coerce_date_invalid_string_raises() -> None: + with pytest.raises(ValueError, match="Invalid date value"): + _coerce_scalar_filter_value("not-a-date", _dim(pa.date32())) + + +def test_coerce_date_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Invalid date value"): + _coerce_scalar_filter_value(20250102, _dim(pa.date32())) + + +def test_coerce_timestamp_from_datetime_passthrough() -> None: + dt = datetime(2025, 1, 2, 3, 4, 5) + # Naive dtype: returned as-is, still naive. + assert _coerce_scalar_filter_value(dt, _dim(pa.timestamp("us"))) == dt + + +def test_coerce_timestamp_from_date() -> None: + out = _coerce_scalar_filter_value(date(2025, 1, 2), _dim(pa.timestamp("us"))) + assert out == datetime(2025, 1, 2, 0, 0) + + +def test_coerce_timestamp_from_iso_string_with_z() -> None: + out = _coerce_scalar_filter_value("2025-01-02T03:04:05Z", _dim(pa.timestamp("us"))) + assert out == datetime.fromisoformat("2025-01-02T03:04:05+00:00") + + +def test_coerce_timestamp_invalid_string_raises() -> None: + with pytest.raises(ValueError, match="Invalid timestamp value"): + _coerce_scalar_filter_value("not-a-ts", _dim(pa.timestamp("us"))) + + +def test_coerce_timestamp_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Invalid timestamp value"): + _coerce_scalar_filter_value(1234567890, _dim(pa.timestamp("us"))) + + +def test_coerce_timestamp_tz_aware_dtype_attaches_tz_to_naive_datetime() -> None: + dt = datetime(2025, 1, 2, 3, 4, 5) + out = _coerce_scalar_filter_value(dt, _dim(pa.timestamp("us", tz="UTC"))) + assert out == datetime(2025, 1, 2, 3, 4, 5, tzinfo=ZoneInfo("UTC")) + + +def test_coerce_timestamp_tz_aware_dtype_converts_aware_datetime() -> None: + dt = datetime(2025, 1, 2, 12, 0, tzinfo=timezone.utc) + out = _coerce_scalar_filter_value( + dt, _dim(pa.timestamp("us", tz="America/New_York")) + ) + # 12:00 UTC == 07:00 in New York + assert out == datetime(2025, 1, 2, 7, 0, tzinfo=ZoneInfo("America/New_York")) + + +def test_coerce_timestamp_tz_aware_dtype_attaches_tz_to_date() -> None: + out = _coerce_scalar_filter_value( + date(2025, 1, 2), _dim(pa.timestamp("us", tz="UTC")) + ) + assert out == datetime(2025, 1, 2, 0, 0, tzinfo=ZoneInfo("UTC")) + + +def test_coerce_timestamp_tz_aware_dtype_parses_string_with_tz() -> None: + out = _coerce_scalar_filter_value( + "2025-01-02T03:04:05", _dim(pa.timestamp("us", tz="UTC")) + ) + # Naive string gets UTC attached. + assert out == datetime(2025, 1, 2, 3, 4, 5, tzinfo=ZoneInfo("UTC")) + + +def test_coerce_time_passthrough() -> None: + out = _coerce_scalar_filter_value(time(3, 4, 5), _dim(pa.time64("us"))) + assert out == time(3, 4, 5) + + +def test_coerce_time_from_iso_string() -> None: + out = _coerce_scalar_filter_value(" 03:04:05 ", _dim(pa.time64("us"))) + assert out == time(3, 4, 5) + + +def test_coerce_time_invalid_string_raises() -> None: + with pytest.raises(ValueError, match="Invalid time value"): + _coerce_scalar_filter_value("not-a-time", _dim(pa.time64("us"))) + + +def test_coerce_time_rejects_other_types() -> None: + with pytest.raises(ValueError, match="Invalid time value"): + _coerce_scalar_filter_value(123, _dim(pa.time64("us")))
