This is an automated email from the ASF dual-hosted git repository.
beto pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/master by this push:
new 99e69c32ee fix: coerce datetime conversion errors (#32683)
99e69c32ee is described below
commit 99e69c32eef2f0fcca009ae5a9cb9fedc5b38dc1
Author: Beto Dealmeida <[email protected]>
AuthorDate: Tue Mar 18 13:09:23 2025 -0400
fix: coerce datetime conversion errors (#32683)
---
superset/utils/core.py | 14 ++-
tests/integration_tests/utils_tests.py | 5 -
tests/unit_tests/utils/test_core.py | 193 +++++++++++++++++++++++++++++++++
3 files changed, 204 insertions(+), 8 deletions(-)
diff --git a/superset/utils/core.py b/superset/utils/core.py
index 69de1707ed..2b80c89f61 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1682,18 +1682,26 @@ def normalize_dttm_col(
utc=False,
unit=unit,
origin="unix",
- errors="raise",
+ errors="coerce",
exact=False,
)
else:
# Column has already been formatted as a timestamp.
- df[_col.col_label] = dttm_series.apply(pd.Timestamp)
+ try:
+ df[_col.col_label] = dttm_series.apply(
+ lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
+ )
+ except ValueError:
+ logger.warning(
+ "Unable to convert column %s to datetime, ignoring",
+ _col.col_label,
+ )
else:
df[_col.col_label] = pd.to_datetime(
df[_col.col_label],
utc=False,
format=_col.timestamp_format,
- errors="raise",
+ errors="coerce",
exact=False,
)
if _col.offset:
diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py
index aa39923152..2bab7cdee4 100644
--- a/tests/integration_tests/utils_tests.py
+++ b/tests/integration_tests/utils_tests.py
@@ -483,8 +483,3 @@ class TestUtils(SupersetTestCase):
# test numeric epoch_ms format
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
-
- # test that we raise an error when we can't convert
- df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
- with pytest.raises(pd.errors.OutOfBoundsDatetime):
- normalize_col(df, None, 0, None)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index aa51e52f6c..e629f00290 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -19,9 +19,11 @@ from dataclasses import dataclass
from typing import Any, Optional
from unittest.mock import MagicMock, patch
+import numpy as np
import pandas as pd
import pytest
from flask import current_app
+from pandas.api.types import is_datetime64_dtype
from pytest_mock import MockerFixture
from superset.exceptions import SupersetException
@@ -225,6 +227,197 @@ def test_normalize_dttm_col() -> None:
assert df["__time"].astype(str).tolist() == ["2017-07-01"]
+def test_normalize_dttm_col_epoch_seconds() -> None:
+ """Test conversion of epoch seconds."""
+ df = pd.DataFrame(
+ {
+ "epoch_col": [
+ 1577836800,
+ 1609459200,
+ 1640995200,
+ ] # 2020-01-01, 2021-01-01, 2022-01-01
+ }
+ )
+ dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["epoch_col"])
+ assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert df["epoch_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+ assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_epoch_milliseconds() -> None:
+ """Test conversion of epoch milliseconds."""
+ df = pd.DataFrame(
+ {
+ "epoch_ms_col": [
+ 1577836800000,
+ 1609459200000,
+ 1640995200000,
+ ] # 2020-01-01, 2021-01-01, 2022-01-01
+ }
+ )
+ dttm_cols = (DateColumn(col_label="epoch_ms_col", timestamp_format="epoch_ms"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["epoch_ms_col"])
+ assert df["epoch_ms_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert df["epoch_ms_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+ assert df["epoch_ms_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_formatted_date() -> None:
+ """Test conversion of formatted date strings."""
+ df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+ dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col"])
+ assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+ assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_with_offset() -> None:
+ """Test with hour offset."""
+ df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+ dttm_cols = (
+ DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d", offset=3),
+ )
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col"])
+ assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00"
+ assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 03:00:00"
+ assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 03:00:00"
+
+
+def test_normalize_dttm_col_with_time_shift() -> None:
+ """Test with time shift."""
+ df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+ dttm_cols = (
+ DateColumn(
+ col_label="date_col", timestamp_format="%Y-%m-%d", time_shift="1 day"
+ ),
+ )
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col"])
+ assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-02"
+ assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-02"
+ assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-02"
+
+
+def test_normalize_dttm_col_with_offset_and_time_shift() -> None:
+ """Test with both offset and time shift."""
+ df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+ dttm_cols = (
+ DateColumn(
+ col_label="date_col",
+ timestamp_format="%Y-%m-%d",
+ offset=3,
+ time_shift="1 hour",
+ ),
+ )
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col"])
+ assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 04:00:00"
+ assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 04:00:00"
+ assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00"
+
+
+def test_normalize_dttm_col_invalid_date_coerced() -> None:
+ """Test that invalid dates are coerced to NaT."""
+ df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]})
+ dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col"])
+ assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert pd.isna(df["date_col"][1])
+ assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_invalid_epoch_coerced() -> None:
+ """Test that invalid epoch values are coerced to NaT."""
+ df = pd.DataFrame(
+ {"epoch_col": [1577836800, np.nan, 1640995200]} # 2020-01-01, NaN, 2022-01-01
+ )
+ dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["epoch_col"])
+ assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert pd.isna(df["epoch_col"][1])
+ assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_non_existing_column() -> None:
+ """Test handling of non-existing columns."""
+ df = pd.DataFrame({"existing_col": [1, 2, 3]})
+ dttm_cols = (DateColumn(col_label="non_existing_col", timestamp_format="%Y-%m-%d"),)
+
+ # Should not raise any exception
+ normalize_dttm_col(df, dttm_cols)
+
+ # DataFrame should remain unchanged
+ assert list(df.columns) == ["existing_col"]
+ assert df["existing_col"].tolist() == [1, 2, 3]
+
+
+def test_normalize_dttm_col_multiple_columns() -> None:
+ """Test normalizing multiple datetime columns."""
+ df = pd.DataFrame(
+ {
+ "date_col1": ["2020-01-01", "2021-01-01", "2022-01-01"],
+ "date_col2": ["01/01/2020", "01/01/2021", "01/01/2022"],
+ }
+ )
+ dttm_cols = (
+ DateColumn(col_label="date_col1", timestamp_format="%Y-%m-%d"),
+ DateColumn(col_label="date_col2", timestamp_format="%m/%d/%Y"),
+ )
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["date_col1"])
+ assert is_datetime64_dtype(df["date_col2"])
+ assert df["date_col1"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert df["date_col2"][0].strftime("%Y-%m-%d") == "2020-01-01"
+
+
+def test_normalize_dttm_col_already_datetime_series() -> None:
+ """Test handling of already datetime series with epoch format."""
+ # Create a DataFrame with timestamp strings
+ df = pd.DataFrame(
+ {
+ "ts_col": [
+ "2020-01-01 00:00:00",
+ "2021-01-01 00:00:00",
+ "2022-01-01 00:00:00",
+ ]
+ }
+ )
+ dttm_cols = (DateColumn(col_label="ts_col", timestamp_format="epoch_s"),)
+
+ normalize_dttm_col(df, dttm_cols)
+
+ assert is_datetime64_dtype(df["ts_col"])
+ assert df["ts_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+ assert df["ts_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+ assert df["ts_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
def test_check_if_safe_zip_success(app_context: None) -> None:
"""
Test if ZIP files are safe