This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git


The following commit(s) were added to refs/heads/master by this push:
     new 99e69c32ee fix: coerce datetime conversion errors (#32683)
99e69c32ee is described below

commit 99e69c32eef2f0fcca009ae5a9cb9fedc5b38dc1
Author: Beto Dealmeida <[email protected]>
AuthorDate: Tue Mar 18 13:09:23 2025 -0400

    fix: coerce datetime conversion errors (#32683)
---
 superset/utils/core.py                 |  14 ++-
 tests/integration_tests/utils_tests.py |   5 -
 tests/unit_tests/utils/test_core.py    | 193 +++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 8 deletions(-)

diff --git a/superset/utils/core.py b/superset/utils/core.py
index 69de1707ed..2b80c89f61 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1682,18 +1682,26 @@ def normalize_dttm_col(
                     utc=False,
                     unit=unit,
                     origin="unix",
-                    errors="raise",
+                    errors="coerce",
                     exact=False,
                 )
             else:
                 # Column has already been formatted as a timestamp.
-                df[_col.col_label] = dttm_series.apply(pd.Timestamp)
+                try:
+                    df[_col.col_label] = dttm_series.apply(
+                        lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
+                    )
+                except ValueError:
+                    logger.warning(
+                        "Unable to convert column %s to datetime, ignoring",
+                        _col.col_label,
+                    )
         else:
             df[_col.col_label] = pd.to_datetime(
                 df[_col.col_label],
                 utc=False,
                 format=_col.timestamp_format,
-                errors="raise",
+                errors="coerce",
                 exact=False,
             )
         if _col.offset:
diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py
index aa39923152..2bab7cdee4 100644
--- a/tests/integration_tests/utils_tests.py
+++ b/tests/integration_tests/utils_tests.py
@@ -483,8 +483,3 @@ class TestUtils(SupersetTestCase):
         # test numeric epoch_ms format
         df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
         assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
-
-        # test that we raise an error when we can't convert
-        df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
-        with pytest.raises(pd.errors.OutOfBoundsDatetime):
-            normalize_col(df, None, 0, None)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index aa51e52f6c..e629f00290 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -19,9 +19,11 @@ from dataclasses import dataclass
 from typing import Any, Optional
 from unittest.mock import MagicMock, patch
 
+import numpy as np
 import pandas as pd
 import pytest
 from flask import current_app
+from pandas.api.types import is_datetime64_dtype
 from pytest_mock import MockerFixture
 
 from superset.exceptions import SupersetException
@@ -225,6 +227,197 @@ def test_normalize_dttm_col() -> None:
     assert df["__time"].astype(str).tolist() == ["2017-07-01"]
 
 
+def test_normalize_dttm_col_epoch_seconds() -> None:
+    """Test conversion of epoch seconds."""
+    df = pd.DataFrame(
+        {
+            "epoch_col": [
+                1577836800,
+                1609459200,
+                1640995200,
+            ]  # 2020-01-01, 2021-01-01, 2022-01-01
+        }
+    )
+    dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["epoch_col"])
+    assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert df["epoch_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+    assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_epoch_milliseconds() -> None:
+    """Test conversion of epoch milliseconds."""
+    df = pd.DataFrame(
+        {
+            "epoch_ms_col": [
+                1577836800000,
+                1609459200000,
+                1640995200000,
+            ]  # 2020-01-01, 2021-01-01, 2022-01-01
+        }
+    )
+    dttm_cols = (DateColumn(col_label="epoch_ms_col", timestamp_format="epoch_ms"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["epoch_ms_col"])
+    assert df["epoch_ms_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert df["epoch_ms_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+    assert df["epoch_ms_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_formatted_date() -> None:
+    """Test conversion of formatted date strings."""
+    df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+    dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col"])
+    assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+    assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_with_offset() -> None:
+    """Test with hour offset."""
+    df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+    dttm_cols = (
+        DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d", offset=3),
+    )
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col"])
+    assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00"
+    assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 03:00:00"
+    assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 03:00:00"
+
+
+def test_normalize_dttm_col_with_time_shift() -> None:
+    """Test with time shift."""
+    df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+    dttm_cols = (
+        DateColumn(
+            col_label="date_col", timestamp_format="%Y-%m-%d", time_shift="1 day"
+        ),
+    )
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col"])
+    assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-02"
+    assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-02"
+    assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-02"
+
+
+def test_normalize_dttm_col_with_offset_and_time_shift() -> None:
+    """Test with both offset and time shift."""
+    df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
+    dttm_cols = (
+        DateColumn(
+            col_label="date_col",
+            timestamp_format="%Y-%m-%d",
+            offset=3,
+            time_shift="1 hour",
+        ),
+    )
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col"])
+    assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 04:00:00"
+    assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 04:00:00"
+    assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00"
+
+
+def test_normalize_dttm_col_invalid_date_coerced() -> None:
+    """Test that invalid dates are coerced to NaT."""
+    df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]})
+    dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col"])
+    assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert pd.isna(df["date_col"][1])
+    assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_invalid_epoch_coerced() -> None:
+    """Test that invalid epoch values are coerced to NaT."""
+    df = pd.DataFrame(
+        {"epoch_col": [1577836800, np.nan, 1640995200]}  # 2020-01-01, NaN, 2022-01-01
+    )
+    dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["epoch_col"])
+    assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert pd.isna(df["epoch_col"][1])
+    assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
+def test_normalize_dttm_col_non_existing_column() -> None:
+    """Test handling of non-existing columns."""
+    df = pd.DataFrame({"existing_col": [1, 2, 3]})
+    dttm_cols = (DateColumn(col_label="non_existing_col", timestamp_format="%Y-%m-%d"),)
+
+    # Should not raise any exception
+    normalize_dttm_col(df, dttm_cols)
+
+    # DataFrame should remain unchanged
+    assert list(df.columns) == ["existing_col"]
+    assert df["existing_col"].tolist() == [1, 2, 3]
+
+
+def test_normalize_dttm_col_multiple_columns() -> None:
+    """Test normalizing multiple datetime columns."""
+    df = pd.DataFrame(
+        {
+            "date_col1": ["2020-01-01", "2021-01-01", "2022-01-01"],
+            "date_col2": ["01/01/2020", "01/01/2021", "01/01/2022"],
+        }
+    )
+    dttm_cols = (
+        DateColumn(col_label="date_col1", timestamp_format="%Y-%m-%d"),
+        DateColumn(col_label="date_col2", timestamp_format="%m/%d/%Y"),
+    )
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["date_col1"])
+    assert is_datetime64_dtype(df["date_col2"])
+    assert df["date_col1"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert df["date_col2"][0].strftime("%Y-%m-%d") == "2020-01-01"
+
+
+def test_normalize_dttm_col_already_datetime_series() -> None:
+    """Test handling of already datetime series with epoch format."""
+    # Create a DataFrame with timestamp strings
+    df = pd.DataFrame(
+        {
+            "ts_col": [
+                "2020-01-01 00:00:00",
+                "2021-01-01 00:00:00",
+                "2022-01-01 00:00:00",
+            ]
+        }
+    )
+    dttm_cols = (DateColumn(col_label="ts_col", timestamp_format="epoch_s"),)
+
+    normalize_dttm_col(df, dttm_cols)
+
+    assert is_datetime64_dtype(df["ts_col"])
+    assert df["ts_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
+    assert df["ts_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
+    assert df["ts_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
+
+
 def test_check_if_safe_zip_success(app_context: None) -> None:
     """
     Test if ZIP files are safe

Reply via email to