This is an automated email from the ASF dual-hosted git repository.

michaelsmolina pushed a commit to branch 5.0
in repository https://gitbox.apache.org/repos/asf/superset.git


The following commit(s) were added to refs/heads/5.0 by this push:
     new 7076ff3342 fix(csv upload): Correctly casting to string numbers with floating points (e+) (#35586)
7076ff3342 is described below

commit 7076ff334269de4e780cdf9bfb1e91f9237230d7
Author: Luiz Otavio <[email protected]>
AuthorDate: Fri Oct 10 15:01:03 2025 -0300

    fix(csv upload): Correctly casting to string numbers with floating points (e+) (#35586)
    
    (cherry picked from commit 17ebbdd966d255bee2ab40b65d854b068d604144)
---
 superset/commands/database/uploaders/csv_reader.py | 32 +++++++++++++++++++++-
 .../commands/databases/csv_reader_test.py          | 27 ++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/superset/commands/database/uploaders/csv_reader.py b/superset/commands/database/uploaders/csv_reader.py
index cc1102e26b..ad8b98b224 100644
--- a/superset/commands/database/uploaders/csv_reader.py
+++ b/superset/commands/database/uploaders/csv_reader.py
@@ -247,10 +247,40 @@ class CSVReader(BaseDataReader):
             CSVReader._cast_single_column(df, column, dtype, kwargs)
         return df
 
+    @staticmethod
+    def _split_types(types: dict[str, str]) -> tuple[dict[str, str], dict[str, str]]:
+        """
+        Split column data types into custom and pandas-native types.
+
+        :param types: Dictionary mapping column names to data types
+        :return: Tuple of (custom_types, pandas_types) dictionaries
+        """
+        pandas_types = {
+            col: dtype
+            for col, dtype in types.items()
+            if dtype in ("str", "object", "string")
+        }
+        custom_types = {
+            col: dtype
+            for col, dtype in types.items()
+            if dtype not in ("str", "object", "string")
+        }
+        return custom_types, pandas_types
+
     @staticmethod
     def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame:
         try:
-            types = kwargs.pop("dtype", None)
+            types = None
+            if "dtype" in kwargs and kwargs["dtype"]:
+                custom_types, pandas_types = CSVReader._split_types(kwargs["dtype"])
+                if pandas_types:
+                    kwargs["dtype"] = pandas_types
+                else:
+                    kwargs.pop("dtype", None)
+
+                # Custom types for our manual casting
+                types = custom_types if custom_types else None
+
             df = None
             if "chunksize" in kwargs:
                 df = pd.concat(
diff --git a/tests/unit_tests/commands/databases/csv_reader_test.py b/tests/unit_tests/commands/databases/csv_reader_test.py
index a992eac5dc..6ba21522e0 100644
--- a/tests/unit_tests/commands/databases/csv_reader_test.py
+++ b/tests/unit_tests/commands/databases/csv_reader_test.py
@@ -807,6 +807,33 @@ def test_csv_reader_successful_numeric_conversion():
     assert df.iloc[0]["ID"] == 1001
 
 
+def test_csv_reader_successful_string_conversion_with_floats():
+    csv_data = [
+        ["id"],
+        [1439403621518935563],
+        [42286989],
+        [1413660691875593351],
+        [8.26839e17],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(
+            column_data_types={
+                "id": "str",
+            }
+        )
+    )
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 1)
+    assert df["id"].dtype == "object"
+    assert df.iloc[0]["id"] == "1439403621518935563"
+    assert df.iloc[1]["id"] == "42286989"
+    assert df.iloc[2]["id"] == "1413660691875593351"
+    assert df.iloc[3]["id"] == "8.26839e+17"
+
+
 def test_csv_reader_error_detection_improvements_summary():
     csv_data_with_custom_header = [
         ["metadata_row", "skip", "this"],

Reply via email to