This is an automated email from the ASF dual-hosted git repository.
michaelsmolina pushed a commit to branch 5.0
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/5.0 by this push:
new 7076ff3342 fix(csv upload): Correctly casting to string numbers with
floating points (e+) (#35586)
7076ff3342 is described below
commit 7076ff334269de4e780cdf9bfb1e91f9237230d7
Author: Luiz Otavio <[email protected]>
AuthorDate: Fri Oct 10 15:01:03 2025 -0300
fix(csv upload): Correctly casting to string numbers with floating points
(e+) (#35586)
(cherry picked from commit 17ebbdd966d255bee2ab40b65d854b068d604144)
---
superset/commands/database/uploaders/csv_reader.py | 32 +++++++++++++++++++++-
.../commands/databases/csv_reader_test.py | 27 ++++++++++++++++++
2 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/superset/commands/database/uploaders/csv_reader.py b/superset/commands/database/uploaders/csv_reader.py
index cc1102e26b..ad8b98b224 100644
--- a/superset/commands/database/uploaders/csv_reader.py
+++ b/superset/commands/database/uploaders/csv_reader.py
@@ -247,10 +247,40 @@ class CSVReader(BaseDataReader):
CSVReader._cast_single_column(df, column, dtype, kwargs)
return df
+ @staticmethod
+    def _split_types(types: dict[str, str]) -> tuple[dict[str, str], dict[str, str]]:
+ """
+ Split column data types into custom and pandas-native types.
+
+ :param types: Dictionary mapping column names to data types
+ :return: Tuple of (custom_types, pandas_types) dictionaries
+ """
+ pandas_types = {
+ col: dtype
+ for col, dtype in types.items()
+ if dtype in ("str", "object", "string")
+ }
+ custom_types = {
+ col: dtype
+ for col, dtype in types.items()
+ if dtype not in ("str", "object", "string")
+ }
+ return custom_types, pandas_types
+
@staticmethod
def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame:
try:
- types = kwargs.pop("dtype", None)
+ types = None
+ if "dtype" in kwargs and kwargs["dtype"]:
+            custom_types, pandas_types = CSVReader._split_types(kwargs["dtype"])
+ if pandas_types:
+ kwargs["dtype"] = pandas_types
+ else:
+ kwargs.pop("dtype", None)
+
+ # Custom types for our manual casting
+ types = custom_types if custom_types else None
+
df = None
if "chunksize" in kwargs:
df = pd.concat(
diff --git a/tests/unit_tests/commands/databases/csv_reader_test.py b/tests/unit_tests/commands/databases/csv_reader_test.py
index a992eac5dc..6ba21522e0 100644
--- a/tests/unit_tests/commands/databases/csv_reader_test.py
+++ b/tests/unit_tests/commands/databases/csv_reader_test.py
@@ -807,6 +807,33 @@ def test_csv_reader_successful_numeric_conversion():
assert df.iloc[0]["ID"] == 1001
+def test_csv_reader_successful_string_conversion_with_floats():
+ csv_data = [
+ ["id"],
+ [1439403621518935563],
+ [42286989],
+ [1413660691875593351],
+ [8.26839e17],
+ ]
+
+ csv_reader = CSVReader(
+ options=CSVReaderOptions(
+ column_data_types={
+ "id": "str",
+ }
+ )
+ )
+
+ df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+ assert df.shape == (4, 1)
+ assert df["id"].dtype == "object"
+ assert df.iloc[0]["id"] == "1439403621518935563"
+ assert df.iloc[1]["id"] == "42286989"
+ assert df.iloc[2]["id"] == "1413660691875593351"
+ assert df.iloc[3]["id"] == "8.26839e+17"
+
+
def test_csv_reader_error_detection_improvements_summary():
csv_data_with_custom_header = [
["metadata_row", "skip", "this"],