(superset) 02/02: fix: Upload CSV as Dataset (#34763)

michaelsmolina Thu, 18 Sep 2025 10:43:05 -0700

This is an automated email from the ASF dual-hosted git repository.

michaelsmolina pushed a commit to branch 5.0
in repository https://gitbox.apache.org/repos/asf/superset.git


commit 49df35d01298c997b79a3369d754815dc25a2742
Author: Luiz Otavio <45200344+luizotavi...@users.noreply.github.com>
AuthorDate: Mon Sep 8 11:48:23 2025 -0300

    fix: Upload CSV as Dataset (#34763)
    
    (cherry picked from commit 1c2b9db4f0d5728f9bcf35ebdef2df5c41d35590)
---
 superset/commands/database/uploaders/base.py       |   3 +-
 superset/commands/database/uploaders/csv_reader.py | 229 ++++++++-
 superset/models/core.py                            |   4 +-
 .../commands/databases/csv_reader_test.py          | 556 +++++++++++++++++++++
 4 files changed, 776 insertions(+), 16 deletions(-)

diff --git a/superset/commands/database/uploaders/base.py b/superset/commands/database/uploaders/base.py
index 5a8227b054..68a437a203 100644
--- a/superset/commands/database/uploaders/base.py
+++ b/superset/commands/database/uploaders/base.py
@@ -133,7 +133,8 @@ class BaseDataReader:
                 )
             ) from ex
         except Exception as ex:
-            raise DatabaseUploadFailed(exception=ex) from ex
+            message = ex.message if hasattr(ex, "message") and ex.message else str(ex)
+            raise DatabaseUploadFailed(message=message, exception=ex) from ex
 
 
 class UploadCommand(BaseCommand):
diff --git a/superset/commands/database/uploaders/csv_reader.py b/superset/commands/database/uploaders/csv_reader.py
index 0072a7d5cf..cc1102e26b 100644
--- a/superset/commands/database/uploaders/csv_reader.py
+++ b/superset/commands/database/uploaders/csv_reader.py
@@ -33,6 +33,10 @@ logger = logging.getLogger(__name__)
 READ_CSV_CHUNK_SIZE = 1000
 ROWS_TO_READ_METADATA = 2
 
+# Fixed error limit to avoid huge payloads and poor UX given that a file
+# might contain thousands of errors.
+MAX_DISPLAYED_ERRORS = 5
+
 
 class CSVReaderOptions(ReaderOptions, total=False):
     delimiter: str
@@ -60,20 +64,211 @@ class CSVReader(BaseDataReader):
             options=dict(options),
         )
 
+    @staticmethod
+    def _find_invalid_values_numeric(df: pd.DataFrame, column: str) -> pd.Series:
+        """
+        Find invalid values for numeric type conversion.
+        Identifies rows where values cannot be converted to numeric types using
+        pandas to_numeric with error coercing. Returns a boolean mask indicating
+        which values are invalid (non-null but unconvertible).
+        :param df: DataFrame containing the data
+        :param column: Name of the column to check for invalid values
+        :return: Boolean Series indicating which rows have invalid
+        values for numeric conversion
+        """
+        converted = pd.to_numeric(df[column], errors="coerce")
+        return converted.isna() & df[column].notna()
+
+    @staticmethod
+    def _find_invalid_values_non_numeric(
+        df: pd.DataFrame, column: str, dtype: str
+    ) -> pd.Series:
+        """
+        Find invalid values for non-numeric type conversion.
+        Identifies rows where values cannot be converted to the specified non-numeric
+        data type by attempting conversion and catching exceptions. This is used for
+        string, categorical, or other non-numeric type conversions.
+        :param df: DataFrame containing the data
+        :param column: Name of the column to check for invalid values
+        :param dtype: Target data type for conversion (e.g., 'string', 'category')
+        :return: Boolean Series indicating which rows have
+        invalid values for the target type
+        """
+        invalid_mask = pd.Series([False] * len(df), index=df.index)
+        for idx, value in df[column].items():
+            if pd.notna(value):
+                try:
+                    pd.Series([value]).astype(dtype)
+                except (ValueError, TypeError):
+                    invalid_mask[idx] = True
+        return invalid_mask
+
+    @staticmethod
+    def _get_error_details(
+        df: pd.DataFrame,
+        column: str,
+        dtype: str,
+        invalid_mask: pd.Series,
+        kwargs: dict[str, Any],
+    ) -> tuple[list[str], int]:
+        """
+        Get detailed error information for invalid values in type conversion.
+        Extracts detailed information about conversion errors, including specific
+        invalid values and their line numbers. Limits the number of detailed errors
+        shown to avoid overwhelming output while providing total error count.
+        :param df: DataFrame containing the data
+        :param column: Name of the column with conversion errors
+        :param dtype: Target data type that failed conversion
+        :param invalid_mask: Boolean mask indicating which rows have invalid values
+        :param kwargs: Additional parameters including header row information
+        :return: Tuple containing:
+            - List of formatted error detail strings (limited by MAX_DISPLAYED_ERRORS)
+            - Total count of errors found
+        """
+        if not invalid_mask.any():
+            return [], 0
+
+        invalid_indices = invalid_mask[invalid_mask].index.tolist()
+        total_errors = len(invalid_indices)
+
+        error_details = []
+        for idx in invalid_indices[:MAX_DISPLAYED_ERRORS]:
+            invalid_value = df.loc[idx, column]
+            line_number = idx + kwargs.get("header", 0) + 2
+            error_details.append(
+                f"  • Line {line_number}: '{invalid_value}' cannot be converted to "
+                f"{dtype}"
+            )
+
+        return error_details, total_errors
+
+    @staticmethod
+    def _create_error_message(
+        df: pd.DataFrame,
+        column: str,
+        dtype: str,
+        invalid_mask: pd.Series,
+        kwargs: dict[str, Any],
+        original_error: Exception,
+    ) -> str:
+        """
+        Create detailed error message for type conversion failure.
+        Constructs a comprehensive error message that includes:
+        - Column name and target type
+        - Total count of errors found
+        - Detailed list of first few errors with line numbers and values
+        - Summary of remaining errors if exceeding display limit
+        :param df: DataFrame containing the data
+        :param column: Name of the column that failed conversion
+        :param dtype: Target data type that failed
+        :param invalid_mask: Boolean mask indicating which rows have invalid values
+        :param kwargs: Additional parameters including header information
+        :param original_error: Original exception that triggered the error handling
+        :return: Formatted error message string ready for display to user
+        """
+        error_details, total_errors = CSVReader._get_error_details(
+            df, column, dtype, invalid_mask, kwargs
+        )
+
+        if error_details:
+            base_msg = (
+                f"Cannot convert column '{column}' to {dtype}. "
+                f"Found {total_errors} error(s):"
+            )
+            detailed_errors = "\n".join(error_details)
+
+            if total_errors > MAX_DISPLAYED_ERRORS:
+                remaining = total_errors - MAX_DISPLAYED_ERRORS
+                additional_msg = f"\n  ... and {remaining} more error(s)"
+                return f"{base_msg}\n{detailed_errors}{additional_msg}"
+            else:
+                return f"{base_msg}\n{detailed_errors}"
+        else:
+            return f"Cannot convert column '{column}' to {dtype}. {str(original_error)}"
+
+    @staticmethod
+    def _cast_single_column(
+        df: pd.DataFrame, column: str, dtype: str, kwargs: dict[str, Any]
+    ) -> None:
+        """
+        Cast a single DataFrame column to the specified data type.
+        Attempts to convert a column to the target data type with enhanced error
+        handling. For numeric types, uses pandas to_numeric for better performance
+        and error detection. If conversion fails, provides detailed
+        error messages including specific invalid values and their line numbers.
+        :param df: DataFrame to modify (modified in-place)
+        :param column: Name of the column to cast
+        :param dtype: Target data type (e.g., 'int64', 'float64', 'string')
+        :param kwargs: Additional parameters including header row information
+        :raises DatabaseUploadFailed: If type conversion fails,
+        with detailed error message
+        """
+        numeric_types = {"int64", "int32", "float64", "float32"}
+
+        try:
+            if dtype in numeric_types:
+                df[column] = pd.to_numeric(df[column], errors="raise")
+                df[column] = df[column].astype(dtype)
+            else:
+                df[column] = df[column].astype(dtype)
+        except (ValueError, TypeError) as ex:
+            try:
+                if dtype in numeric_types:
+                    invalid_mask = CSVReader._find_invalid_values_numeric(df, column)
+                else:
+                    invalid_mask = CSVReader._find_invalid_values_non_numeric(
+                        df, column, dtype
+                    )
+
+                error_msg = CSVReader._create_error_message(
+                    df, column, dtype, invalid_mask, kwargs, ex
+                )
+            except Exception:
+                error_msg = f"Cannot convert column '{column}' to {dtype}. {str(ex)}"
+
+            raise DatabaseUploadFailed(message=error_msg) from ex
+
+    @staticmethod
+    def _cast_column_types(
+        df: pd.DataFrame, types: dict[str, str], kwargs: dict[str, Any]
+    ) -> pd.DataFrame:
+        """
+        Cast DataFrame columns to specified types with detailed
+        error reporting.
+        :param df: DataFrame to cast
+        :param types: Dictionary mapping column names to target types
+        :param kwargs: Original read_csv kwargs for line number calculation
+        :return: DataFrame with casted columns
+        :raises DatabaseUploadFailed: If type conversion fails with detailed error info
+        """
+        for column, dtype in types.items():
+            if column not in df.columns:
+                continue
+            CSVReader._cast_single_column(df, column, dtype, kwargs)
+        return df
+
     @staticmethod
     def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame:
         try:
+            types = kwargs.pop("dtype", None)
+            df = None
             if "chunksize" in kwargs:
-                return pd.concat(
+                df = pd.concat(
                     pd.read_csv(
                         filepath_or_buffer=file.stream,
                         **kwargs,
                     )
                 )
-            return pd.read_csv(
-                filepath_or_buffer=file.stream,
-                **kwargs,
-            )
+            else:
+                df = pd.read_csv(
+                    filepath_or_buffer=file.stream,
+                    **kwargs,
+                )
+            if types:
+                df = CSVReader._cast_column_types(df, types, kwargs)
+            return df
+        except DatabaseUploadFailed:
+            raise
         except (
             pd.errors.ParserError,
             pd.errors.EmptyDataError,
@@ -102,21 +297,27 @@ class CSVReader(BaseDataReader):
             "dayfirst": self._options.get("day_first", False),
             "iterator": True,
             "keep_default_na": not self._options.get("null_values"),
-            "usecols": self._options.get("columns_read")
-            if self._options.get("columns_read")  # None if an empty list
-            else None,
-            "na_values": self._options.get("null_values")
-            if self._options.get("null_values")  # None if an empty list
-            else None,
+            "usecols": (
+                self._options.get("columns_read")
+                if self._options.get("columns_read")  # None if an empty list
+                else None
+            ),
+            "na_values": (
+                self._options.get("null_values")
+                if self._options.get("null_values")  # None if an empty list
+                else None
+            ),
             "nrows": self._options.get("rows_to_read"),
             "parse_dates": self._options.get("column_dates"),
             "sep": self._options.get("delimiter", ","),
             "skip_blank_lines": self._options.get("skip_blank_lines", False),
             "skipinitialspace": self._options.get("skip_initial_space", False),
             "skiprows": self._options.get("skip_rows", 0),
-            "dtype": self._options.get("column_data_types")
-            if self._options.get("column_data_types")
-            else None,
+            "dtype": (
+                self._options.get("column_data_types")
+                if self._options.get("column_data_types")
+                else None
+            ),
         }
         return self._read_csv(file, kwargs)
 
diff --git a/superset/models/core.py b/superset/models/core.py
index 4db7b04d56..361d0c37a3 100755
--- a/superset/models/core.py
+++ b/superset/models/core.py
@@ -1094,7 +1094,9 @@ class Database(Model, AuditMixinNullable, ImportExportMixin):  # pylint: disable
     def has_table(self, table: Table) -> bool:
         with self.get_sqla_engine(catalog=table.catalog, schema=table.schema) as engine:
             # do not pass "" as an empty schema; force null
-            return engine.has_table(table.table, table.schema or None)
+            if engine.has_table(table.table, table.schema or None):
+                return True
+            return engine.has_table(table.table.lower(), table.schema or None)
 
     def has_view(self, table: Table) -> bool:
         with self.get_sqla_engine(catalog=table.catalog, schema=table.schema) as engine:
diff --git a/tests/unit_tests/commands/databases/csv_reader_test.py b/tests/unit_tests/commands/databases/csv_reader_test.py
index 2fb6bc5c6c..a992eac5dc 100644
--- a/tests/unit_tests/commands/databases/csv_reader_test.py
+++ b/tests/unit_tests/commands/databases/csv_reader_test.py
@@ -18,6 +18,7 @@ import io
 from datetime import datetime
 
 import numpy as np
+import pandas as pd
 import pytest
 from werkzeug.datastructures import FileStorage
 
@@ -371,3 +372,558 @@ def test_csv_reader_file_metadata_invalid_file():
         "Parsing error: Error tokenizing data. C error:"
         " Expected 3 fields in line 3, saw 7\n"
     )
+
+
+def test_csv_reader_integer_in_float_column():
+    csv_data = [
+        ["Name", "Score", "City"],
+        ["name1", 25.5, "city1"],
+        ["name2", 25, "city2"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Score": "float"})
+    )
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (2, 3)
+    assert df["Score"].dtype == "float64"
+
+
+def test_csv_reader_object_type_auto_inferring():
+    # this case below won't raise a error
+    csv_data = [
+        ["Name", "id", "City"],
+        ["name1", 25.5, "city1"],
+        ["name2", 15, "city2"],
+        ["name3", 123456789086, "city3"],
+        ["name4", "abc", "city4"],
+        ["name5", 4.75, "city5"],
+    ]
+
+    csv_reader = CSVReader()
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (5, 3)
+    # pandas automatically infers the type if column_data_types is not informed
+    # if there's only one string in the column it converts the whole column to object
+    assert df["id"].dtype == "object"
+
+
+def test_csv_reader_float_type_auto_inferring():
+    csv_data = [
+        ["Name", "id", "City"],
+        ["name1", "25", "city1"],
+        ["name2", "15", "city2"],
+        ["name3", "123456789086", "city3"],
+        ["name5", "4.75", "city5"],
+    ]
+
+    csv_reader = CSVReader()
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 3)
+    # The type here is automatically inferred to float due to 4.75 value
+    assert df["id"].dtype == "float64"
+
+
+def test_csv_reader_int_type_auto_inferring():
+    csv_data = [
+        ["Name", "id", "City"],
+        ["name1", "0", "city1"],
+        ["name2", "15", "city2"],
+        ["name3", "123456789086", "city3"],
+        ["name5", "45", "city5"],
+    ]
+
+    csv_reader = CSVReader()
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 3)
+    assert df["id"].dtype == "int64"
+
+
+def test_csv_reader_bigint_type_auto_inferring():
+    csv_data = [
+        ["Name", "id", "City"],
+        ["name1", "9223372036854775807", "city1"],
+        ["name2", "9223372036854775806", "city2"],
+        ["name3", "1234567890123456789", "city3"],
+        ["name4", "0", "city4"],
+        ["name5", "-9223372036854775808", "city5"],
+    ]
+
+    csv_reader = CSVReader()
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (5, 3)
+    assert df["id"].dtype == "int64"
+    assert df.iloc[0]["id"] == 9223372036854775807
+    assert df.iloc[4]["id"] == -9223372036854775808
+
+
+def test_csv_reader_int_typing():
+    csv_data = [
+        ["Name", "id", "City"],
+        ["name1", "0", "city1"],
+        ["name2", "15", "city2"],
+        ["name3", "123456789086", "city3"],
+        ["name5", "45", "city5"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"id": "int"}))
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 3)
+    assert df["id"].dtype == "int64"
+
+
+def test_csv_reader_float_typing():
+    csv_data = [
+        ["Name", "score", "City"],
+        ["name1", "0", "city1"],
+        ["name2", "15.3", "city2"],
+        ["name3", "45", "city3"],
+        ["name5", "23.1342", "city5"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"score": "float"})
+    )
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 3)
+    assert df["score"].dtype == "float64"
+
+
+def test_csv_reader_multiple_errors_display():
+    """Test that multiple errors are displayed with proper formatting."""
+    csv_data = [
+        ["Name", "Age", "Score"],
+        ["Alice", "25", "95.5"],
+        ["Bob", "invalid1", "87.2"],
+        ["Charlie", "invalid2", "92.1"],
+        ["Diana", "invalid3", "88.5"],
+        ["Eve", "invalid4", "90.0"],
+        ["Frank", "30", "85.5"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 4 error(s):" in error_msg
+    assert "Line 3: 'invalid1' cannot be converted to int64" in error_msg
+    assert "Line 4: 'invalid2' cannot be converted to int64" in error_msg
+    assert "Line 5: 'invalid3' cannot be converted to int64" in error_msg
+    assert "Line 6: 'invalid4' cannot be converted to int64" in error_msg
+    # With MAX_DISPLAYED_ERRORS = 5, all 4 errors should be shown without truncation
+    assert "and" not in error_msg or "more error(s)" not in error_msg
+
+
+def test_csv_reader_non_numeric_in_integer_column():
+    csv_data = [
+        ["Name", "Age", "City"],
+        ["name1", "abc", "city1"],
+        ["name2", "25", "city2"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 2: 'abc' cannot be converted to int64" in error_msg
+
+
+def test_csv_reader_non_numeric_in_float_column():
+    csv_data = [
+        ["Name", "Score", "City"],
+        ["name1", "5.3", "city1"],
+        ["name2", "25.5", "city2"],
+        ["name3", "24.5", "city3"],
+        ["name4", "1.0", "city4"],
+        ["name5", "one point five", "city5"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Score": "float64"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Score' to float64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 6: 'one point five' cannot be converted to float64" in error_msg
+
+
+def test_csv_reader_improved_error_detection_int32():
+    """Test improved error detection for int32 type casting."""
+    csv_data = [
+        ["Name", "ID", "City"],
+        ["name1", "123", "city1"],
+        ["name2", "456", "city2"],
+        ["name3", "not_a_number", "city3"],
+        ["name4", "789", "city4"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"ID": "int32"}))
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'ID' to int32" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'not_a_number' cannot be converted to int32" in error_msg
+
+
+def test_csv_reader_improved_error_detection_float32():
+    """Test improved error detection for float32 type casting."""
+    csv_data = [
+        ["Name", "Score", "City"],
+        ["name1", "1.5", "city1"],
+        ["name2", "2.7", "city2"],
+        ["name3", "invalid_float", "city3"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Score": "float32"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Score' to float32" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'invalid_float' cannot be converted to float32" in error_msg
+
+
+def test_csv_reader_error_detection_with_header_row():
+    """Test that line numbers are correctly calculated with custom header row."""
+    csv_data = [
+        ["skip_this_row", "skip", "skip"],
+        ["Name", "Age", "City"],
+        ["name1", "25", "city1"],
+        ["name2", "invalid_age", "city2"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(header_row=1, column_data_types={"Age": "int"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'invalid_age' cannot be converted to int" in error_msg
+
+
+def test_csv_reader_error_detection_first_row_error():
+    """Test error detection when the first data row has the error."""
+
+    csv_data = [
+        ["Name", "Age", "City"],
+        ["name1", "not_a_number", "city1"],
+        ["name2", "25", "city2"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 2: 'not_a_number' cannot be converted to int64" in error_msg
+
+
+def test_csv_reader_error_detection_missing_column():
+    """Test that missing columns are handled gracefully."""
+    csv_data = [
+        ["Name", "City"],
+        ["name1", "city1"],
+        ["name2", "city2"],
+    ]
+
+    # Try to cast a column that doesn't exist
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"NonExistent": "int64"})
+    )
+
+    # Should not raise an error for missing columns
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+    assert df.shape == (2, 2)
+    assert df.columns.tolist() == ["Name", "City"]
+
+
+def test_csv_reader_error_detection_mixed_valid_invalid():
+    csv_data = [
+        ["Name", "Score", "City"],
+        ["name1", "95.5", "city1"],
+        ["name2", "87.2", "city2"],
+        ["name3", "92.1", "city3"],
+        ["name4", "eighty-five", "city4"],
+        ["name5", "78.9", "city5"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Score": "float64"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Score' to float64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg
+
+
+def test_csv_reader_error_detection_multiple_invalid_values():
+    """Test error detection with multiple invalid values showing first 5 + count."""
+    csv_data = [
+        ["Name", "Score", "City"],
+        ["name1", "95.5", "city1"],
+        ["name2", "87.2", "city2"],
+        ["name3", "92.1", "city3"],
+        ["name4", "eighty-five", "city4"],
+        ["name4", "eighty-one", "city4"],
+        ["name4", "eighty", "city4"],
+        ["name4", "one", "city4"],
+        ["name4", "two", "city4"],
+        ["name4", "three", "city4"],
+        ["name5", "78.9", "city5"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Score": "float64"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Score' to float64" in error_msg
+    assert "Found 6 error(s):" in error_msg
+    assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg
+    assert "Line 6: 'eighty-one' cannot be converted to float64" in error_msg
+    assert "Line 7: 'eighty' cannot be converted to float64" in error_msg
+    assert "Line 8: 'one' cannot be converted to float64" in error_msg
+    assert "Line 9: 'two' cannot be converted to float64" in error_msg
+    assert "and 1 more error(s)" in error_msg
+
+
+def test_csv_reader_error_detection_non_numeric_types():
+    """Test error detection for non-numeric type casting."""
+    csv_data = [
+        ["Name", "Status", "City"],
+        ["name1", "active", "city1"],
+        ["name2", "inactive", "city2"],
+        ["name3", 123, "city3"],  # This should cause an error when casting to string
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Status": "string"})
+    )
+
+    # For non-numeric types, the error detection should still work
+    # but might have different behavior depending on pandas version
+    try:
+        df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+        # If no error is raised, the conversion succeeded
+        assert df["Status"].dtype == "string"
+    except DatabaseUploadFailed as ex:
+        # If an error is raised, it should have proper formatting
+        error_msg = str(ex.value)
+        assert "Cannot convert" in error_msg
+        assert "Status" in error_msg
+
+
+def test_csv_reader_error_detection_with_null_values():
+    csv_data = [
+        ["Name", "Age", "City"],
+        ["name1", "25", "city1"],
+        ["name2", "", "city2"],
+        ["name3", "invalid_age", "city3"],
+    ]
+
+    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg
+
+
+def test_csv_reader_successful_numeric_conversion():
+    csv_data = [
+        ["Name", "Age", "Score", "ID"],
+        ["name1", "25", "95.5", "1001"],
+        ["name2", "30", "87.2", "1002"],
+        ["name3", "35", "92.1", "1003"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(
+            column_data_types={
+                "Age": "int64",
+                "Score": "float64",
+                "ID": "int32",
+            }
+        )
+    )
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (3, 4)
+    assert df["Age"].dtype == "int64"
+    assert df["Score"].dtype == "float64"
+    assert df["ID"].dtype == "int32"
+    assert df.iloc[0]["Age"] == 25
+    assert df.iloc[0]["Score"] == 95.5
+    assert df.iloc[0]["ID"] == 1001
+
+
+def test_csv_reader_error_detection_improvements_summary():
+    csv_data_with_custom_header = [
+        ["metadata_row", "skip", "this"],
+        ["Name", "Age", "Score"],
+        ["Alice", "25", "95.5"],
+        ["Bob", "invalid_age", "87.2"],
+        ["Charlie", "30", "92.1"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(
+            header_row=1, column_data_types={"Age": "int64", "Score": "float64"}
+        )
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data_with_custom_header))
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg
+
+    # Test case 2: Multiple type errors - Age comes first alphabetically
+    csv_data_multiple_errors = [
+        ["Name", "Age", "Score"],
+        ["Alice", "25", "95.5"],
+        ["Bob", "invalid_age", "invalid_score"],  # Error in both columns (line 3)
+        ["Charlie", "30", "92.1"],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(column_data_types={"Age": "int64", "Score": "float64"})
+    )
+
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        csv_reader.file_to_dataframe(create_csv_file(csv_data_multiple_errors))
+
+    error_msg = str(ex.value)
+    # Should catch the Age error first (Age comes before Score alphabetically)
+    assert "Cannot convert column 'Age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 3: 'invalid_age' cannot be converted to int64" in error_msg
+
+
+def test_csv_reader_cast_column_types_function():
+    """Test the _cast_column_types function directly for better isolation."""
+    # Create test DataFrame
+    test_data = {
+        "name": ["Alice", "Bob", "Charlie"],
+        "age": ["25", "30", "invalid_age"],
+        "score": ["95.5", "87.2", "92.1"],
+    }
+    df = pd.DataFrame(test_data)
+
+    # Test successful casting
+    types_success = {"age": "int64", "score": "float64"}
+    kwargs = {"header": 0}
+
+    # This should work for first two rows, but we'll only test the first two
+    df_subset = df.iloc[:2].copy()
+    result_df = CSVReader._cast_column_types(df_subset, types_success, kwargs)
+
+    assert result_df["age"].dtype == "int64"
+    assert result_df["score"].dtype == "float64"
+    assert result_df.iloc[0]["age"] == 25
+    assert result_df.iloc[0]["score"] == 95.5
+
+    # Test error case
+    with pytest.raises(DatabaseUploadFailed) as ex:
+        CSVReader._cast_column_types(df, types_success, kwargs)
+
+    error_msg = str(ex.value)
+    assert "Cannot convert column 'age' to int64" in error_msg
+    assert "Found 1 error(s):" in error_msg
+    assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg
+
+
+def test_csv_reader_cast_column_types_missing_column():
+    """Test _cast_column_types with missing columns."""
+    test_data = {
+        "name": ["Alice", "Bob"],
+        "age": ["25", "30"],
+    }
+    df = pd.DataFrame(test_data)
+
+    # Try to cast a column that doesn't exist
+    types = {"age": "int64", "nonexistent": "float64"}
+    kwargs = {"header": 0}
+
+    # Should not raise an error for missing columns
+    result_df = CSVReader._cast_column_types(df, types, kwargs)
+    assert result_df["age"].dtype == "int64"
+    assert "nonexistent" not in result_df.columns
+
+
+def test_csv_reader_cast_column_types_different_numeric_types():
+    """Test _cast_column_types with various numeric types."""
+    test_data = {
+        "int32_col": ["1", "2", "3"],
+        "int64_col": ["100", "200", "300"],
+        "float32_col": ["1.5", "2.5", "3.5"],
+        "float64_col": ["10.1", "20.2", "30.3"],
+    }
+    df = pd.DataFrame(test_data)
+
+    types = {
+        "int32_col": "int32",
+        "int64_col": "int64",
+        "float32_col": "float32",
+        "float64_col": "float64",
+    }
+    kwargs = {"header": 0}
+
+    result_df = CSVReader._cast_column_types(df, types, kwargs)
+
+    assert result_df["int32_col"].dtype == "int32"
+    assert result_df["int64_col"].dtype == "int64"
+    assert result_df["float32_col"].dtype == "float32"
+    assert result_df["float64_col"].dtype == "float64"

(superset) 02/02: fix: Upload CSV as Dataset (#34763)

Reply via email to