This is an automated email from the ASF dual-hosted git repository. michaelsmolina pushed a commit to branch 5.0 in repository https://gitbox.apache.org/repos/asf/superset.git
commit 49df35d01298c997b79a3369d754815dc25a2742 Author: Luiz Otavio <45200344+luizotavi...@users.noreply.github.com> AuthorDate: Mon Sep 8 11:48:23 2025 -0300 fix: Upload CSV as Dataset (#34763) (cherry picked from commit 1c2b9db4f0d5728f9bcf35ebdef2df5c41d35590) --- superset/commands/database/uploaders/base.py | 3 +- superset/commands/database/uploaders/csv_reader.py | 229 ++++++++- superset/models/core.py | 4 +- .../commands/databases/csv_reader_test.py | 556 +++++++++++++++++++++ 4 files changed, 776 insertions(+), 16 deletions(-) diff --git a/superset/commands/database/uploaders/base.py b/superset/commands/database/uploaders/base.py index 5a8227b054..68a437a203 100644 --- a/superset/commands/database/uploaders/base.py +++ b/superset/commands/database/uploaders/base.py @@ -133,7 +133,8 @@ class BaseDataReader: ) ) from ex except Exception as ex: - raise DatabaseUploadFailed(exception=ex) from ex + message = ex.message if hasattr(ex, "message") and ex.message else str(ex) + raise DatabaseUploadFailed(message=message, exception=ex) from ex class UploadCommand(BaseCommand): diff --git a/superset/commands/database/uploaders/csv_reader.py b/superset/commands/database/uploaders/csv_reader.py index 0072a7d5cf..cc1102e26b 100644 --- a/superset/commands/database/uploaders/csv_reader.py +++ b/superset/commands/database/uploaders/csv_reader.py @@ -33,6 +33,10 @@ logger = logging.getLogger(__name__) READ_CSV_CHUNK_SIZE = 1000 ROWS_TO_READ_METADATA = 2 +# Fixed error limit to avoid huge payloads and poor UX given that a file +# might contain thousands of errors. +MAX_DISPLAYED_ERRORS = 5 + class CSVReaderOptions(ReaderOptions, total=False): delimiter: str @@ -60,20 +64,211 @@ class CSVReader(BaseDataReader): options=dict(options), ) + @staticmethod + def _find_invalid_values_numeric(df: pd.DataFrame, column: str) -> pd.Series: + """ + Find invalid values for numeric type conversion. + Identifies rows where values cannot be converted to numeric types using + pandas to_numeric with error coercing. Returns a boolean mask indicating + which values are invalid (non-null but unconvertible). + :param df: DataFrame containing the data + :param column: Name of the column to check for invalid values + :return: Boolean Series indicating which rows have invalid + values for numeric conversion + """ + converted = pd.to_numeric(df[column], errors="coerce") + return converted.isna() & df[column].notna() + + @staticmethod + def _find_invalid_values_non_numeric( + df: pd.DataFrame, column: str, dtype: str + ) -> pd.Series: + """ + Find invalid values for non-numeric type conversion. + Identifies rows where values cannot be converted to the specified non-numeric + data type by attempting conversion and catching exceptions. This is used for + string, categorical, or other non-numeric type conversions. + :param df: DataFrame containing the data + :param column: Name of the column to check for invalid values + :param dtype: Target data type for conversion (e.g., 'string', 'category') + :return: Boolean Series indicating which rows have + invalid values for the target type + """ + invalid_mask = pd.Series([False] * len(df), index=df.index) + for idx, value in df[column].items(): + if pd.notna(value): + try: + pd.Series([value]).astype(dtype) + except (ValueError, TypeError): + invalid_mask[idx] = True + return invalid_mask + + @staticmethod + def _get_error_details( + df: pd.DataFrame, + column: str, + dtype: str, + invalid_mask: pd.Series, + kwargs: dict[str, Any], + ) -> tuple[list[str], int]: + """ + Get detailed error information for invalid values in type conversion. + Extracts detailed information about conversion errors, including specific + invalid values and their line numbers. Limits the number of detailed errors + shown to avoid overwhelming output while providing total error count. + :param df: DataFrame containing the data + :param column: Name of the column with conversion errors + :param dtype: Target data type that failed conversion + :param invalid_mask: Boolean mask indicating which rows have invalid values + :param kwargs: Additional parameters including header row information + :return: Tuple containing: + - List of formatted error detail strings (limited by MAX_DISPLAYED_ERRORS) + - Total count of errors found + """ + if not invalid_mask.any(): + return [], 0 + + invalid_indices = invalid_mask[invalid_mask].index.tolist() + total_errors = len(invalid_indices) + + error_details = [] + for idx in invalid_indices[:MAX_DISPLAYED_ERRORS]: + invalid_value = df.loc[idx, column] + line_number = idx + kwargs.get("header", 0) + 2 + error_details.append( + f" • Line {line_number}: '{invalid_value}' cannot be converted to " + f"{dtype}" + ) + + return error_details, total_errors + + @staticmethod + def _create_error_message( + df: pd.DataFrame, + column: str, + dtype: str, + invalid_mask: pd.Series, + kwargs: dict[str, Any], + original_error: Exception, + ) -> str: + """ + Create detailed error message for type conversion failure. + Constructs a comprehensive error message that includes: + - Column name and target type + - Total count of errors found + - Detailed list of first few errors with line numbers and values + - Summary of remaining errors if exceeding display limit + :param df: DataFrame containing the data + :param column: Name of the column that failed conversion + :param dtype: Target data type that failed + :param invalid_mask: Boolean mask indicating which rows have invalid values + :param kwargs: Additional parameters including header information + :param original_error: Original exception that triggered the error handling + :return: Formatted error message string ready for display to user + """ + error_details, total_errors = CSVReader._get_error_details( + df, column, dtype, invalid_mask, kwargs + ) + + if error_details: + base_msg = ( + f"Cannot convert column '{column}' to {dtype}. " + f"Found {total_errors} error(s):" + ) + detailed_errors = "\n".join(error_details) + + if total_errors > MAX_DISPLAYED_ERRORS: + remaining = total_errors - MAX_DISPLAYED_ERRORS + additional_msg = f"\n ... and {remaining} more error(s)" + return f"{base_msg}\n{detailed_errors}{additional_msg}" + else: + return f"{base_msg}\n{detailed_errors}" + else: + return f"Cannot convert column '{column}' to {dtype}. {str(original_error)}" + + @staticmethod + def _cast_single_column( + df: pd.DataFrame, column: str, dtype: str, kwargs: dict[str, Any] + ) -> None: + """ + Cast a single DataFrame column to the specified data type. + Attempts to convert a column to the target data type with enhanced error + handling. For numeric types, uses pandas to_numeric for better performance + and error detection. If conversion fails, provides detailed + error messages including specific invalid values and their line numbers. + :param df: DataFrame to modify (modified in-place) + :param column: Name of the column to cast + :param dtype: Target data type (e.g., 'int64', 'float64', 'string') + :param kwargs: Additional parameters including header row information + :raises DatabaseUploadFailed: If type conversion fails, + with detailed error message + """ + numeric_types = {"int64", "int32", "float64", "float32"} + + try: + if dtype in numeric_types: + df[column] = pd.to_numeric(df[column], errors="raise") + df[column] = df[column].astype(dtype) + else: + df[column] = df[column].astype(dtype) + except (ValueError, TypeError) as ex: + try: + if dtype in numeric_types: + invalid_mask = CSVReader._find_invalid_values_numeric(df, column) + else: + invalid_mask = CSVReader._find_invalid_values_non_numeric( + df, column, dtype + ) + + error_msg = CSVReader._create_error_message( + df, column, dtype, invalid_mask, kwargs, ex + ) + except Exception: + error_msg = f"Cannot convert column '{column}' to {dtype}. {str(ex)}" + + raise DatabaseUploadFailed(message=error_msg) from ex + + @staticmethod + def _cast_column_types( + df: pd.DataFrame, types: dict[str, str], kwargs: dict[str, Any] + ) -> pd.DataFrame: + """ + Cast DataFrame columns to specified types with detailed + error reporting. + :param df: DataFrame to cast + :param types: Dictionary mapping column names to target types + :param kwargs: Original read_csv kwargs for line number calculation + :return: DataFrame with casted columns + :raises DatabaseUploadFailed: If type conversion fails with detailed error info + """ + for column, dtype in types.items(): + if column not in df.columns: + continue + CSVReader._cast_single_column(df, column, dtype, kwargs) + return df + @staticmethod def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame: try: + types = kwargs.pop("dtype", None) + df = None if "chunksize" in kwargs: - return pd.concat( + df = pd.concat( pd.read_csv( filepath_or_buffer=file.stream, **kwargs, ) ) - return pd.read_csv( - filepath_or_buffer=file.stream, - **kwargs, - ) + else: + df = pd.read_csv( + filepath_or_buffer=file.stream, + **kwargs, + ) + if types: + df = CSVReader._cast_column_types(df, types, kwargs) + return df + except DatabaseUploadFailed: + raise except ( pd.errors.ParserError, pd.errors.EmptyDataError, @@ -102,21 +297,27 @@ class CSVReader(BaseDataReader): "dayfirst": self._options.get("day_first", False), "iterator": True, "keep_default_na": not self._options.get("null_values"), - "usecols": self._options.get("columns_read") - if self._options.get("columns_read") # None if an empty list - else None, - "na_values": self._options.get("null_values") - if self._options.get("null_values") # None if an empty list - else None, + "usecols": ( + self._options.get("columns_read") + if self._options.get("columns_read") # None if an empty list + else None + ), + "na_values": ( + self._options.get("null_values") + if self._options.get("null_values") # None if an empty list + else None + ), "nrows": self._options.get("rows_to_read"), "parse_dates": self._options.get("column_dates"), "sep": self._options.get("delimiter", ","), "skip_blank_lines": self._options.get("skip_blank_lines", False), "skipinitialspace": self._options.get("skip_initial_space", False), "skiprows": self._options.get("skip_rows", 0), - "dtype": self._options.get("column_data_types") - if self._options.get("column_data_types") - else None, + "dtype": ( + self._options.get("column_data_types") + if self._options.get("column_data_types") + else None + ), } return self._read_csv(file, kwargs) diff --git a/superset/models/core.py b/superset/models/core.py index 4db7b04d56..361d0c37a3 100755 --- a/superset/models/core.py +++ b/superset/models/core.py @@ -1094,7 +1094,9 @@ class Database(Model, AuditMixinNullable, ImportExportMixin): # pylint: disable def has_table(self, table: Table) -> bool: with self.get_sqla_engine(catalog=table.catalog, schema=table.schema) as engine: # do not pass "" as an empty schema; force null - return engine.has_table(table.table, table.schema or None) + if engine.has_table(table.table, table.schema or None): + return True + return engine.has_table(table.table.lower(), table.schema or None) def has_view(self, table: Table) -> bool: with self.get_sqla_engine(catalog=table.catalog, schema=table.schema) as engine: diff --git a/tests/unit_tests/commands/databases/csv_reader_test.py b/tests/unit_tests/commands/databases/csv_reader_test.py index 2fb6bc5c6c..a992eac5dc 100644 --- a/tests/unit_tests/commands/databases/csv_reader_test.py +++ b/tests/unit_tests/commands/databases/csv_reader_test.py @@ -18,6 +18,7 @@ import io from datetime import datetime import numpy as np +import pandas as pd import pytest from werkzeug.datastructures import FileStorage @@ -371,3 +372,558 @@ def test_csv_reader_file_metadata_invalid_file(): "Parsing error: Error tokenizing data. C error:" " Expected 3 fields in line 3, saw 7\n" ) + + +def test_csv_reader_integer_in_float_column(): + csv_data = [ + ["Name", "Score", "City"], + ["name1", 25.5, "city1"], + ["name2", 25, "city2"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Score": "float"}) + ) + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (2, 3) + assert df["Score"].dtype == "float64" + + +def test_csv_reader_object_type_auto_inferring(): + # this case below won't raise a error + csv_data = [ + ["Name", "id", "City"], + ["name1", 25.5, "city1"], + ["name2", 15, "city2"], + ["name3", 123456789086, "city3"], + ["name4", "abc", "city4"], + ["name5", 4.75, "city5"], + ] + + csv_reader = CSVReader() + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (5, 3) + # pandas automatically infers the type if column_data_types is not informed + # if there's only one string in the column it converts the whole column to object + assert df["id"].dtype == "object" + + +def test_csv_reader_float_type_auto_inferring(): + csv_data = [ + ["Name", "id", "City"], + ["name1", "25", "city1"], + ["name2", "15", "city2"], + ["name3", "123456789086", "city3"], + ["name5", "4.75", "city5"], + ] + + csv_reader = CSVReader() + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (4, 3) + # The type here is automatically inferred to float due to 4.75 value + assert df["id"].dtype == "float64" + + +def test_csv_reader_int_type_auto_inferring(): + csv_data = [ + ["Name", "id", "City"], + ["name1", "0", "city1"], + ["name2", "15", "city2"], + ["name3", "123456789086", "city3"], + ["name5", "45", "city5"], + ] + + csv_reader = CSVReader() + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (4, 3) + assert df["id"].dtype == "int64" + + +def test_csv_reader_bigint_type_auto_inferring(): + csv_data = [ + ["Name", "id", "City"], + ["name1", "9223372036854775807", "city1"], + ["name2", "9223372036854775806", "city2"], + ["name3", "1234567890123456789", "city3"], + ["name4", "0", "city4"], + ["name5", "-9223372036854775808", "city5"], + ] + + csv_reader = CSVReader() + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (5, 3) + assert df["id"].dtype == "int64" + assert df.iloc[0]["id"] == 9223372036854775807 + assert df.iloc[4]["id"] == -9223372036854775808 + + +def test_csv_reader_int_typing(): + csv_data = [ + ["Name", "id", "City"], + ["name1", "0", "city1"], + ["name2", "15", "city2"], + ["name3", "123456789086", "city3"], + ["name5", "45", "city5"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"id": "int"})) + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (4, 3) + assert df["id"].dtype == "int64" + + +def test_csv_reader_float_typing(): + csv_data = [ + ["Name", "score", "City"], + ["name1", "0", "city1"], + ["name2", "15.3", "city2"], + ["name3", "45", "city3"], + ["name5", "23.1342", "city5"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"score": "float"}) + ) + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (4, 3) + assert df["score"].dtype == "float64" + + +def test_csv_reader_multiple_errors_display(): + """Test that multiple errors are displayed with proper formatting.""" + csv_data = [ + ["Name", "Age", "Score"], + ["Alice", "25", "95.5"], + ["Bob", "invalid1", "87.2"], + ["Charlie", "invalid2", "92.1"], + ["Diana", "invalid3", "88.5"], + ["Eve", "invalid4", "90.0"], + ["Frank", "30", "85.5"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"})) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 4 error(s):" in error_msg + assert "Line 3: 'invalid1' cannot be converted to int64" in error_msg + assert "Line 4: 'invalid2' cannot be converted to int64" in error_msg + assert "Line 5: 'invalid3' cannot be converted to int64" in error_msg + assert "Line 6: 'invalid4' cannot be converted to int64" in error_msg + # With MAX_DISPLAYED_ERRORS = 5, all 4 errors should be shown without truncation + assert "and" not in error_msg or "more error(s)" not in error_msg + + +def test_csv_reader_non_numeric_in_integer_column(): + csv_data = [ + ["Name", "Age", "City"], + ["name1", "abc", "city1"], + ["name2", "25", "city2"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"})) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 2: 'abc' cannot be converted to int64" in error_msg + + +def test_csv_reader_non_numeric_in_float_column(): + csv_data = [ + ["Name", "Score", "City"], + ["name1", "5.3", "city1"], + ["name2", "25.5", "city2"], + ["name3", "24.5", "city3"], + ["name4", "1.0", "city4"], + ["name5", "one point five", "city5"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Score": "float64"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Score' to float64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 6: 'one point five' cannot be converted to float64" in error_msg + + +def test_csv_reader_improved_error_detection_int32(): + """Test improved error detection for int32 type casting.""" + csv_data = [ + ["Name", "ID", "City"], + ["name1", "123", "city1"], + ["name2", "456", "city2"], + ["name3", "not_a_number", "city3"], + ["name4", "789", "city4"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"ID": "int32"})) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'ID' to int32" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'not_a_number' cannot be converted to int32" in error_msg + + +def test_csv_reader_improved_error_detection_float32(): + """Test improved error detection for float32 type casting.""" + csv_data = [ + ["Name", "Score", "City"], + ["name1", "1.5", "city1"], + ["name2", "2.7", "city2"], + ["name3", "invalid_float", "city3"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Score": "float32"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Score' to float32" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'invalid_float' cannot be converted to float32" in error_msg + + +def test_csv_reader_error_detection_with_header_row(): + """Test that line numbers are correctly calculated with custom header row.""" + csv_data = [ + ["skip_this_row", "skip", "skip"], + ["Name", "Age", "City"], + ["name1", "25", "city1"], + ["name2", "invalid_age", "city2"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(header_row=1, column_data_types={"Age": "int"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'invalid_age' cannot be converted to int" in error_msg + + +def test_csv_reader_error_detection_first_row_error(): + """Test error detection when the first data row has the error.""" + + csv_data = [ + ["Name", "Age", "City"], + ["name1", "not_a_number", "city1"], + ["name2", "25", "city2"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"})) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 2: 'not_a_number' cannot be converted to int64" in error_msg + + +def test_csv_reader_error_detection_missing_column(): + """Test that missing columns are handled gracefully.""" + csv_data = [ + ["Name", "City"], + ["name1", "city1"], + ["name2", "city2"], + ] + + # Try to cast a column that doesn't exist + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"NonExistent": "int64"}) + ) + + # Should not raise an error for missing columns + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + assert df.shape == (2, 2) + assert df.columns.tolist() == ["Name", "City"] + + +def test_csv_reader_error_detection_mixed_valid_invalid(): + csv_data = [ + ["Name", "Score", "City"], + ["name1", "95.5", "city1"], + ["name2", "87.2", "city2"], + ["name3", "92.1", "city3"], + ["name4", "eighty-five", "city4"], + ["name5", "78.9", "city5"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Score": "float64"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Score' to float64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg + + +def test_csv_reader_error_detection_multiple_invalid_values(): + """Test error detection with multiple invalid values showing first 5 + count.""" + csv_data = [ + ["Name", "Score", "City"], + ["name1", "95.5", "city1"], + ["name2", "87.2", "city2"], + ["name3", "92.1", "city3"], + ["name4", "eighty-five", "city4"], + ["name4", "eighty-one", "city4"], + ["name4", "eighty", "city4"], + ["name4", "one", "city4"], + ["name4", "two", "city4"], + ["name4", "three", "city4"], + ["name5", "78.9", "city5"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Score": "float64"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Score' to float64" in error_msg + assert "Found 6 error(s):" in error_msg + assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg + assert "Line 6: 'eighty-one' cannot be converted to float64" in error_msg + assert "Line 7: 'eighty' cannot be converted to float64" in error_msg + assert "Line 8: 'one' cannot be converted to float64" in error_msg + assert "Line 9: 'two' cannot be converted to float64" in error_msg + assert "and 1 more error(s)" in error_msg + + +def test_csv_reader_error_detection_non_numeric_types(): + """Test error detection for non-numeric type casting.""" + csv_data = [ + ["Name", "Status", "City"], + ["name1", "active", "city1"], + ["name2", "inactive", "city2"], + ["name3", 123, "city3"], # This should cause an error when casting to string + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Status": "string"}) + ) + + # For non-numeric types, the error detection should still work + # but might have different behavior depending on pandas version + try: + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + # If no error is raised, the conversion succeeded + assert df["Status"].dtype == "string" + except DatabaseUploadFailed as ex: + # If an error is raised, it should have proper formatting + error_msg = str(ex.value) + assert "Cannot convert" in error_msg + assert "Status" in error_msg + + +def test_csv_reader_error_detection_with_null_values(): + csv_data = [ + ["Name", "Age", "City"], + ["name1", "25", "city1"], + ["name2", "", "city2"], + ["name3", "invalid_age", "city3"], + ] + + csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"})) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg + + +def test_csv_reader_successful_numeric_conversion(): + csv_data = [ + ["Name", "Age", "Score", "ID"], + ["name1", "25", "95.5", "1001"], + ["name2", "30", "87.2", "1002"], + ["name3", "35", "92.1", "1003"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions( + column_data_types={ + "Age": "int64", + "Score": "float64", + "ID": "int32", + } + ) + ) + + df = csv_reader.file_to_dataframe(create_csv_file(csv_data)) + + assert df.shape == (3, 4) + assert df["Age"].dtype == "int64" + assert df["Score"].dtype == "float64" + assert df["ID"].dtype == "int32" + assert df.iloc[0]["Age"] == 25 + assert df.iloc[0]["Score"] == 95.5 + assert df.iloc[0]["ID"] == 1001 + + +def test_csv_reader_error_detection_improvements_summary(): + csv_data_with_custom_header = [ + ["metadata_row", "skip", "this"], + ["Name", "Age", "Score"], + ["Alice", "25", "95.5"], + ["Bob", "invalid_age", "87.2"], + ["Charlie", "30", "92.1"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions( + header_row=1, column_data_types={"Age": "int64", "Score": "float64"} + ) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data_with_custom_header)) + + error_msg = str(ex.value) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg + + # Test case 2: Multiple type errors - Age comes first alphabetically + csv_data_multiple_errors = [ + ["Name", "Age", "Score"], + ["Alice", "25", "95.5"], + ["Bob", "invalid_age", "invalid_score"], # Error in both columns (line 3) + ["Charlie", "30", "92.1"], + ] + + csv_reader = CSVReader( + options=CSVReaderOptions(column_data_types={"Age": "int64", "Score": "float64"}) + ) + + with pytest.raises(DatabaseUploadFailed) as ex: + csv_reader.file_to_dataframe(create_csv_file(csv_data_multiple_errors)) + + error_msg = str(ex.value) + # Should catch the Age error first (Age comes before Score alphabetically) + assert "Cannot convert column 'Age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 3: 'invalid_age' cannot be converted to int64" in error_msg + + +def test_csv_reader_cast_column_types_function(): + """Test the _cast_column_types function directly for better isolation.""" + # Create test DataFrame + test_data = { + "name": ["Alice", "Bob", "Charlie"], + "age": ["25", "30", "invalid_age"], + "score": ["95.5", "87.2", "92.1"], + } + df = pd.DataFrame(test_data) + + # Test successful casting + types_success = {"age": "int64", "score": "float64"} + kwargs = {"header": 0} + + # This should work for first two rows, but we'll only test the first two + df_subset = df.iloc[:2].copy() + result_df = CSVReader._cast_column_types(df_subset, types_success, kwargs) + + assert result_df["age"].dtype == "int64" + assert result_df["score"].dtype == "float64" + assert result_df.iloc[0]["age"] == 25 + assert result_df.iloc[0]["score"] == 95.5 + + # Test error case + with pytest.raises(DatabaseUploadFailed) as ex: + CSVReader._cast_column_types(df, types_success, kwargs) + + error_msg = str(ex.value) + assert "Cannot convert column 'age' to int64" in error_msg + assert "Found 1 error(s):" in error_msg + assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg + + +def test_csv_reader_cast_column_types_missing_column(): + """Test _cast_column_types with missing columns.""" + test_data = { + "name": ["Alice", "Bob"], + "age": ["25", "30"], + } + df = pd.DataFrame(test_data) + + # Try to cast a column that doesn't exist + types = {"age": "int64", "nonexistent": "float64"} + kwargs = {"header": 0} + + # Should not raise an error for missing columns + result_df = CSVReader._cast_column_types(df, types, kwargs) + assert result_df["age"].dtype == "int64" + assert "nonexistent" not in result_df.columns + + +def test_csv_reader_cast_column_types_different_numeric_types(): + """Test _cast_column_types with various numeric types.""" + test_data = { + "int32_col": ["1", "2", "3"], + "int64_col": ["100", "200", "300"], + "float32_col": ["1.5", "2.5", "3.5"], + "float64_col": ["10.1", "20.2", "30.3"], + } + df = pd.DataFrame(test_data) + + types = { + "int32_col": "int32", + "int64_col": "int64", + "float32_col": "float32", + "float64_col": "float64", + } + kwargs = {"header": 0} + + result_df = CSVReader._cast_column_types(df, types, kwargs) + + assert result_df["int32_col"].dtype == "int32" + assert result_df["int64_col"].dtype == "int64" + assert result_df["float32_col"].dtype == "float32" + assert result_df["float64_col"].dtype == "float64"