This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new cf270321ba Print row, data present, expected type, and row number in error messages for arrow-csv (#7361)
cf270321ba is described below

commit cf270321baa1681f856b74427fee92fe86a20d74
Author: Pranav Kesavarapu <[email protected]>
AuthorDate: Sat Apr 5 00:41:58 2025 +0530

    Print row, data present, expected type, and row number in error messages for arrow-csv (#7361)
    
    * fix: print verbose parsing errors for easier debugging
    
    * test: test every column in type mismatch arrow-csv
    
    * chore: fix cargo fmt issues
---
 arrow-csv/src/reader/mod.rs                        | 54 ++++++++++++++--------
 arrow-csv/src/reader/records.rs                    | 15 ++++++
 .../data/various_invalid_types/invalid_bool.csv    |  6 +++
 .../invalid_float.csv}                             |  0
 .../data/various_invalid_types/invalid_int.csv     |  6 +++
 .../null_in_non_nullable.csv}                      |  6 +--
 6 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index e3ab013a57..5440c7a86b 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -936,10 +936,12 @@ fn build_primitive_array<T: ArrowPrimitiveType + Parser>(
                 Some(e) => Ok(Some(e)),
                 None => Err(ArrowError::ParseError(format!(
                     // TODO: we should surface the underlying error here.
-                    "Error while parsing value {} for column {} at line {}",
+                    "Error while parsing value '{}' as type '{}' for column {} at line {}. Row data: '{}'",
                     s,
+                    T::DATA_TYPE,
                     col_idx,
-                    line_number + row_index
+                    line_number + row_index,
+                    row
                 ))),
             }
         })
@@ -1022,10 +1024,12 @@ fn build_boolean_array(
                 Some(e) => Ok(Some(e)),
                 None => Err(ArrowError::ParseError(format!(
                     // TODO: we should surface the underlying error here.
-                    "Error while parsing value {} for column {} at line {}",
+                    "Error while parsing value '{}' as type '{}' for column {} at line {}. Row data: '{}'",
                     s,
+                    "Boolean",
                     col_idx,
-                    line_number + row_index
+                    line_number + row_index,
+                    row
                 ))),
             }
         })
@@ -1760,10 +1764,8 @@ mod tests {
         assert_eq!(&DataType::Float64, schema.field(0).data_type());
     }
 
-    #[test]
-    fn test_parse_invalid_csv() {
-        let file = File::open("test/data/various_types_invalid.csv").unwrap();
-
+    fn invalid_csv_helper(file_name: &str) -> String {
+        let file = File::open(file_name).unwrap();
         let schema = Schema::new(vec![
             Field::new("c_int", DataType::UInt64, false),
             Field::new("c_float", DataType::Float32, false),
@@ -1778,16 +1780,32 @@ mod tests {
             .with_projection(vec![0, 1, 2, 3]);
 
         let mut csv = builder.build(file).unwrap();
-        match csv.next() {
-            Some(e) => match e {
-                Err(e) => assert_eq!(
-                    "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")",
-                    format!("{e:?}")
-                ),
-                Ok(_) => panic!("should have failed"),
-            },
-            None => panic!("should have failed"),
-        }
+
+        csv.next().unwrap().unwrap_err().to_string()
+    }
+
+    #[test]
+    fn test_parse_invalid_csv_float() {
+        let file_name = "test/data/various_invalid_types/invalid_float.csv";
+
+        let error = invalid_csv_helper(file_name);
+        assert_eq!("Parser error: Error while parsing value '4.x4' as type 'Float32' for column 1 at line 4. Row data: '[4,4.x4,,false]'", error);
+    }
+
+    #[test]
+    fn test_parse_invalid_csv_int() {
+        let file_name = "test/data/various_invalid_types/invalid_int.csv";
+
+        let error = invalid_csv_helper(file_name);
+        assert_eq!("Parser error: Error while parsing value '2.3' as type 'UInt64' for column 0 at line 2. Row data: '[2.3,2.2,2.22,false]'", error);
+    }
+
+    #[test]
+    fn test_parse_invalid_csv_bool() {
+        let file_name = "test/data/various_invalid_types/invalid_bool.csv";
+
+        let error = invalid_csv_helper(file_name);
+        assert_eq!("Parser error: Error while parsing value 'none' as type 'Boolean' for column 3 at line 2. Row data: '[2,2.2,2.22,none]'", error);
     }
 
     /// Infer the data type of a record
diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs
index a07fc9c94f..33927c9336 100644
--- a/arrow-csv/src/reader/records.rs
+++ b/arrow-csv/src/reader/records.rs
@@ -290,6 +290,21 @@ impl<'a> StringRecord<'a> {
     }
 }
 
+impl std::fmt::Display for StringRecord<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let num_fields = self.offsets.len() - 1;
+        write!(f, "[")?;
+        for i in 0..num_fields {
+            if i > 0 {
+                write!(f, ",")?;
+            }
+            write!(f, "{}", self.get(i))?;
+        }
+        write!(f, "]")?;
+        Ok(())
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::reader::records::RecordDecoder;
diff --git a/arrow-csv/test/data/various_invalid_types/invalid_bool.csv b/arrow-csv/test/data/various_invalid_types/invalid_bool.csv
new file mode 100644
index 0000000000..81fd713df3
--- /dev/null
+++ b/arrow-csv/test/data/various_invalid_types/invalid_bool.csv
@@ -0,0 +1,6 @@
+c_int|c_float|c_string|c_bool
+1|1.1|"1.11"|true
+2|2.2|"2.22"|none
+3|3.3|"3.33"|true
+4|4.4|"4.4"|false
+5|6.6|""|false
diff --git a/arrow-csv/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_invalid_types/invalid_float.csv
similarity index 100%
copy from arrow-csv/test/data/various_types_invalid.csv
copy to arrow-csv/test/data/various_invalid_types/invalid_float.csv
diff --git a/arrow-csv/test/data/various_invalid_types/invalid_int.csv b/arrow-csv/test/data/various_invalid_types/invalid_int.csv
new file mode 100644
index 0000000000..b2046a8994
--- /dev/null
+++ b/arrow-csv/test/data/various_invalid_types/invalid_int.csv
@@ -0,0 +1,6 @@
+c_int|c_float|c_string|c_bool
+1|1.1|"1.11"|true
+2.3|2.2|"2.22"|false
+3|3.3|"3.33"|true
+4|4.4|"4.4"|false
+5|6.6|""|false
diff --git a/arrow-csv/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
similarity index 58%
rename from arrow-csv/test/data/various_types_invalid.csv
rename to arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
index 6f059cb73e..2b4368d860 100644
--- a/arrow-csv/test/data/various_types_invalid.csv
+++ b/arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
@@ -1,6 +1,6 @@
 c_int|c_float|c_string|c_bool
 1|1.1|"1.11"|true
 2|2.2|"2.22"|true
-3||"3.33"|true
-4|4.x4||false
-5|6.6|""|false
\ No newline at end of file
+3|3.3|"3.33"|true
+4|4.4||false
+5|6.6|""|false

Reply via email to