This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new cf270321ba Print row, data present, expected type, and row number in error messages for arrow-csv (#7361)
cf270321ba is described below
commit cf270321baa1681f856b74427fee92fe86a20d74
Author: Pranav Kesavarapu <[email protected]>
AuthorDate: Sat Apr 5 00:41:58 2025 +0530
Print row, data present, expected type, and row number in error messages for arrow-csv (#7361)
* fix: print verbose parsing errors for easier debugging
* test: test every column in type mismatch arrow-csv
* chore: fix cargo fmt issues
---
arrow-csv/src/reader/mod.rs | 54 ++++++++++++++--------
arrow-csv/src/reader/records.rs | 15 ++++++
.../data/various_invalid_types/invalid_bool.csv | 6 +++
.../invalid_float.csv} | 0
.../data/various_invalid_types/invalid_int.csv | 6 +++
.../null_in_non_nullable.csv} | 6 +--
6 files changed, 66 insertions(+), 21 deletions(-)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index e3ab013a57..5440c7a86b 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -936,10 +936,12 @@ fn build_primitive_array<T: ArrowPrimitiveType + Parser>(
Some(e) => Ok(Some(e)),
None => Err(ArrowError::ParseError(format!(
// TODO: we should surface the underlying error here.
- "Error while parsing value {} for column {} at line {}",
+ "Error while parsing value '{}' as type '{}' for column {} at line {}. Row data: '{}'",
s,
+ T::DATA_TYPE,
col_idx,
- line_number + row_index
+ line_number + row_index,
+ row
))),
}
})
@@ -1022,10 +1024,12 @@ fn build_boolean_array(
Some(e) => Ok(Some(e)),
None => Err(ArrowError::ParseError(format!(
// TODO: we should surface the underlying error here.
- "Error while parsing value {} for column {} at line {}",
+ "Error while parsing value '{}' as type '{}' for column {} at line {}. Row data: '{}'",
s,
+ "Boolean",
col_idx,
- line_number + row_index
+ line_number + row_index,
+ row
))),
}
})
@@ -1760,10 +1764,8 @@ mod tests {
assert_eq!(&DataType::Float64, schema.field(0).data_type());
}
- #[test]
- fn test_parse_invalid_csv() {
- let file = File::open("test/data/various_types_invalid.csv").unwrap();
-
+ fn invalid_csv_helper(file_name: &str) -> String {
+ let file = File::open(file_name).unwrap();
let schema = Schema::new(vec![
Field::new("c_int", DataType::UInt64, false),
Field::new("c_float", DataType::Float32, false),
@@ -1778,16 +1780,32 @@ mod tests {
.with_projection(vec![0, 1, 2, 3]);
let mut csv = builder.build(file).unwrap();
- match csv.next() {
- Some(e) => match e {
- Err(e) => assert_eq!(
- "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")",
- format!("{e:?}")
- ),
- Ok(_) => panic!("should have failed"),
- },
- None => panic!("should have failed"),
- }
+
+ csv.next().unwrap().unwrap_err().to_string()
+ }
+
+ #[test]
+ fn test_parse_invalid_csv_float() {
+ let file_name = "test/data/various_invalid_types/invalid_float.csv";
+
+ let error = invalid_csv_helper(file_name);
+ assert_eq!("Parser error: Error while parsing value '4.x4' as type 'Float32' for column 1 at line 4. Row data: '[4,4.x4,,false]'", error);
+ }
+
+ #[test]
+ fn test_parse_invalid_csv_int() {
+ let file_name = "test/data/various_invalid_types/invalid_int.csv";
+
+ let error = invalid_csv_helper(file_name);
+ assert_eq!("Parser error: Error while parsing value '2.3' as type 'UInt64' for column 0 at line 2. Row data: '[2.3,2.2,2.22,false]'", error);
+ }
+
+ #[test]
+ fn test_parse_invalid_csv_bool() {
+ let file_name = "test/data/various_invalid_types/invalid_bool.csv";
+
+ let error = invalid_csv_helper(file_name);
+ assert_eq!("Parser error: Error while parsing value 'none' as type 'Boolean' for column 3 at line 2. Row data: '[2,2.2,2.22,none]'", error);
}
/// Infer the data type of a record
diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs
index a07fc9c94f..33927c9336 100644
--- a/arrow-csv/src/reader/records.rs
+++ b/arrow-csv/src/reader/records.rs
@@ -290,6 +290,21 @@ impl<'a> StringRecord<'a> {
}
}
+impl std::fmt::Display for StringRecord<'_> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ let num_fields = self.offsets.len() - 1;
+ write!(f, "[")?;
+ for i in 0..num_fields {
+ if i > 0 {
+ write!(f, ",")?;
+ }
+ write!(f, "{}", self.get(i))?;
+ }
+ write!(f, "]")?;
+ Ok(())
+ }
+}
+
#[cfg(test)]
mod tests {
use crate::reader::records::RecordDecoder;
diff --git a/arrow-csv/test/data/various_invalid_types/invalid_bool.csv b/arrow-csv/test/data/various_invalid_types/invalid_bool.csv
new file mode 100644
index 0000000000..81fd713df3
--- /dev/null
+++ b/arrow-csv/test/data/various_invalid_types/invalid_bool.csv
@@ -0,0 +1,6 @@
+c_int|c_float|c_string|c_bool
+1|1.1|"1.11"|true
+2|2.2|"2.22"|none
+3|3.3|"3.33"|true
+4|4.4|"4.4"|false
+5|6.6|""|false
diff --git a/arrow-csv/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_invalid_types/invalid_float.csv
similarity index 100%
copy from arrow-csv/test/data/various_types_invalid.csv
copy to arrow-csv/test/data/various_invalid_types/invalid_float.csv
diff --git a/arrow-csv/test/data/various_invalid_types/invalid_int.csv b/arrow-csv/test/data/various_invalid_types/invalid_int.csv
new file mode 100644
index 0000000000..b2046a8994
--- /dev/null
+++ b/arrow-csv/test/data/various_invalid_types/invalid_int.csv
@@ -0,0 +1,6 @@
+c_int|c_float|c_string|c_bool
+1|1.1|"1.11"|true
+2.3|2.2|"2.22"|false
+3|3.3|"3.33"|true
+4|4.4|"4.4"|false
+5|6.6|""|false
diff --git a/arrow-csv/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
similarity index 58%
rename from arrow-csv/test/data/various_types_invalid.csv
rename to arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
index 6f059cb73e..2b4368d860 100644
--- a/arrow-csv/test/data/various_types_invalid.csv
+++ b/arrow-csv/test/data/various_invalid_types/null_in_non_nullable.csv
@@ -1,6 +1,6 @@
c_int|c_float|c_string|c_bool
1|1.1|"1.11"|true
2|2.2|"2.22"|true
-3||"3.33"|true
-4|4.x4||false
-5|6.6|""|false
\ No newline at end of file
+3|3.3|"3.33"|true
+4|4.4||false
+5|6.6|""|false