This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 4389cf95e fix: check overflow numbers while inferring type for csv
files (#6481)
4389cf95e is described below
commit 4389cf95ea1b423225fc38ae8af371a7cbbebf66
Author: Yohan Wal <[email protected]>
AuthorDate: Wed Oct 2 20:22:13 2024 +0800
fix: check overflow numbers while inferring type for csv files (#6481)
* refactor: detect overflow for type inference
* chore: fallback to utf8 and tests
---
arrow-csv/src/reader/mod.rs | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 36f80ec90..d81f1afee 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -215,7 +215,12 @@ impl InferredDataType {
self.packed |= if string.starts_with('"') {
1 << 8 // Utf8
} else if let Some(m) = REGEX_SET.matches(string).into_iter().next() {
- 1 << m
+ if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() {
+ // if the value overflows i64, fall back to Utf8
+ 1 << 8
+ } else {
+ 1 << m
+ }
} else {
1 << 8 // Utf8
}
@@ -1819,6 +1824,8 @@ mod tests {
infer_field_schema("2021-12-19T13:12:30.123456789"),
DataType::Timestamp(TimeUnit::Nanosecond, None)
);
+ assert_eq!(infer_field_schema("–9223372036854775809"), DataType::Utf8);
+ assert_eq!(infer_field_schema("9223372036854775808"), DataType::Utf8);
}
#[test]