This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 4389cf95e fix: check overflow numbers while inferring type for csv files (#6481)
4389cf95e is described below

commit 4389cf95ea1b423225fc38ae8af371a7cbbebf66
Author: Yohan Wal <[email protected]>
AuthorDate: Wed Oct 2 20:22:13 2024 +0800

    fix: check overflow numbers while inferring type for csv files (#6481)
    
    * refactor: detect overflow for type inference
    
    * chore: fallback to utf8 and tests
---
 arrow-csv/src/reader/mod.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 36f80ec90..d81f1afee 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -215,7 +215,12 @@ impl InferredDataType {
         self.packed |= if string.starts_with('"') {
             1 << 8 // Utf8
         } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() {
-            1 << m
+            if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() {
+                // if overflow i64, fallback to utf8
+                1 << 8
+            } else {
+                1 << m
+            }
         } else {
             1 << 8 // Utf8
         }
@@ -1819,6 +1824,8 @@ mod tests {
             infer_field_schema("2021-12-19T13:12:30.123456789"),
             DataType::Timestamp(TimeUnit::Nanosecond, None)
         );
+        assert_eq!(infer_field_schema("–9223372036854775809"), DataType::Utf8);
+        assert_eq!(infer_field_schema("9223372036854775808"), DataType::Utf8);
     }
 
     #[test]

Reply via email to