This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e72875e Update DECIMAL_RE to allow scientific notation in auto
inferred schemas (#1216)
e72875e is described below
commit e72875e4b465f9d4eae1b852051e977488f97796
Author: Patrick More <[email protected]>
AuthorDate: Sat Jan 22 12:18:10 2022 -0800
Update DECIMAL_RE to allow scientific notation in auto inferred schemas
(#1216)
* Update DECIMAL_RE to allow scientific notation in auto inferred schemas
* Fixed format lint
---
arrow/src/csv/reader.rs | 8 +++++---
arrow/test/data/various_types.csv | 4 +++-
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs
index 269e55f..0ade29c 100644
--- a/arrow/src/csv/reader.rs
+++ b/arrow/src/csv/reader.rs
@@ -64,7 +64,8 @@ use std::ops::Neg;
lazy_static! {
static ref PARSE_DECIMAL_RE: Regex =
Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
- static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d*\.\d+|\d+\.\d*)$").unwrap();
+ static ref DECIMAL_RE: Regex =
+ Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap();
static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap();
static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$")
.case_insensitive(true)
@@ -1570,7 +1571,7 @@ mod tests {
let mut csv = builder.build(file).unwrap();
let batch = csv.next().unwrap().unwrap();
- assert_eq!(5, batch.num_rows());
+ assert_eq!(7, batch.num_rows());
assert_eq!(6, batch.num_columns());
let schema = batch.schema();
@@ -1872,6 +1873,7 @@ mod tests {
writeln!(csv1, "c1,c2,c3")?;
writeln!(csv1, "1,\"foo\",0.5")?;
writeln!(csv1, "3,\"bar\",1")?;
+ writeln!(csv1, "3,\"bar\",2e-06")?;
// reading csv2 will set c2 to optional
writeln!(csv2, "c1,c2,c3,c4")?;
writeln!(csv2, "10,,3.14,true")?;
@@ -1887,7 +1889,7 @@ mod tests {
csv4.path().to_str().unwrap().to_string(),
],
b',',
- Some(3), // only csv1 and csv2 should be read
+ Some(4), // only csv1 and csv2 should be read
true,
)?;
diff --git a/arrow/test/data/various_types.csv b/arrow/test/data/various_types.csv
index 8f4466f..570d07f 100644
--- a/arrow/test/data/various_types.csv
+++ b/arrow/test/data/various_types.csv
@@ -3,4 +3,6 @@ c_int|c_float|c_string|c_bool|c_date|c_datetime
2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00
3||"3.33"|true|1969-12-31|1969-11-08T02:00:00
4|4.4||false||
-5|6.6|""|false|1990-01-01|1990-01-01T03:00:00
\ No newline at end of file
+5|6.6|""|false|1990-01-01|1990-01-01T03:00:00
+4|4e6||false||
+4|4.0e-6||false||
\ No newline at end of file