alamb commented on a change in pull request #941:
URL: https://github.com/apache/arrow-rs/pull/941#discussion_r748224224



##########
File path: arrow/src/csv/reader.rs
##########
@@ -513,6 +514,9 @@ fn parse(
             let field = &fields[i];
             match field.data_type() {
                 DataType::Boolean => build_boolean_array(line_number, rows, i),
+                DataType::Decimal(p, v) => {

Review comment:
       I think using `DataType::Decimal(precision, scale)` rather than 
`DataType::Decimal(p, v)` would be more consistent with the rest of the codebase

##########
File path: arrow/src/csv/reader.rs
##########
@@ -728,6 +732,87 @@ fn parse_bool(string: &str) -> Option<bool> {
     }
 }
 
+// parse the column string to an Arrow Array
+fn build_decimal_array(
+    line_number: usize,
+    rows: &[StringRecord],
+    col_idx: usize,
+    precision: usize,
+    scale: usize,
+) -> Result<ArrayRef> {
+    let mut decimal_builder = DecimalBuilder::new(line_number, precision, 
scale);
+    for row in rows {
+        let col_s = row.get(col_idx);
+        match col_s {
+            None => {
+                // No data for this row
+                decimal_builder.append_null()?;
+            }
+            Some(s) => {
+                if s.is_empty() {
+                    // append null
+                    decimal_builder.append_null()?;
+                } else {
+                    let decimal_value: Result<i128> = parse_decimal(s);
+                    match decimal_value {
+                        Ok(v) => {
+                            decimal_builder.append_value(v)?;
+                        }
+                        Err(e) => {
+                            return Err(e);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    Ok(Arc::new(decimal_builder.finish()))
+}
+
+// parse the string format decimal value to i128 format.

Review comment:
       I am surprised that we don't already have a `&str` --> i128 parser, but 
I also could not find one.
   
   
   

##########
File path: arrow/src/csv/reader.rs
##########
@@ -1055,6 +1140,31 @@ mod tests {
         assert_eq!(&metadata, batch.schema().metadata());
     }
 
+    #[test]
+    fn test_csv_reader_with_decimal() {
+        let schema = Schema::new(vec![
+            Field::new("city", DataType::Utf8, false),
+            Field::new("lat", DataType::Decimal(26, 6), false),
+            Field::new("lng", DataType::Decimal(26, 6), false),
+        ]);
+
+        let file = File::open("test/data/uk_cities.csv").unwrap();
+
+        let mut csv = Reader::new(file, Arc::new(schema), false, None, 1024, 
None, None);
+        let batch = csv.next().unwrap().unwrap();
+        // access data from a primitive array
+        let lat = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<DecimalArray>()
+            .unwrap();
+
+        assert_eq!("57.653484", lat.value_as_string(0));
+        assert_eq!("53.002666", lat.value_as_string(1));
+        assert_eq!("52.412811", lat.value_as_string(2));
+        assert_eq!("51.481583", lat.value_as_string(3))
+    }

Review comment:
       Could you also add some tests where the full precision is not supplied? 
For example, try parsing 
   
   ```
   57.65
   ```
   
   (only 2 places after the decimal)
   
   I think this implementation may end up producing a decimal of `0.005765` 
rather than `57.65000`
   
   Also, we should test 
   
   ```
   56.3838748284
   ```
   (or something else that has more data after the decimal than the declared 
scale)

##########
File path: arrow/src/csv/reader.rs
##########
@@ -728,6 +732,87 @@ fn parse_bool(string: &str) -> Option<bool> {
     }
 }
 
+// parse the column string to an Arrow Array
+fn build_decimal_array(
+    line_number: usize,
+    rows: &[StringRecord],
+    col_idx: usize,
+    precision: usize,
+    scale: usize,
+) -> Result<ArrayRef> {
+    let mut decimal_builder = DecimalBuilder::new(line_number, precision, 
scale);

Review comment:
       the first argument to `DecimalBuilder` is the capacity (aka the expected 
size of the array). So I think it would be more performant to use `rows.size()` 
rather than `line_number` here
   
   https://docs.rs/arrow/6.1.0/arrow/array/struct.DecimalBuilder.html#method.new

##########
File path: arrow/src/csv/reader.rs
##########
@@ -728,6 +732,87 @@ fn parse_bool(string: &str) -> Option<bool> {
     }
 }
 
+// parse the column string to an Arrow Array
+fn build_decimal_array(
+    line_number: usize,
+    rows: &[StringRecord],
+    col_idx: usize,
+    precision: usize,
+    scale: usize,
+) -> Result<ArrayRef> {
+    let mut decimal_builder = DecimalBuilder::new(line_number, precision, 
scale);
+    for row in rows {
+        let col_s = row.get(col_idx);
+        match col_s {
+            None => {
+                // No data for this row
+                decimal_builder.append_null()?;
+            }
+            Some(s) => {
+                if s.is_empty() {
+                    // append null
+                    decimal_builder.append_null()?;
+                } else {
+                    let decimal_value: Result<i128> = parse_decimal(s);
+                    match decimal_value {
+                        Ok(v) => {
+                            decimal_builder.append_value(v)?;
+                        }
+                        Err(e) => {
+                            return Err(e);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    Ok(Arc::new(decimal_builder.finish()))
+}
+
+// parse the string format decimal value to i128 format.
+// like "125.12" to 12512_i128.
+fn parse_decimal(s: &str) -> Result<i128> {
+    if DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        // each byte is digit、'-' or '.'
+        let bytes = s.as_bytes();
+        let mut negitive = false;
+        let mut result: i128 = 0;

Review comment:
       I think you could probably use `str::parse` here (after first removing 
all `'.'`) rather than a custom parsing loop
   
   For example:
   ```rust
       let i: i128 = "123".parse().unwrap();
       println!("i is {}", i);
   ```
   
   prints `"123"` 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to