alamb commented on a change in pull request #941:
URL: https://github.com/apache/arrow-rs/pull/941#discussion_r749359536



##########
File path: arrow/src/csv/reader.rs
##########
@@ -1503,6 +1586,54 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_parse_decimal_with_parameter() {
+        let tests = [
+            ("123.123", 123123i128),
+            ("123.1234", 123123i128),
+            ("123.1", 123100i128),
+            ("123", 123000i128),
+            ("-123.123", -123123i128),
+            ("-123.1234", -123123i128),
+            ("-123.1", -123100i128),
+            ("-123", -123000i128),
+            ("0.0000123", 0i128),
+            ("12.", 12000i128),
+            ("-12.", -12000i128),
+            ("00.1", 100i128),
+            ("-00.1", -100i128),
+            ("12345678912345678.1234", 12345678912345678123i128),
+            ("-12345678912345678.1234", -12345678912345678123i128),
+            ("99999999999999999.999", 99999999999999999999i128),
+            ("-99999999999999999.999", -99999999999999999999i128),
+            (".123", 123i128),
+            ("-.123", -123i128),
+            ("123.", 123000i128),
+            ("-123.", -123000i128),
+        ];
+        for (s, i) in tests {
+            let result = parse_decimal_with_parameter(s, 20, 3);
+            assert_eq!(i, result.unwrap())
+        }
+        let can_not_parse_tests = ["123,123", "."];

Review comment:
       I recommend a test with two `.` in it as well, such as `123.123.123`

##########
File path: arrow/src/csv/reader.rs
##########
@@ -769,20 +773,93 @@ fn build_decimal_array(
     Ok(Arc::new(decimal_builder.finish()))
 }
 
-// parse the string format decimal value to i128 format.
-// like "125.12" to 12512_i128.
+// Parse the string format decimal value to i128 format and checking the 
precision and scale.
+// The result i128 value can't be out of bounds.
+fn parse_decimal_with_parameter(s: &str, precision: usize, scale: usize) -> 
Result<i128> {
+    if PARSE_DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        let len = s.len();
+        // each byte is digit、'-' or '.'
+        let mut base = 1;
+
+        // handle the value after the '.' and meet the scale
+        let delimiter_position = s.find('.');

Review comment:
       I wonder what will happen if there are two `'.'` in the string - like 
`123.456.789` (I would expect a parsing error in this case).

##########
File path: arrow/src/csv/reader.rs
##########
@@ -769,20 +773,93 @@ fn build_decimal_array(
     Ok(Arc::new(decimal_builder.finish()))
 }
 
-// parse the string format decimal value to i128 format.
-// like "125.12" to 12512_i128.
+// Parse the string format decimal value to i128 format and checking the 
precision and scale.
+// The result i128 value can't be out of bounds.
+fn parse_decimal_with_parameter(s: &str, precision: usize, scale: usize) -> 
Result<i128> {
+    if PARSE_DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        let len = s.len();
+        // each byte is digit、'-' or '.'
+        let mut base = 1;
+
+        // handle the value after the '.' and meet the scale
+        let delimiter_position = s.find('.');
+        match delimiter_position {
+            None => {
+                // there is no '.'
+                base = 10_i128.pow(scale as u32);
+            }
+            Some(mid) => {
+                // there is the '.'
+                if len - mid >= scale + 1 {
+                    // If the string value is "123.12345" and the scale is 2, 
we should just remain '.12' and drop the '345' value.
+                    offset -= len - mid - 1 - scale;
+                } else {
+                    // If the string value is "123.12" and the scale is 4, we 
should append '00' to the tail.
+                    base = 10_i128.pow((scale + 1 + mid - len) as u32);
+                }
+            }
+        };
+
+        let bytes = s.as_bytes();
+        let mut negative = false;
+        let mut result: i128 = 0;
+
+        while offset > 0 {

Review comment:
       Something that might make this code faster (and more idiomatic Rust) 
would be to avoid the `bytes[offset - 1]` indexing (which does a bounds check).
   
   It might look something like this (untested):
   
   ```rust
   
   bytes[0..offset].iter().rev()
     .try_for_each(|b| {
       match b {
         b'-' => negative = true,
         b'.' => {},
         ...
       }
     })?;
   ```

##########
File path: arrow/src/csv/reader.rs
##########
@@ -1503,6 +1586,54 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_parse_decimal_with_parameter() {

Review comment:
       👍 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to