alamb commented on a change in pull request #9705:
URL: https://github.com/apache/arrow/pull/9705#discussion_r599016262
##########
File path: rust/parquet/src/arrow/schema.rs
##########
@@ -959,11 +962,14 @@ mod tests {
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("int16", DataType::Int16, false),
+ Field::new("uint8", DataType::UInt8, false),
+ Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
+ Field::new("string_2", DataType::Utf8, true),
Review comment:
is it cool / needed that `UTF8` and `STRING` both map to Arrow
`DataType::Utf8`? It seems like `UTF8` is not actually a valid "logical type"
in parquet -- it should be `STRING`
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string-types
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
.ok_or_else(|| general_err!("Expected name, found None"))?;
// Parse converted type
- let (converted_type, precision, scale) = if let Some("(") =
self.tokenizer.next()
+ let (logical_type, converted_type, precision, scale) = if let
Some("(") =
+ self.tokenizer.next()
{
- let tpe = self
+ let (mut logical, mut converted) = self
.tokenizer
.next()
- .ok_or_else(|| general_err!("Expected converted type, found
None"))
- .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+ .ok_or_else(|| {
+ general_err!("Expected logical or converted type, found
None")
+ })
+ .and_then(|v| {
+ let upper = v.to_uppercase();
+ let logical = upper.parse::<LogicalType>();
+ match logical {
+ Ok(logical) => Ok((
+ Some(logical.clone()),
+ ConvertedType::from(Some(logical)),
+ )),
+ Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+ }
+ })?;
// Parse precision and scale for decimals
let mut precision: i32 = -1;
let mut scale: i32 = -1;
- if tpe == ConvertedType::DECIMAL {
+ // Parse the concrete logical type
+ if let Some(tpe) = &logical {
+ match tpe {
+ LogicalType::DECIMAL(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ precision = parse_i32(
+ self.tokenizer.next(),
+ "Expected precision, found None",
+ "Failed to parse precision for DECIMAL type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ scale = parse_i32(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?; // TODO: this might not cater for the case
of no scale correctly
Review comment:
I don't understand the comment -- maybe worth a JIRA to track for later
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
assert!(result.is_ok());
}
+ #[test]
+ fn test_parse_message_type_integer() {
+ // Invalid integer syntax
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER());
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer syntax, needs both bit-width and UTC sign
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER(32,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer because of non-numeric bit width
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(eight,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(8,false));
+ optional int32 f2 (INTEGER(8,true));
+ optional int32 f3 (INTEGER(16,false));
+ optional int32 f4 (INTEGER(16,true));
+ optional int32 f5 (INTEGER(32,false));
+ optional int32 f6 (INTEGER(32,true));
+ optional int64 f7 (INTEGER(64,false));
+ optional int64 f7 (INTEGER(64,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
+ }
+
+ #[test]
+ fn test_parse_message_type_temporal() {
+ // Invalid timestamp syntax
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP();
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid timestamp syntax, needs both unit and UTC adjustment
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP(MILLIS,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid timestamp because of unknown unit
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP(YOCTOS,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (DATE);
+ optional int32 f2 (TIME(MILLIS,true));
+ optional int64 f3 (TIME(MICROS,false));
+ optional int64 f4 (TIME(NANOS,true));
+ optional int64 f5 (TIMESTAMP(MILLIS,true));
+ optional int64 f6 (TIMESTAMP(MICROS,true));
+ optional int64 f7 (TIMESTAMP(NANOS,false));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
Review comment:
I recommend also asserting the actual output schema (e.g. that
Time(micros) actually parsed to time(micros) and not some other type)
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
assert!(result.is_ok());
}
+ #[test]
+ fn test_parse_message_type_integer() {
+ // Invalid integer syntax
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER());
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer syntax, needs both bit-width and UTC sign
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER(32,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer because of non-numeric bit width
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(eight,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(8,false));
+ optional int32 f2 (INTEGER(8,true));
+ optional int32 f3 (INTEGER(16,false));
+ optional int32 f4 (INTEGER(16,true));
+ optional int32 f5 (INTEGER(32,false));
+ optional int32 f6 (INTEGER(32,true));
+ optional int64 f7 (INTEGER(64,false));
+ optional int64 f7 (INTEGER(64,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
Review comment:
I recommend testing the actual parsed schema in addition to asserting
that there were no errors in parsing
##########
File path: rust/parquet/src/arrow/schema.rs
##########
@@ -1558,24 +1571,28 @@ mod tests {
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
),
+ Field::new(
+ "ts_nano",
+ DataType::Timestamp(TimeUnit::Nanosecond,
Some("UTC".to_string())),
+ false,
+ ),
];
assert_eq!(arrow_fields, converted_arrow_fields);
}
#[test]
- #[ignore = "To be addressed as part of ARROW-11365"]
Review comment:
🎉
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
.ok_or_else(|| general_err!("Expected name, found None"))?;
// Parse converted type
- let (converted_type, precision, scale) = if let Some("(") =
self.tokenizer.next()
+ let (logical_type, converted_type, precision, scale) = if let
Some("(") =
+ self.tokenizer.next()
{
- let tpe = self
+ let (mut logical, mut converted) = self
.tokenizer
.next()
- .ok_or_else(|| general_err!("Expected converted type, found
None"))
- .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+ .ok_or_else(|| {
+ general_err!("Expected logical or converted type, found
None")
+ })
+ .and_then(|v| {
+ let upper = v.to_uppercase();
+ let logical = upper.parse::<LogicalType>();
+ match logical {
+ Ok(logical) => Ok((
+ Some(logical.clone()),
+ ConvertedType::from(Some(logical)),
+ )),
+ Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+ }
+ })?;
// Parse precision and scale for decimals
let mut precision: i32 = -1;
let mut scale: i32 = -1;
- if tpe == ConvertedType::DECIMAL {
+ // Parse the concrete logical type
+ if let Some(tpe) = &logical {
+ match tpe {
+ LogicalType::DECIMAL(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ precision = parse_i32(
+ self.tokenizer.next(),
+ "Expected precision, found None",
+ "Failed to parse precision for DECIMAL type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ scale = parse_i32(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?; // TODO: this might not cater for the case
of no scale correctly
+ assert_token(self.tokenizer.next(), ")")?;
+ logical =
Some(LogicalType::DECIMAL(DecimalType {
+ precision,
+ scale,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ scale = 0;
+ logical =
Some(LogicalType::DECIMAL(DecimalType {
+ precision,
+ scale,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ }
+ }
+ }
+ LogicalType::TIME(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let unit = parse_timeunit(
+ self.tokenizer.next(),
+ "Invalid timeunit found",
+ "Failed to parse timeunit for TIME type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ let is_adjusted_to_u_t_c = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?;
+ assert_token(self.tokenizer.next(), ")")?;
+ logical = Some(LogicalType::TIME(TimeType {
+ unit,
+ is_adjusted_to_u_t_c,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ // Invalid token for unit
+ self.tokenizer.backtrack();
+ }
+ }
+ }
+ LogicalType::TIMESTAMP(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let unit = parse_timeunit(
+ self.tokenizer.next(),
+ "Invalid timeunit found",
+ "Failed to parse timeunit for TIMESTAMP type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ let is_adjusted_to_u_t_c = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for
TIMESTAMP type",
+ )?;
+ assert_token(self.tokenizer.next(), ")")?;
+ logical =
Some(LogicalType::TIMESTAMP(TimestampType {
+ unit,
+ is_adjusted_to_u_t_c,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ // Invalid token for unit
+ self.tokenizer.backtrack();
+ }
+ }
+ }
+ LogicalType::INTEGER(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let bit_width = parse_i32(
+ self.tokenizer.next(),
+ "Invalid bit_width found",
+ "Failed to parse bit_width for INTEGER type",
+ )? as i8;
+ // TODO: check the unit against the physical type
+ if let Some(",") = self.tokenizer.next() {
+ let is_signed = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for
TIMESTAMP type",
Review comment:
```suggestion
"Failure to parse bit_width info for
INTEGER type",
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]