alamb commented on a change in pull request #9705:
URL: https://github.com/apache/arrow/pull/9705#discussion_r599016262
##########
File path: rust/parquet/src/arrow/schema.rs
##########
@@ -959,11 +962,14 @@ mod tests {
Field::new("boolean", DataType::Boolean, false),
Field::new("int8", DataType::Int8, false),
Field::new("int16", DataType::Int16, false),
+ Field::new("uint8", DataType::UInt8, false),
+ Field::new("uint16", DataType::UInt16, false),
Field::new("int32", DataType::Int32, false),
Field::new("int64", DataType::Int64, false),
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
+ Field::new("string_2", DataType::Utf8, true),
Review comment:
is it cool / needed that `UTF8` and `STRING` both map to Arrow
`DataType::Utf8`? It seems like `UTF8` is not actually a valid "logical type"
in parquet -- it should be `STRING`
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string-types
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
.ok_or_else(|| general_err!("Expected name, found None"))?;
// Parse converted type
- let (converted_type, precision, scale) = if let Some("(") =
self.tokenizer.next()
+ let (logical_type, converted_type, precision, scale) = if let
Some("(") =
+ self.tokenizer.next()
{
- let tpe = self
+ let (mut logical, mut converted) = self
.tokenizer
.next()
- .ok_or_else(|| general_err!("Expected converted type, found
None"))
- .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+ .ok_or_else(|| {
+ general_err!("Expected logical or converted type, found
None")
+ })
+ .and_then(|v| {
+ let upper = v.to_uppercase();
+ let logical = upper.parse::<LogicalType>();
+ match logical {
+ Ok(logical) => Ok((
+ Some(logical.clone()),
+ ConvertedType::from(Some(logical)),
+ )),
+ Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+ }
+ })?;
// Parse precision and scale for decimals
let mut precision: i32 = -1;
let mut scale: i32 = -1;
- if tpe == ConvertedType::DECIMAL {
+ // Parse the concrete logical type
+ if let Some(tpe) = &logical {
+ match tpe {
+ LogicalType::DECIMAL(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ precision = parse_i32(
+ self.tokenizer.next(),
+ "Expected precision, found None",
+ "Failed to parse precision for DECIMAL type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ scale = parse_i32(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?; // TODO: this might not cater for the case
of no scale correctly
Review comment:
I don't understand the comment -- maybe worth a JIRA to track for later
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
assert!(result.is_ok());
}
+ #[test]
+ fn test_parse_message_type_integer() {
+ // Invalid integer syntax
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER());
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer syntax, needs both bit-width and UTC sign
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER(32,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer because of non-numeric bit width
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(eight,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(8,false));
+ optional int32 f2 (INTEGER(8,true));
+ optional int32 f3 (INTEGER(16,false));
+ optional int32 f4 (INTEGER(16,true));
+ optional int32 f5 (INTEGER(32,false));
+ optional int32 f6 (INTEGER(32,true));
+ optional int64 f7 (INTEGER(64,false));
+ optional int64 f7 (INTEGER(64,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
+ }
+
+ #[test]
+ fn test_parse_message_type_temporal() {
+ // Invalid timestamp syntax
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP();
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid timestamp syntax, needs both unit and UTC adjustment
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP(MILLIS,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid timestamp because of unknown unit
+ let schema = "
+ message root {
+ optional int64 f1 (TIMESTAMP(YOCTOS,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (DATE);
+ optional int32 f2 (TIME(MILLIS,true));
+ optional int64 f3 (TIME(MICROS,false));
+ optional int64 f4 (TIME(NANOS,true));
+ optional int64 f5 (TIMESTAMP(MILLIS,true));
+ optional int64 f6 (TIMESTAMP(MICROS,true));
+ optional int64 f7 (TIMESTAMP(NANOS,false));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
Review comment:
I recommend also asserting the actual output schema (e.g. that
Time(micros) actually parsed to time(micros) and not some other type)
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
assert!(result.is_ok());
}
+ #[test]
+ fn test_parse_message_type_integer() {
+ // Invalid integer syntax
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER());
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer syntax, needs both bit-width and UTC sign
+ let schema = "
+ message root {
+ optional int64 f1 (INTEGER(32,));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Invalid integer because of non-numeric bit width
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(eight,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_err());
+
+ // Valid types
+ let schema = "
+ message root {
+ optional int32 f1 (INTEGER(8,false));
+ optional int32 f2 (INTEGER(8,true));
+ optional int32 f3 (INTEGER(16,false));
+ optional int32 f4 (INTEGER(16,true));
+ optional int32 f5 (INTEGER(32,false));
+ optional int32 f6 (INTEGER(32,true));
+ optional int64 f7 (INTEGER(64,false));
+ optional int64 f7 (INTEGER(64,true));
+ }
+ ";
+ let mut iter = Tokenizer::from_str(schema);
+ let result = Parser {
+ tokenizer: &mut iter,
+ }
+ .parse_message_type();
+ assert!(result.is_ok());
Review comment:
I recommend testing the actual parsed schema in addition to asserting
that there were no errors in parsing
##########
File path: rust/parquet/src/arrow/schema.rs
##########
@@ -1558,24 +1571,28 @@ mod tests {
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
),
+ Field::new(
+ "ts_nano",
+ DataType::Timestamp(TimeUnit::Nanosecond,
Some("UTC".to_string())),
+ false,
+ ),
];
assert_eq!(arrow_fields, converted_arrow_fields);
}
#[test]
- #[ignore = "To be addressed as part of ARROW-11365"]
Review comment:
🎉
##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
.ok_or_else(|| general_err!("Expected name, found None"))?;
// Parse converted type
- let (converted_type, precision, scale) = if let Some("(") =
self.tokenizer.next()
+ let (logical_type, converted_type, precision, scale) = if let
Some("(") =
+ self.tokenizer.next()
{
- let tpe = self
+ let (mut logical, mut converted) = self
.tokenizer
.next()
- .ok_or_else(|| general_err!("Expected converted type, found
None"))
- .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+ .ok_or_else(|| {
+ general_err!("Expected logical or converted type, found
None")
+ })
+ .and_then(|v| {
+ let upper = v.to_uppercase();
+ let logical = upper.parse::<LogicalType>();
+ match logical {
+ Ok(logical) => Ok((
+ Some(logical.clone()),
+ ConvertedType::from(Some(logical)),
+ )),
+ Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+ }
+ })?;
// Parse precision and scale for decimals
let mut precision: i32 = -1;
let mut scale: i32 = -1;
- if tpe == ConvertedType::DECIMAL {
+ // Parse the concrete logical type
+ if let Some(tpe) = &logical {
+ match tpe {
+ LogicalType::DECIMAL(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ precision = parse_i32(
+ self.tokenizer.next(),
+ "Expected precision, found None",
+ "Failed to parse precision for DECIMAL type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ scale = parse_i32(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?; // TODO: this might not cater for the case
of no scale correctly
+ assert_token(self.tokenizer.next(), ")")?;
+ logical =
Some(LogicalType::DECIMAL(DecimalType {
+ precision,
+ scale,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ scale = 0;
+ logical =
Some(LogicalType::DECIMAL(DecimalType {
+ precision,
+ scale,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ }
+ }
+ }
+ LogicalType::TIME(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let unit = parse_timeunit(
+ self.tokenizer.next(),
+ "Invalid timeunit found",
+ "Failed to parse timeunit for TIME type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ let is_adjusted_to_u_t_c = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for TIME
type",
+ )?;
+ assert_token(self.tokenizer.next(), ")")?;
+ logical = Some(LogicalType::TIME(TimeType {
+ unit,
+ is_adjusted_to_u_t_c,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ // Invalid token for unit
+ self.tokenizer.backtrack();
+ }
+ }
+ }
+ LogicalType::TIMESTAMP(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let unit = parse_timeunit(
+ self.tokenizer.next(),
+ "Invalid timeunit found",
+ "Failed to parse timeunit for TIMESTAMP type",
+ )?;
+ if let Some(",") = self.tokenizer.next() {
+ let is_adjusted_to_u_t_c = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for
TIMESTAMP type",
+ )?;
+ assert_token(self.tokenizer.next(), ")")?;
+ logical =
Some(LogicalType::TIMESTAMP(TimestampType {
+ unit,
+ is_adjusted_to_u_t_c,
+ }));
+ converted =
ConvertedType::from(logical.clone());
+ } else {
+ // Invalid token for unit
+ self.tokenizer.backtrack();
+ }
+ }
+ }
+ LogicalType::INTEGER(_) => {
+ if let Some("(") = self.tokenizer.next() {
+ let bit_width = parse_i32(
+ self.tokenizer.next(),
+ "Invalid bit_width found",
+ "Failed to parse bit_width for INTEGER type",
+ )? as i8;
+ // TODO: check the unit against the physical type
+ if let Some(",") = self.tokenizer.next() {
+ let is_signed = parse_bool(
+ self.tokenizer.next(),
+ "Invalid boolean found",
+ "Failure to parse timezone info for
TIMESTAMP type",
Review comment:
```suggestion
"Failure to parse bit_width info for
INTEGER type",
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]