alamb commented on code in PR #2626: URL: https://github.com/apache/arrow-rs/pull/2626#discussion_r967107935
########## parquet/CONTRIBUTING.md: ########## @@ -60,7 +60,18 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps`. To compile and view in the browser, run `cargo doc --no-deps --open`. -## Update Supported Parquet Version +## Update Parquet Format -To update Parquet format to a newer version, check if [parquet-format](https://github.com/sunchao/parquet-format-rs) -version is available. Then simply update version of `parquet-format` crate in Cargo.toml. +To generate the parquet format code run Review Comment: ```suggestion To generate the parquet format (thrift definitions) code run ``` ########## .gitattributes: ########## @@ -1,6 +1 @@ -r/R/RcppExports.R linguist-generated=true -r/R/arrowExports.R linguist-generated=true -r/src/RcppExports.cpp linguist-generated=true -r/src/arrowExports.cpp linguist-generated=true -r/man/*.Rd linguist-generated=true - +parquet/src/format.rs linguist-generated Review Comment: 👍 ########## parquet/src/basic.rs: ########## @@ -730,113 +736,129 @@ impl From<Option<LogicalType>> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From<parquet::FieldRepetitionType> for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom<parquet::FieldRepetitionType> for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result<Self> { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => return Err(general_err!("unexpected repetition type: {}", value.0)), Review Comment: ```suggestion _ => return Err(general_err!("unexpected parquet repetition type: {}", value.0)), ``` ########## parquet/src/basic.rs: ########## @@ -496,105 +496,111 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl From<parquet::Type> for Type { - fn from(value: parquet::Type) -> Self { - match value { - parquet::Type::Boolean => Type::BOOLEAN, - parquet::Type::Int32 => Type::INT32, - parquet::Type::Int64 => Type::INT64, - parquet::Type::Int96 => Type::INT96, - parquet::Type::Float => Type::FLOAT, - parquet::Type::Double => Type::DOUBLE, - parquet::Type::ByteArray => Type::BYTE_ARRAY, - parquet::Type::FixedLenByteArray => Type::FIXED_LEN_BYTE_ARRAY, - } +impl TryFrom<parquet::Type> for Type { + type Error = ParquetError; + + fn try_from(value: parquet::Type) -> Result<Self> { + Ok(match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + _ => return Err(general_err!("unexpected type: {}", value.0)), + }) } } impl From<Type> for parquet::Type { fn from(value: Type) -> Self { match value { - Type::BOOLEAN => parquet::Type::Boolean, - Type::INT32 => parquet::Type::Int32, - Type::INT64 => parquet::Type::Int64, - Type::INT96 => parquet::Type::Int96, - Type::FLOAT => parquet::Type::Float, - Type::DOUBLE => parquet::Type::Double, - Type::BYTE_ARRAY => parquet::Type::ByteArray, - Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FixedLenByteArray, + Type::BOOLEAN => parquet::Type::BOOLEAN, + Type::INT32 => parquet::Type::INT32, + Type::INT64 => parquet::Type::INT64, + Type::INT96 => parquet::Type::INT96, + Type::FLOAT => parquet::Type::FLOAT, + Type::DOUBLE => parquet::Type::DOUBLE, + Type::BYTE_ARRAY => parquet::Type::BYTE_ARRAY, + Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FIXED_LEN_BYTE_ARRAY, } } } // ---------------------------------------------------------------------- // parquet::ConvertedType <=> ConvertedType conversion -impl From<Option<parquet::ConvertedType>> for ConvertedType { - fn from(option: Option<parquet::ConvertedType>) -> Self { - match option { +impl TryFrom<Option<parquet::ConvertedType>> for ConvertedType { + type Error = ParquetError; + + fn try_from(option: Option<parquet::ConvertedType>) -> Result<Self> { + Ok(match option { None => ConvertedType::NONE, Some(value) => match value { - parquet::ConvertedType::Utf8 => ConvertedType::UTF8, - parquet::ConvertedType::Map => ConvertedType::MAP, - parquet::ConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, - parquet::ConvertedType::List => ConvertedType::LIST, - parquet::ConvertedType::Enum => ConvertedType::ENUM, - parquet::ConvertedType::Decimal => ConvertedType::DECIMAL, - parquet::ConvertedType::Date => ConvertedType::DATE, - parquet::ConvertedType::TimeMillis => ConvertedType::TIME_MILLIS, - parquet::ConvertedType::TimeMicros => ConvertedType::TIME_MICROS, - parquet::ConvertedType::TimestampMillis => { + parquet::ConvertedType::UTF8 => ConvertedType::UTF8, + parquet::ConvertedType::MAP => ConvertedType::MAP, + parquet::ConvertedType::MAP_KEY_VALUE => ConvertedType::MAP_KEY_VALUE, + parquet::ConvertedType::LIST => ConvertedType::LIST, + parquet::ConvertedType::ENUM => ConvertedType::ENUM, + parquet::ConvertedType::DECIMAL => ConvertedType::DECIMAL, + parquet::ConvertedType::DATE => ConvertedType::DATE, + parquet::ConvertedType::TIME_MILLIS => ConvertedType::TIME_MILLIS, + parquet::ConvertedType::TIME_MICROS => ConvertedType::TIME_MICROS, + parquet::ConvertedType::TIMESTAMP_MILLIS => { ConvertedType::TIMESTAMP_MILLIS } - parquet::ConvertedType::TimestampMicros => { + parquet::ConvertedType::TIMESTAMP_MICROS => { ConvertedType::TIMESTAMP_MICROS } - parquet::ConvertedType::Uint8 => ConvertedType::UINT_8, - parquet::ConvertedType::Uint16 => ConvertedType::UINT_16, - parquet::ConvertedType::Uint32 => ConvertedType::UINT_32, - parquet::ConvertedType::Uint64 => ConvertedType::UINT_64, - parquet::ConvertedType::Int8 => ConvertedType::INT_8, - parquet::ConvertedType::Int16 => ConvertedType::INT_16, - parquet::ConvertedType::Int32 => ConvertedType::INT_32, - parquet::ConvertedType::Int64 => ConvertedType::INT_64, - parquet::ConvertedType::Json => ConvertedType::JSON, - parquet::ConvertedType::Bson => ConvertedType::BSON, - parquet::ConvertedType::Interval => ConvertedType::INTERVAL, + parquet::ConvertedType::UINT_8 => ConvertedType::UINT_8, + parquet::ConvertedType::UINT_16 => ConvertedType::UINT_16, + parquet::ConvertedType::UINT_32 => ConvertedType::UINT_32, + parquet::ConvertedType::UINT_64 => ConvertedType::UINT_64, + parquet::ConvertedType::INT_8 => ConvertedType::INT_8, + parquet::ConvertedType::INT_16 => ConvertedType::INT_16, + parquet::ConvertedType::INT_32 => ConvertedType::INT_32, + parquet::ConvertedType::INT_64 => ConvertedType::INT_64, + parquet::ConvertedType::JSON => ConvertedType::JSON, + parquet::ConvertedType::BSON => ConvertedType::BSON, + parquet::ConvertedType::INTERVAL => ConvertedType::INTERVAL, + _ => return Err(general_err!("unexpected converted type: {}", value.0)), }, - } + }) } } impl From<ConvertedType> for Option<parquet::ConvertedType> { fn from(value: ConvertedType) -> Self { match value { ConvertedType::NONE => None, - ConvertedType::UTF8 => Some(parquet::ConvertedType::Utf8), - ConvertedType::MAP => Some(parquet::ConvertedType::Map), - ConvertedType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), - ConvertedType::LIST => Some(parquet::ConvertedType::List), - ConvertedType::ENUM => Some(parquet::ConvertedType::Enum), - ConvertedType::DECIMAL => Some(parquet::ConvertedType::Decimal), - ConvertedType::DATE => Some(parquet::ConvertedType::Date), - ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), - ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), + ConvertedType::UTF8 => Some(parquet::ConvertedType::UTF8), Review Comment: It is somewhat annoying that the thrift compiler changed these enum variants to be all caps 😱 ########## parquet/src/basic.rs: ########## @@ -730,113 +736,129 @@ impl From<Option<LogicalType>> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From<parquet::FieldRepetitionType> for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom<parquet::FieldRepetitionType> for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result<Self> { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => return Err(general_err!("unexpected repetition type: {}", value.0)), + }) } } impl From<Repetition> for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { - Repetition::REQUIRED => parquet::FieldRepetitionType::Required, - Repetition::OPTIONAL => parquet::FieldRepetitionType::Optional, - Repetition::REPEATED => parquet::FieldRepetitionType::Repeated, + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, } } } // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl From<parquet::Encoding> for Encoding { - fn from(value: parquet::Encoding) -> Self { - match value { - parquet::Encoding::Plain => Encoding::PLAIN, - parquet::Encoding::PlainDictionary => Encoding::PLAIN_DICTIONARY, - parquet::Encoding::Rle => Encoding::RLE, - parquet::Encoding::BitPacked => Encoding::BIT_PACKED, - parquet::Encoding::DeltaBinaryPacked => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DeltaLengthByteArray => Encoding::DELTA_LENGTH_BYTE_ARRAY, - parquet::Encoding::DeltaByteArray => Encoding::DELTA_BYTE_ARRAY, - parquet::Encoding::RleDictionary => Encoding::RLE_DICTIONARY, - parquet::Encoding::ByteStreamSplit => Encoding::BYTE_STREAM_SPLIT, - } +impl TryFrom<parquet::Encoding> for Encoding { + type Error = ParquetError; + + fn try_from(value: parquet::Encoding) -> Result<Self> { + Ok(match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + parquet::Encoding::BYTE_STREAM_SPLIT => Encoding::BYTE_STREAM_SPLIT, + _ => return Err(general_err!("unexpected encoding: {}", value.0)), Review Comment: I was curious about the other values (as in what variants are `_` matching) so I commented it out: ``` Compiling parquet v22.0.0 (/Users/alamb/Software/arrow-rs/parquet) error[E0004]: non-exhaustive patterns: `Encoding(i32::MIN..=-1_i32)`, `Encoding(1_i32)` and `Encoding(10_i32..=i32::MAX)` not covered --> parquet/src/basic.rs:769:18 | 769 | Ok(match value { | ^^^^^ patterns `Encoding(i32::MIN..=-1_i32)`, `Encoding(1_i32)` and `Encoding(10_i32..=i32::MAX)` not covered | ``` In case any other reviewer was interestd ########## parquet/src/basic.rs: ########## @@ -496,105 +496,111 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl From<parquet::Type> for Type { - fn from(value: parquet::Type) -> Self { - match value { - parquet::Type::Boolean => Type::BOOLEAN, - parquet::Type::Int32 => Type::INT32, - parquet::Type::Int64 => Type::INT64, - parquet::Type::Int96 => Type::INT96, - parquet::Type::Float => Type::FLOAT, - parquet::Type::Double => Type::DOUBLE, - parquet::Type::ByteArray => Type::BYTE_ARRAY, - parquet::Type::FixedLenByteArray => Type::FIXED_LEN_BYTE_ARRAY, - } +impl TryFrom<parquet::Type> for Type { + type Error = ParquetError; + + fn try_from(value: parquet::Type) -> Result<Self> { + Ok(match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + _ => return Err(general_err!("unexpected type: {}", value.0)), Review Comment: ```suggestion _ => return Err(general_err!("unexpected parquet type: {}", value.0)), ``` ########## parquet/src/lib.rs: ########## @@ -61,6 +61,9 @@ macro_rules! experimental { pub mod errors; pub mod basic; +#[allow(clippy::derivable_impls, clippy::match_single_binding)] +pub mod format; Review Comment: ```suggestion /// Automatically generated code for reading parquet thrift definition. // see parquet/CONTRIBUTING.md for instructions on regenerating pub mod format; ``` ########## parquet/src/basic.rs: ########## @@ -730,113 +736,129 @@ impl From<Option<LogicalType>> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From<parquet::FieldRepetitionType> for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom<parquet::FieldRepetitionType> for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result<Self> { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => return Err(general_err!("unexpected repetition type: {}", value.0)), + }) } } impl From<Repetition> for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { - Repetition::REQUIRED => parquet::FieldRepetitionType::Required, - Repetition::OPTIONAL => parquet::FieldRepetitionType::Optional, - Repetition::REPEATED => parquet::FieldRepetitionType::Repeated, + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, } } } // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl From<parquet::Encoding> for Encoding { - fn from(value: parquet::Encoding) -> Self { - match value { - parquet::Encoding::Plain => Encoding::PLAIN, - parquet::Encoding::PlainDictionary => Encoding::PLAIN_DICTIONARY, - parquet::Encoding::Rle => Encoding::RLE, - parquet::Encoding::BitPacked => Encoding::BIT_PACKED, - parquet::Encoding::DeltaBinaryPacked => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DeltaLengthByteArray => Encoding::DELTA_LENGTH_BYTE_ARRAY, - parquet::Encoding::DeltaByteArray => Encoding::DELTA_BYTE_ARRAY, - parquet::Encoding::RleDictionary => Encoding::RLE_DICTIONARY, - parquet::Encoding::ByteStreamSplit => Encoding::BYTE_STREAM_SPLIT, - } +impl TryFrom<parquet::Encoding> for Encoding { + type Error = ParquetError; + + fn try_from(value: parquet::Encoding) -> Result<Self> { + Ok(match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + parquet::Encoding::BYTE_STREAM_SPLIT => Encoding::BYTE_STREAM_SPLIT, + _ => return Err(general_err!("unexpected encoding: {}", value.0)), + }) } } impl From<Encoding> for parquet::Encoding { fn from(value: Encoding) -> Self { match value { - Encoding::PLAIN => parquet::Encoding::Plain, - Encoding::PLAIN_DICTIONARY => parquet::Encoding::PlainDictionary, - Encoding::RLE => parquet::Encoding::Rle, - Encoding::BIT_PACKED => parquet::Encoding::BitPacked, - Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DeltaBinaryPacked, - Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DeltaLengthByteArray, - Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DeltaByteArray, - Encoding::RLE_DICTIONARY => parquet::Encoding::RleDictionary, - Encoding::BYTE_STREAM_SPLIT => parquet::Encoding::ByteStreamSplit, + Encoding::PLAIN => parquet::Encoding::PLAIN, + Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY, + Encoding::RLE => parquet::Encoding::RLE, + Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, + Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_LENGTH_BYTE_ARRAY => { + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY + } + Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, + Encoding::BYTE_STREAM_SPLIT => parquet::Encoding::BYTE_STREAM_SPLIT, } } } // ---------------------------------------------------------------------- // parquet::CompressionCodec <=> Compression conversion -impl From<parquet::CompressionCodec> for Compression { - fn from(value: parquet::CompressionCodec) -> Self { - match value { - parquet::CompressionCodec::Uncompressed => Compression::UNCOMPRESSED, - parquet::CompressionCodec::Snappy => Compression::SNAPPY, - parquet::CompressionCodec::Gzip => Compression::GZIP, - parquet::CompressionCodec::Lzo => Compression::LZO, - parquet::CompressionCodec::Brotli => Compression::BROTLI, - parquet::CompressionCodec::Lz4 => Compression::LZ4, - parquet::CompressionCodec::Zstd => Compression::ZSTD, - } +impl TryFrom<parquet::CompressionCodec> for Compression { + type Error = ParquetError; + + fn try_from(value: parquet::CompressionCodec) -> Result<Self> { + Ok(match value { + parquet::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, + parquet::CompressionCodec::SNAPPY => Compression::SNAPPY, + parquet::CompressionCodec::GZIP => Compression::GZIP, + parquet::CompressionCodec::LZO => Compression::LZO, + parquet::CompressionCodec::BROTLI => Compression::BROTLI, + parquet::CompressionCodec::LZ4 => Compression::LZ4, + parquet::CompressionCodec::ZSTD => Compression::ZSTD, + _ => return Err(general_err!("unexpected compression codec: {}", value.0)), + }) } } impl From<Compression> for parquet::CompressionCodec { fn from(value: Compression) -> Self { match value { - Compression::UNCOMPRESSED => parquet::CompressionCodec::Uncompressed, - Compression::SNAPPY => parquet::CompressionCodec::Snappy, - Compression::GZIP => parquet::CompressionCodec::Gzip, - Compression::LZO => parquet::CompressionCodec::Lzo, - Compression::BROTLI => parquet::CompressionCodec::Brotli, - Compression::LZ4 => parquet::CompressionCodec::Lz4, - Compression::ZSTD => parquet::CompressionCodec::Zstd, + Compression::UNCOMPRESSED => parquet::CompressionCodec::UNCOMPRESSED, + Compression::SNAPPY => parquet::CompressionCodec::SNAPPY, + Compression::GZIP => parquet::CompressionCodec::GZIP, + Compression::LZO => parquet::CompressionCodec::LZO, + Compression::BROTLI => parquet::CompressionCodec::BROTLI, + Compression::LZ4 => parquet::CompressionCodec::LZ4, + Compression::ZSTD => parquet::CompressionCodec::ZSTD, } } } // ---------------------------------------------------------------------- // parquet::PageType <=> PageType conversion -impl From<parquet::PageType> for PageType { - fn from(value: parquet::PageType) -> Self { - match value { - parquet::PageType::DataPage => PageType::DATA_PAGE, - parquet::PageType::IndexPage => PageType::INDEX_PAGE, - parquet::PageType::DictionaryPage => PageType::DICTIONARY_PAGE, - parquet::PageType::DataPageV2 => PageType::DATA_PAGE_V2, - } +impl TryFrom<parquet::PageType> for PageType { + type Error = ParquetError; + + fn try_from(value: parquet::PageType) -> Result<Self> { + Ok(match value { + parquet::PageType::DATA_PAGE => PageType::DATA_PAGE, + parquet::PageType::INDEX_PAGE => PageType::INDEX_PAGE, + parquet::PageType::DICTIONARY_PAGE => PageType::DICTIONARY_PAGE, + parquet::PageType::DATA_PAGE_V2 => PageType::DATA_PAGE_V2, + _ => return Err(general_err!("unexpected page type: {}", value.0)), Review Comment: ```suggestion _ => return Err(general_err!("unexpected parquet page type: {}", value.0)), ``` ########## parquet/Cargo.toml: ########## @@ -31,9 +31,8 @@ rust-version = "1.62" [dependencies] ahash = "0.8" -parquet-format = { version = "4.0.0", default-features = false } bytes = { version = "1.1", default-features = false, features = ["std"] } -thrift = { version = "0.13", default-features = false } +thrift = { version = "0.16", default-features = false } Review Comment: 👍 ########## parquet/src/basic.rs: ########## @@ -496,105 +496,111 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl From<parquet::Type> for Type { - fn from(value: parquet::Type) -> Self { - match value { - parquet::Type::Boolean => Type::BOOLEAN, - parquet::Type::Int32 => Type::INT32, - parquet::Type::Int64 => Type::INT64, - parquet::Type::Int96 => Type::INT96, - parquet::Type::Float => Type::FLOAT, - parquet::Type::Double => Type::DOUBLE, - parquet::Type::ByteArray => Type::BYTE_ARRAY, - parquet::Type::FixedLenByteArray => Type::FIXED_LEN_BYTE_ARRAY, - } +impl TryFrom<parquet::Type> for Type { + type Error = ParquetError; + + fn try_from(value: parquet::Type) -> Result<Self> { + Ok(match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + _ => return Err(general_err!("unexpected type: {}", value.0)), + }) } } impl From<Type> for parquet::Type { fn from(value: Type) -> Self { match value { - Type::BOOLEAN => parquet::Type::Boolean, - Type::INT32 => parquet::Type::Int32, - Type::INT64 => parquet::Type::Int64, - Type::INT96 => parquet::Type::Int96, - Type::FLOAT => parquet::Type::Float, - Type::DOUBLE => parquet::Type::Double, - Type::BYTE_ARRAY => parquet::Type::ByteArray, - Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FixedLenByteArray, + Type::BOOLEAN => parquet::Type::BOOLEAN, + Type::INT32 => parquet::Type::INT32, + Type::INT64 => parquet::Type::INT64, + Type::INT96 => parquet::Type::INT96, + Type::FLOAT => parquet::Type::FLOAT, + Type::DOUBLE => parquet::Type::DOUBLE, + Type::BYTE_ARRAY => parquet::Type::BYTE_ARRAY, + Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FIXED_LEN_BYTE_ARRAY, } } } // ---------------------------------------------------------------------- // parquet::ConvertedType <=> ConvertedType conversion -impl From<Option<parquet::ConvertedType>> for ConvertedType { - fn from(option: Option<parquet::ConvertedType>) -> Self { - match option { +impl TryFrom<Option<parquet::ConvertedType>> for ConvertedType { + type Error = ParquetError; + + fn try_from(option: Option<parquet::ConvertedType>) -> Result<Self> { + Ok(match option { None => ConvertedType::NONE, Some(value) => match value { - parquet::ConvertedType::Utf8 => ConvertedType::UTF8, - parquet::ConvertedType::Map => ConvertedType::MAP, - parquet::ConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, - parquet::ConvertedType::List => ConvertedType::LIST, - parquet::ConvertedType::Enum => ConvertedType::ENUM, - parquet::ConvertedType::Decimal => ConvertedType::DECIMAL, - parquet::ConvertedType::Date => ConvertedType::DATE, - parquet::ConvertedType::TimeMillis => ConvertedType::TIME_MILLIS, - parquet::ConvertedType::TimeMicros => ConvertedType::TIME_MICROS, - parquet::ConvertedType::TimestampMillis => { + parquet::ConvertedType::UTF8 => ConvertedType::UTF8, + parquet::ConvertedType::MAP => ConvertedType::MAP, + parquet::ConvertedType::MAP_KEY_VALUE => ConvertedType::MAP_KEY_VALUE, + parquet::ConvertedType::LIST => ConvertedType::LIST, + parquet::ConvertedType::ENUM => ConvertedType::ENUM, + parquet::ConvertedType::DECIMAL => ConvertedType::DECIMAL, + parquet::ConvertedType::DATE => ConvertedType::DATE, + parquet::ConvertedType::TIME_MILLIS => ConvertedType::TIME_MILLIS, + parquet::ConvertedType::TIME_MICROS => ConvertedType::TIME_MICROS, + parquet::ConvertedType::TIMESTAMP_MILLIS => { ConvertedType::TIMESTAMP_MILLIS } - parquet::ConvertedType::TimestampMicros => { + parquet::ConvertedType::TIMESTAMP_MICROS => { ConvertedType::TIMESTAMP_MICROS } - parquet::ConvertedType::Uint8 => ConvertedType::UINT_8, - parquet::ConvertedType::Uint16 => ConvertedType::UINT_16, - parquet::ConvertedType::Uint32 => ConvertedType::UINT_32, - parquet::ConvertedType::Uint64 => ConvertedType::UINT_64, - parquet::ConvertedType::Int8 => ConvertedType::INT_8, - parquet::ConvertedType::Int16 => ConvertedType::INT_16, - parquet::ConvertedType::Int32 => ConvertedType::INT_32, - parquet::ConvertedType::Int64 => ConvertedType::INT_64, - parquet::ConvertedType::Json => ConvertedType::JSON, - parquet::ConvertedType::Bson => ConvertedType::BSON, - parquet::ConvertedType::Interval => ConvertedType::INTERVAL, + parquet::ConvertedType::UINT_8 => ConvertedType::UINT_8, + parquet::ConvertedType::UINT_16 => ConvertedType::UINT_16, + parquet::ConvertedType::UINT_32 => ConvertedType::UINT_32, + parquet::ConvertedType::UINT_64 => ConvertedType::UINT_64, + parquet::ConvertedType::INT_8 => ConvertedType::INT_8, + parquet::ConvertedType::INT_16 => ConvertedType::INT_16, + parquet::ConvertedType::INT_32 => ConvertedType::INT_32, + parquet::ConvertedType::INT_64 => ConvertedType::INT_64, + parquet::ConvertedType::JSON => ConvertedType::JSON, + parquet::ConvertedType::BSON => ConvertedType::BSON, + parquet::ConvertedType::INTERVAL => ConvertedType::INTERVAL, + _ => return Err(general_err!("unexpected converted type: {}", value.0)), Review Comment: ```suggestion _ => return Err(general_err!("unexpected parquet converted type: {}", value.0)), ``` ########## parquet/CONTRIBUTING.md: ########## @@ -60,7 +60,18 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps`. To compile and view in the browser, run `cargo doc --no-deps --open`. -## Update Supported Parquet Version +## Update Parquet Format -To update Parquet format to a newer version, check if [parquet-format](https://github.com/sunchao/parquet-format-rs) -version is available. Then simply update version of `parquet-format` crate in Cargo.toml. +To generate the parquet format code run + +``` +$ git clone https://github.com/apache/thrift +$ cd thrift +$ git checkout v0.16.0 +# docker build just builds a docker image with thrift dependencies +$ docker build -t thrift build/docker/ubuntu-bionic +# build/docker/scripts/cmake.sh actually compiles thrift +$ docker run -v $(pwd):/thrift/src -it thrift build/docker/scripts/cmake.sh && wget https://raw.githubusercontent.com/apache/parquet-format/apache-parquet-format-2.9.0/src/main/thrift/parquet.thrift && ./cmake_build/compiler/cpp/bin/thrift --gen rs parquet.thrift Review Comment: This command did not complete successfully for me. ```shell ... 441: ---------------------------------------------------------------------- 441: Ran 2 tests in 0.012s 441: 441: OK 441: Traceback (most recent call last): 441: File "/thrift/src/test/py/TestServer.py", line 403, in <module> 441: from thrift.TMultiplexedProcessor import TMultiplexedProcessor 441: File "/thrift/src/lib/py/build/lib.linux-x86_64-2.7/thrift/TMultiplexedProcessor.py", line 20, in <module> 441: from thrift.Thrift import TProcessor, TMessageType 441: ImportError: No module named Thrift 441: t.py 441: ---- 441: ---------------- 441: Executing Client/Server tests with various generated code directories 441: Servers to be tested: TSimpleServer, TThreadedServer, TThreadPoolServer, TNonblockingServer, THttpServer, TProcessPoolServer, TForkingServer 441: Directories to be tested: gen-py-default, gen-py-slots, gen-py-oldstyle, gen-py-no_utf8strings, gen-py-dynamic, gen-py-dynamicslots 441: Protocols to be tested: accel, accelc, binary, compact, json, header 441: Options to be tested: ZLIB(yes/no), SSL(yes/no) 441: ---------------- 441: 441: Test run #0: (includes gen-py-default) Server=TSimpleServer, Proto=accel, zlib=False, SSL=False 441: Testing server TSimpleServer: /usr/bin/python /thrift/src/test/py/TestServer.py --protocol=accel --port=9090 TSimpleServer 441: FAIL: Server process (/usr/bin/python /thrift/src/test/py/TestServer.py --protocol=accel --port=9090 TSimpleServer) failed with retcode 1 441: Traceback (most recent call last): 441: File "/thrift/src/test/py/RunClientServer.py", line 323, in <module> 441: sys.exit(main()) 441: File "/thrift/src/test/py/RunClientServer.py", line 315, in main 441: tests.test_feature('gendir', generated_dirs) 441: File "/thrift/src/test/py/RunClientServer.py", line 230, in test_feature 441: if self.run(conf, test_count): 441: File "/thrift/src/test/py/RunClientServer.py", line 219, in run 441: runServiceTest(self.libdir, self.genbase, genpydir, try_server, try_proto, self.port, with_zlib, with_ssl, self.verbose) 441: File "/thrift/src/test/py/RunClientServer.py", line 157, in runServiceTest 441: ensureServerAlive() 441: File "/thrift/src/test/py/RunClientServer.py", line 140, in ensureServerAlive 441: % (server_class, ' '.join(server_args))) 441: Exception: Server subprocess TSimpleServer died, args: /usr/bin/python /thrift/src/test/py/TestServer.py --protocol=accel --port=9090 TSimpleServer 441/441 Test #441: python_test .......................***Failed 28.49 sec 99% tests passed, 6 tests failed out of 441 Total Test time (real) = 250.09 sec The following tests FAILED: 382 - TInterruptTest (Failed) 401 - TNonblockingSSLServerTest (Failed) 403 - SecurityTest (Failed) 404 - SecurityFromBufferTest (Failed) 433 - PythonTestSSLSocket (Failed) 441 - python_test (Failed) Errors while running CTest ``` ########## parquet/src/basic.rs: ########## @@ -730,113 +736,129 @@ impl From<Option<LogicalType>> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From<parquet::FieldRepetitionType> for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom<parquet::FieldRepetitionType> for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result<Self> { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => return Err(general_err!("unexpected repetition type: {}", value.0)), + }) } } impl From<Repetition> for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { - Repetition::REQUIRED => parquet::FieldRepetitionType::Required, - Repetition::OPTIONAL => parquet::FieldRepetitionType::Optional, - Repetition::REPEATED => parquet::FieldRepetitionType::Repeated, + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, } } } // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl From<parquet::Encoding> for Encoding { - fn from(value: parquet::Encoding) -> Self { - match value { - parquet::Encoding::Plain => Encoding::PLAIN, - parquet::Encoding::PlainDictionary => Encoding::PLAIN_DICTIONARY, - parquet::Encoding::Rle => Encoding::RLE, - parquet::Encoding::BitPacked => Encoding::BIT_PACKED, - parquet::Encoding::DeltaBinaryPacked => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DeltaLengthByteArray => Encoding::DELTA_LENGTH_BYTE_ARRAY, - parquet::Encoding::DeltaByteArray => Encoding::DELTA_BYTE_ARRAY, - parquet::Encoding::RleDictionary => Encoding::RLE_DICTIONARY, - parquet::Encoding::ByteStreamSplit => Encoding::BYTE_STREAM_SPLIT, - } +impl TryFrom<parquet::Encoding> for Encoding { + type Error = ParquetError; + + fn try_from(value: parquet::Encoding) -> Result<Self> { + Ok(match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + parquet::Encoding::BYTE_STREAM_SPLIT => Encoding::BYTE_STREAM_SPLIT, + _ => return Err(general_err!("unexpected encoding: {}", value.0)), Review Comment: ```suggestion _ => return Err(general_err!("unexpected parquet encoding: {}", value.0)), ``` ########## parquet/CONTRIBUTING.md: ########## @@ -60,7 +60,18 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps`. To compile and view in the browser, run `cargo doc --no-deps --open`. -## Update Supported Parquet Version +## Update Parquet Format -To update Parquet format to a newer version, check if [parquet-format](https://github.com/sunchao/parquet-format-rs) -version is available. Then simply update version of `parquet-format` crate in Cargo.toml. +To generate the parquet format code run + +``` +$ git clone https://github.com/apache/thrift +$ cd thrift +$ git checkout v0.16.0 +# docker build just builds a docker image with thrift dependencies +$ docker build -t thrift build/docker/ubuntu-bionic Review Comment: Building this image took quite some time (what a lot of build dependencies!) ########## parquet/src/basic.rs: ########## @@ -730,113 +736,129 @@ impl From<Option<LogicalType>> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From<parquet::FieldRepetitionType> for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom<parquet::FieldRepetitionType> for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result<Self> { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => return Err(general_err!("unexpected repetition type: {}", value.0)), Review Comment: I am suggesting adding `parquet` to all the errors thinking of the poor end user who will see this without the potential stack to know what the context is -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
