alamb commented on code in PR #6637:
URL: https://github.com/apache/arrow-rs/pull/6637#discussion_r1961510280
##########
parquet/src/arrow/async_reader/store.rs:
##########
@@ -163,15 +165,22 @@ impl AsyncFileReader for ParquetObjectReader {
// an `impl MetadataFetch` and calls those methods to get data from it.
Due to `Self`'s impl of
// `AsyncFileReader`, the calls to `MetadataFetch::fetch` are just
delegated to
// `Self::get_bytes`.
- fn get_metadata(&mut self) -> BoxFuture<'_, Result<Arc<ParquetMetaData>>> {
+ fn get_metadata<'a>(
+ &'a mut self,
+ #[cfg(feature = "encryption")] file_decryption_properties: Option<
+ &'a FileDecryptionProperties,
Review Comment:
It would make more sense to me if `FileDecryptionProperties` was a field on
the reader.
##########
parquet/Cargo.toml:
##########
@@ -125,6 +128,8 @@ sysinfo = ["dep:sysinfo"]
crc = ["dep:crc32fast"]
# Enable SIMD UTF-8 validation
simdutf8 = ["dep:simdutf8"]
+# Enable Parquet modular encryption support
Review Comment:
Can we please also document this new flag here:
https://github.com/apache/arrow-rs/tree/main/parquet#feature-flags
Maybe we should update the feature support matrix as well.
##########
parquet/src/arrow/async_reader/metadata.rs:
##########
@@ -127,13 +128,26 @@ impl<F: MetadataFetch> MetadataLoader<F> {
let (metadata, remainder) = if length > suffix_len - FOOTER_SIZE {
let metadata_start = file_size - length - FOOTER_SIZE;
let meta = fetch.fetch(metadata_start..file_size -
FOOTER_SIZE).await?;
- (ParquetMetaDataReader::decode_metadata(&meta)?, None)
+ (
+ ParquetMetaDataReader::decode_metadata(
Review Comment:
I agree with @tustvold and @etseidl here -- using `#[cfg(...)]` to change
function signatures seems uncommon (I haven't run across it before in Rust) and
I don't think there are any precedents in this crate either.
I think a more standard API would be to make a structure that has fields
which could be controlled. Given there is now more state / options needed to
decode the footer, we could put the details into the struct:
```rust
let decoder = ParquetMetaDataDecoder::new(&footer)
.read_meta(fetch)
...
```
Or something 🤔
##########
parquet/src/file/metadata/reader.rs:
##########
@@ -578,56 +636,139 @@ impl ParquetMetaDataReader {
if length > suffix_len - FOOTER_SIZE {
let metadata_start = file_size - length - FOOTER_SIZE;
let meta = fetch.fetch(metadata_start..file_size -
FOOTER_SIZE).await?;
- Ok((Self::decode_metadata(&meta)?, None))
+ Ok((
+ Self::decode_metadata(
+ &meta,
+ footer.is_encrypted_footer(),
+ #[cfg(feature = "encryption")]
+ file_decryption_properties,
+ )?,
+ None,
+ ))
} else {
let metadata_start = file_size - length - FOOTER_SIZE -
footer_start;
let slice = &suffix[metadata_start..suffix_len - FOOTER_SIZE];
Ok((
- Self::decode_metadata(slice)?,
+ Self::decode_metadata(
+ slice,
+ footer.is_encrypted_footer(),
+ #[cfg(feature = "encryption")]
+ file_decryption_properties,
+ )?,
Some((footer_start, suffix.slice(..metadata_start))),
))
}
}
- /// Decodes the Parquet footer returning the metadata length in bytes
+ /// Decodes the end of the Parquet footer
///
- /// A parquet footer is 8 bytes long and has the following layout:
+ /// There are 8 bytes at the end of the Parquet footer with the following
layout:
/// * 4 bytes for the metadata length
- /// * 4 bytes for the magic bytes 'PAR1'
+ /// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer)
///
/// ```text
- /// +-----+--------+
- /// | len | 'PAR1' |
- /// +-----+--------+
+ /// +-----+------------------+
+ /// | len | 'PAR1' or 'PARE' |
+ /// +-----+------------------+
/// ```
- pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result<usize> {
- // check this is indeed a parquet file
- if slice[4..] != PARQUET_MAGIC {
+ pub fn decode_footer_tail(slice: &[u8; FOOTER_SIZE]) -> Result<FooterTail>
{
+ let magic = &slice[4..];
+ let encrypted_footer = if magic == PARQUET_MAGIC_ENCR_FOOTER {
+ true
+ } else if magic == PARQUET_MAGIC {
+ false
+ } else {
return Err(general_err!("Invalid Parquet file. Corrupt footer"));
- }
-
+ };
// get the metadata length from the footer
let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap());
- // u32 won't be larger than usize in most cases
- Ok(metadata_len as usize)
+ Ok(FooterTail {
+ // u32 won't be larger than usize in most cases
+ metadata_length: metadata_len as usize,
+ encrypted_footer,
+ })
+ }
+
+ /// Decodes the Parquet footer, returning the metadata length in bytes
+ #[deprecated(note = "use decode_footer_tail instead")]
+ pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result<usize> {
+ Self::decode_footer_tail(slice).map(|f| f.metadata_length)
}
/// Decodes [`ParquetMetaData`] from the provided bytes.
///
/// Typically this is used to decode the metadata from the end of a parquet
- /// file. The format of `buf` is the Thift compact binary protocol, as
specified
+ /// file. The format of `buf` is the Thrift compact binary protocol, as
specified
/// by the [Parquet Spec].
///
/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
- pub fn decode_metadata(buf: &[u8]) -> Result<ParquetMetaData> {
+ pub fn decode_metadata(
+ buf: &[u8],
+ encrypted_footer: bool,
Review Comment:
Technically speaking, `encrypted_footer` here is a new parameter, and thus this
would be a breaking API change.
As I mentioned above, since the decoder / decoding is becoming more
stateful, I think it is probably time to wrap the decoding logic into a more
encapsulated structure, which would likely also reduce the number of distinct
APIs / `#[cfg]`s needed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]