adamreeve commented on code in PR #6637:
URL: https://github.com/apache/arrow-rs/pull/6637#discussion_r1887800633
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -338,14 +341,38 @@ impl<R: 'static + ChunkReader> RowGroupReader for
SerializedRowGroupReader<'_, R
}
/// Reads a [`PageHeader`] from the provided [`Read`]
-pub(crate) fn read_page_header<T: Read>(input: &mut T) -> Result<PageHeader> {
+pub(crate) fn read_page_header<T: Read>(input: &mut T, crypto_context:
Option<Arc<CryptoContext>>) -> Result<PageHeader> {
let mut prot = TCompactInputProtocol::new(input);
+ if let Some(crypto_context) = crypto_context {
+ // let mut buf = [0; 16 * 1024];
+ // let size = input.read(&mut buf)?;
+
+ let decryptor = &crypto_context.data_decryptor();
+ let file_decryptor = decryptor.footer_decryptor();
+ let aad_file_unique = decryptor.aad_file_unique();
+ // let aad_prefix = decryptor.aad_prefix();
+
+ let aad = create_page_aad(
+ aad_file_unique.as_slice(),
+ ModuleType::DictionaryPageHeader,
+ crypto_context.row_group_ordinal,
+ crypto_context.column_ordinal,
+ 0,
+ )?;
+
+ // todo: This currently fails, possibly due to wrongly generated AAD
+ let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(),
aad.as_ref());
Review Comment:
Hi Rok, I had a look at this and noticed a couple of problems. One minor
issue is that the module type is hard coded to be a dictionary page header, but
it looks like the first page encountered should be a data page header just
based on debugging reading the same file from C++. And then in
`create_module_aad`, the page index is written as an i32 but it should be an
i16. Kind of related, there are a bunch of checks there like `row_group_ordinal
> i16::MAX`, but `row_group_ordinal` is an i16 so this is a no-op (not sure if
there is proper error handling when converting these to i16 further up?).
The main problem seems to be that `prot.read_bytes` here reads the length of
the buffer as a Thrift varint, then passes the remaining buffer through to be
decrypted, but for encrypted buffers the length is actually written as a 4 byte
little-endian and the `decrypt` method also expects to receive a ciphertext
buffer that includes the 4 length bytes.
With the following changes I can get to `todo!("Decrypted page header!")`:
```diff
diff --git a/parquet/src/encryption/ciphers.rs
b/parquet/src/encryption/ciphers.rs
index 89515fe0e..fc0b703ba 100644
--- a/parquet/src/encryption/ciphers.rs
+++ b/parquet/src/encryption/ciphers.rs
@@ -194,7 +194,7 @@ fn create_module_aad(file_aad: &[u8], module_type:
ModuleType, row_group_ordinal
}
if row_group_ordinal > i16::MAX {
return Err(general_err!("Encrypted parquet files can't have more
than {} row groups: {}",
- u16::MAX, row_group_ordinal));
+ i16::MAX, row_group_ordinal));
}
if column_ordinal < 0 {
@@ -202,7 +202,7 @@ fn create_module_aad(file_aad: &[u8], module_type:
ModuleType, row_group_ordinal
}
if column_ordinal > i16::MAX {
return Err(general_err!("Encrypted parquet files can't have more
than {} columns: {}",
- u16::MAX, column_ordinal));
+ i16::MAX, column_ordinal));
}
if module_buf[0] != (ModuleType::DataPageHeader as u8) &&
@@ -218,10 +218,11 @@ fn create_module_aad(file_aad: &[u8], module_type:
ModuleType, row_group_ordinal
if page_ordinal < 0 {
return Err(general_err!("Wrong page ordinal: {}", page_ordinal));
}
- if page_ordinal > i32::MAX {
+ if page_ordinal > (i16::MAX as i32){
return Err(general_err!("Encrypted parquet files can't have more
than {} pages in a chunk: {}",
- u16::MAX, page_ordinal));
+ i16::MAX, page_ordinal));
}
+ let page_ordinal = page_ordinal as i16;
let mut aad = Vec::with_capacity(file_aad.len() + 7);
aad.extend_from_slice(file_aad);
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index adf4aa07a..48796b09f 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -342,7 +342,6 @@ impl<R: 'static + ChunkReader> RowGroupReader for
SerializedRowGroupReader<'_, R
/// Reads a [`PageHeader`] from the provided [`Read`]
pub(crate) fn read_page_header<T: Read>(input: &mut T, crypto_context:
Option<Arc<CryptoContext>>) -> Result<PageHeader> {
- let mut prot = TCompactInputProtocol::new(input);
if let Some(crypto_context) = crypto_context {
// let mut buf = [0; 16 * 1024];
// let size = input.read(&mut buf)?;
@@ -354,19 +353,25 @@ pub(crate) fn read_page_header<T: Read>(input: &mut T,
crypto_context: Option<Ar
let aad = create_page_aad(
aad_file_unique.as_slice(),
- ModuleType::DictionaryPageHeader,
+ ModuleType::DataPageHeader,
crypto_context.row_group_ordinal,
crypto_context.column_ordinal,
0,
)?;
- // todo: This currently fails, possibly due to wrongly generated AAD
- let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(),
aad.as_ref());
+ let mut len_bytes = [0; 4];
+ input.read_exact(&mut len_bytes)?;
+ let ciphertext_len = u32::from_le_bytes(len_bytes) as usize;
+ let mut ciphertext = vec![0; 4 + ciphertext_len];
+ input.read_exact(&mut ciphertext[4..])?;
+ let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref());
todo!("Decrypted page header!");
let mut prot = TCompactSliceInputProtocol::new(buf.as_slice());
let page_header = PageHeader::read_from_in_protocol(&mut prot)?;
return Ok(page_header)
}
+
+ let mut prot = TCompactInputProtocol::new(input);
let page_header = PageHeader::read_from_in_protocol(&mut prot)?;
Ok(page_header)
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]