adamreeve commented on code in PR #6637:
URL: https://github.com/apache/arrow-rs/pull/6637#discussion_r1887800633


##########
parquet/src/file/serialized_reader.rs:
##########
@@ -338,14 +341,38 @@ impl<R: 'static + ChunkReader> RowGroupReader for 
SerializedRowGroupReader<'_, R
 }
 
 /// Reads a [`PageHeader`] from the provided [`Read`]
-pub(crate) fn read_page_header<T: Read>(input: &mut T) -> Result<PageHeader> {
+pub(crate) fn read_page_header<T: Read>(input: &mut T, crypto_context: 
Option<Arc<CryptoContext>>) -> Result<PageHeader> {
     let mut prot = TCompactInputProtocol::new(input);
+    if let Some(crypto_context) = crypto_context {
+        // let mut buf = [0; 16 * 1024];
+        // let size = input.read(&mut buf)?;
+
+        let decryptor = &crypto_context.data_decryptor();
+        let file_decryptor = decryptor.footer_decryptor();
+        let aad_file_unique = decryptor.aad_file_unique();
+        // let aad_prefix = decryptor.aad_prefix();
+
+        let aad = create_page_aad(
+            aad_file_unique.as_slice(),
+            ModuleType::DictionaryPageHeader,
+            crypto_context.row_group_ordinal,
+            crypto_context.column_ordinal,
+            0,
+        )?;
+
+        // todo: This currently fails, possibly due to wrongly generated AAD
+        let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(), 
aad.as_ref());

Review Comment:
   Hi Rok, I had a look at this and noticed a couple of problems. One minor 
issue is that the module type is hard coded to be a dictionary page header, but 
it looks like the first page encountered should be a data page header just 
based on debugging reading the same file from C++. And then in 
`create_module_aad`, the page index is written as an i32 but it should be an 
i16. Kind of related, there are a bunch of checks there like `row_group_ordinal 
> i16::MAX`, but `row_group_ordinal` is an i16 so this is a no-op (not sure if 
there is proper error handling when converting these to i16 further up?).
   
   The main problem seems to be that `prot.read_bytes` here reads the length of 
the buffer as a Thrift varint, then passes the remaining buffer through to be 
decrypted, but for encrypted buffers the length is actually written as a 4 byte 
little-endian and the `decrypt` method also expects to receive a ciphertext 
buffer that includes the 4 length bytes. 
   
   With the following changes I can get to `todo!("Decrypted page header!")`: 
   
   ```diff
   diff --git a/parquet/src/encryption/ciphers.rs 
b/parquet/src/encryption/ciphers.rs
   index 89515fe0e..fc0b703ba 100644
   --- a/parquet/src/encryption/ciphers.rs
   +++ b/parquet/src/encryption/ciphers.rs
   @@ -194,7 +194,7 @@ fn create_module_aad(file_aad: &[u8], module_type: 
ModuleType, row_group_ordinal
        }
        if row_group_ordinal > i16::MAX {
            return Err(general_err!("Encrypted parquet files can't have more 
than {} row groups: {}",
   -            u16::MAX, row_group_ordinal));
   +            i16::MAX, row_group_ordinal));
        }
    
        if column_ordinal < 0 {
   @@ -202,7 +202,7 @@ fn create_module_aad(file_aad: &[u8], module_type: 
ModuleType, row_group_ordinal
        }
        if column_ordinal > i16::MAX {
            return Err(general_err!("Encrypted parquet files can't have more 
than {} columns: {}",
   -            u16::MAX, column_ordinal));
   +            i16::MAX, column_ordinal));
        }
    
        if module_buf[0] != (ModuleType::DataPageHeader as u8) &&
   @@ -218,10 +218,11 @@ fn create_module_aad(file_aad: &[u8], module_type: 
ModuleType, row_group_ordinal
        if page_ordinal < 0 {
            return Err(general_err!("Wrong page ordinal: {}", page_ordinal));
        }
   -    if page_ordinal > i32::MAX {
   +    if page_ordinal > (i16::MAX as i32){
            return Err(general_err!("Encrypted parquet files can't have more 
than {} pages in a chunk: {}",
   -            u16::MAX, page_ordinal));
   +            i16::MAX, page_ordinal));
        }
   +    let page_ordinal = page_ordinal as i16;
    
        let mut aad = Vec::with_capacity(file_aad.len() + 7);
        aad.extend_from_slice(file_aad);
   diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
   index adf4aa07a..48796b09f 100644
   --- a/parquet/src/file/serialized_reader.rs
   +++ b/parquet/src/file/serialized_reader.rs
   @@ -342,7 +342,6 @@ impl<R: 'static + ChunkReader> RowGroupReader for 
SerializedRowGroupReader<'_, R
    
    /// Reads a [`PageHeader`] from the provided [`Read`]
    pub(crate) fn read_page_header<T: Read>(input: &mut T, crypto_context: 
Option<Arc<CryptoContext>>) -> Result<PageHeader> {
   -    let mut prot = TCompactInputProtocol::new(input);
        if let Some(crypto_context) = crypto_context {
            // let mut buf = [0; 16 * 1024];
            // let size = input.read(&mut buf)?;
   @@ -354,19 +353,25 @@ pub(crate) fn read_page_header<T: Read>(input: &mut T, 
crypto_context: Option<Ar
    
            let aad = create_page_aad(
                aad_file_unique.as_slice(),
   -            ModuleType::DictionaryPageHeader,
   +            ModuleType::DataPageHeader,
                crypto_context.row_group_ordinal,
                crypto_context.column_ordinal,
                0,
            )?;
    
   -        // todo: This currently fails, possibly due to wrongly generated AAD
   -        let buf = file_decryptor.decrypt(prot.read_bytes()?.as_slice(), 
aad.as_ref());
   +        let mut len_bytes = [0; 4];
   +        input.read_exact(&mut len_bytes)?;
   +        let ciphertext_len = u32::from_le_bytes(len_bytes) as usize;
   +        let mut ciphertext = vec![0; 4 + ciphertext_len];
   +        input.read_exact(&mut ciphertext[4..])?;
   +        let buf = file_decryptor.decrypt(&ciphertext, aad.as_ref());
            todo!("Decrypted page header!");
            let mut prot = TCompactSliceInputProtocol::new(buf.as_slice());
            let page_header = PageHeader::read_from_in_protocol(&mut prot)?;
            return Ok(page_header)
        }
   +
   +    let mut prot = TCompactInputProtocol::new(input);
        let page_header = PageHeader::read_from_in_protocol(&mut prot)?;
        Ok(page_header)
    }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to