This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d379b98176 [Parquet] Minor: Update comments in page decompressor 
(#8764)
d379b98176 is described below

commit d379b981768215b16182f78aebddf940a4b45c99
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Nov 6 14:50:23 2025 -0500

    [Parquet] Minor: Update comments in page decompressor (#8764)
    
    # Which issue does this PR close?
    
    - follow on to https://github.com/apache/arrow-rs/pull/8756
    
    # Rationale for this change
    
    @etseidl comments:
    https://github.com/apache/arrow-rs/pull/8756#discussion_r2481506257
    
    > Not relevant to this PR, but I think this TODO has largely been
    addressed by https://github.com/apache/arrow-rs/pull/8376 which enabled
    skipping the decoding of the page statistics.
    
    While I was in here, I also wanted to capture the learning based on
    @mapleFU 's comment
    https://github.com/apache/arrow-rs/pull/8756#discussion_r2482281406
    
    > The code looks good to me, but I don't know if the comment "not
    compressed" can be replaced; if decompress_buffer is called and
    decompressed_size == 0, it seems that generally means something like
    "this page only has levels, but no non-null values"? (Point me
    out if I'm wrong)
    
    # What changes are included in this PR?
    
    Include some comments.
    
    # Are these changes tested?
    
    No (there are no code changes)
    
    # Are there any user-facing changes?
    
    No, this is internal comments only. No code / behavior changes
---
 parquet/src/file/serialized_reader.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
index 1b866a45cf..22bb8ba465 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -387,8 +387,6 @@ pub(crate) fn decode_page(
         can_decompress = header_v2.is_compressed.unwrap_or(true);
     }
 
-    // TODO: page header could be huge because of statistics. We should set a
-    // maximum page header size and abort if that is exceeded.
     let buffer = match decompressor {
         Some(decompressor) if can_decompress => {
             let uncompressed_page_size = 
usize::try_from(page_header.uncompressed_page_size)?;
@@ -398,6 +396,8 @@ pub(crate) fn decode_page(
             let decompressed_size = uncompressed_page_size - offset;
             let mut decompressed = Vec::with_capacity(uncompressed_page_size);
             decompressed.extend_from_slice(&buffer[..offset]);
+            // decompressed size of zero corresponds to a page with no 
non-null values
+            // see 
https://github.com/apache/parquet-format/blob/master/README.md#data-pages
             if decompressed_size > 0 {
                 let compressed = &buffer[offset..];
                 decompressor.decompress(compressed, &mut decompressed, 
Some(decompressed_size))?;

Reply via email to