This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 79a12a7afefdfa59e4fb2570827243abe5070e82
Author: Balazs Hevele <[email protected]>
AuthorDate: Mon Mar 9 12:58:07 2026 +0100

    IMPALA-12137: Do not copy parquet data for dict encoded pages
    
    For dict encoded pages, strings will point to the dictionary, and not
    the data page, so there is no need to make a copy of the page data in
    this case.
    
    Change-Id: I3debd1d8e6b8825723b0d3f4aa82190b2cac1e0e
    Reviewed-on: http://gerrit.cloudera.org:8080/24080
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/parquet/parquet-column-chunk-reader.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/be/src/exec/parquet/parquet-column-chunk-reader.cc b/be/src/exec/parquet/parquet-column-chunk-reader.cc
index 1aeebfd05..01a9a8c0c 100644
--- a/be/src/exec/parquet/parquet-column-chunk-reader.cc
+++ b/be/src/exec/parquet/parquet-column-chunk-reader.cc
@@ -382,9 +382,10 @@ Status ParquetColumnChunkReader::ReadDataPageData(DataPageInfo* page_info) {
           compressed_size, uncompressed_size));
     }
 
-    // TODO: could skip copying when the data page is dict encoded as strings
-    //       will point to the dictionary instead of the data buffer (IMPALA-12137)
-    const bool copy_buffer = value_mem_type_ == ValueMemoryType::VAR_LEN_STR;
+    // If data page is dict encoded, strings will point to the dictionary instead of
+    // the data buffer, so there is no need to make a copy of page data.
+    const bool copy_buffer = (value_mem_type_ == ValueMemoryType::VAR_LEN_STR) &&
+        !IsDictionaryEncoding(page_info->data_encoding);
 
     if (copy_buffer) {
       // In this case returned batches will have pointers into the data page itself.

Reply via email to