This is an automated email from the ASF dual-hosted git repository. michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 79a12a7afefdfa59e4fb2570827243abe5070e82
Author: Balazs Hevele <[email protected]>
AuthorDate: Mon Mar 9 12:58:07 2026 +0100

    IMPALA-12137: Do not copy parquet data for dict encoded pages

    For dict encoded pages, strings will point to the dictionary, and not
    the data page, so there is no need to make a copy of the page data in
    this case.

    Change-Id: I3debd1d8e6b8825723b0d3f4aa82190b2cac1e0e
    Reviewed-on: http://gerrit.cloudera.org:8080/24080
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/parquet/parquet-column-chunk-reader.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/be/src/exec/parquet/parquet-column-chunk-reader.cc b/be/src/exec/parquet/parquet-column-chunk-reader.cc
index 1aeebfd05..01a9a8c0c 100644
--- a/be/src/exec/parquet/parquet-column-chunk-reader.cc
+++ b/be/src/exec/parquet/parquet-column-chunk-reader.cc
@@ -382,9 +382,10 @@ Status ParquetColumnChunkReader::ReadDataPageData(DataPageInfo* page_info) {
         compressed_size, uncompressed_size));
   }
 
-  // TODO: could skip copying when the data page is dict encoded as strings
-  // will point to the dictionary instead of the data buffer (IMPALA-12137)
-  const bool copy_buffer = value_mem_type_ == ValueMemoryType::VAR_LEN_STR;
+  // If data page is dict encoded, strings will point to the dictionary instead of
+  // the data buffer, so there is no need to make a copy of page data.
+  const bool copy_buffer = (value_mem_type_ == ValueMemoryType::VAR_LEN_STR) &&
+      !IsDictionaryEncoding(page_info->data_encoding);
   if (copy_buffer) {
     // In this case returned batches will have pointers into the data page itself.
