(doris) branch branch-4.0 updated: [branch-4.0][fix](parquet) Fix wrong encoding for parquet page v2 #63305 (#63738)

morningman Wed, 27 May 2026 23:09:50 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new e12cce84077 [branch-4.0][fix](parquet) Fix wrong encoding for parquet page v2 #63305 (#63738)
e12cce84077 is described below

commit e12cce840774fe152dbd9a47bb156ba4b4812403
Author: Gabriel <[email protected]>
AuthorDate: Thu May 28 14:09:28 2026 +0800

    [branch-4.0][fix](parquet) Fix wrong encoding for parquet page v2 #63305 (#63738)
    
    ### What problem does this PR solve?
    
    pick #63305
---
 .../exec/format/parquet/vparquet_group_reader.cpp  |  3 ++-
 .../exec/format/parquet/parquet_thrift_test.cpp    | 31 ++++++++++++++++++++++
 .../data/query_p0/test_parquet_dict.out            |  4 +++
 .../suites/query_p0/test_parquet_dict.groovy       | 31 ++++++++++++++++++++++
 4 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 543c8c44bd4..e8f9fee6b17 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -256,7 +256,8 @@ bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& colum
     if (column_metadata.__isset.encoding_stats) {
         // Condition #1 above
         for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
-            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+            if ((enc_stat.page_type == tparquet::PageType::DATA_PAGE ||
+                 enc_stat.page_type == tparquet::PageType::DATA_PAGE_V2) &&
                 (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
                  enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
                 enc_stat.count > 0) {
diff --git a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
index 1d9c62ebfad..2fa4f5f46ee 100644
--- a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
+++ b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
@@ -30,6 +30,7 @@
 #include <memory>
 #include <new>
 #include <ostream>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -457,4 +458,34 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
     read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
                                 "./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
 }
+
+TEST_F(ParquetThriftReaderTest, is_dictionary_encoded_rejects_plain_data_page_v2) {
+    tparquet::ColumnMetaData column_metadata;
+    column_metadata.type = tparquet::Type::BYTE_ARRAY;
+    column_metadata.__isset.encoding_stats = true;
+
+    tparquet::PageEncodingStats dict_page;
+    dict_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
+    dict_page.count = 2;
+
+    tparquet::PageEncodingStats plain_page;
+    plain_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    plain_page.encoding = tparquet::Encoding::PLAIN;
+    plain_page.count = 1;
+
+    column_metadata.encoding_stats = {dict_page, plain_page};
+
+    tparquet::RowGroup row_group;
+    row_group.num_rows = 0;
+    RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0);
+    RowGroupReader::LazyReadContext lazy_read_ctx;
+    std::set<uint64_t> column_ids;
+    std::set<uint64_t> filter_column_ids;
+    RowGroupReader row_group_reader(nullptr, {}, 0, row_group, nullptr, nullptr,
+                                    position_delete_ctx, lazy_read_ctx, nullptr, column_ids,
+                                    filter_column_ids);
+
+    EXPECT_FALSE(row_group_reader.is_dictionary_encoded(column_metadata));
+}
 } // namespace doris::vectorized
diff --git a/regression-test/data/query_p0/test_parquet_dict.out b/regression-test/data/query_p0/test_parquet_dict.out
new file mode 100644
index 00000000000..7bbcaf3bf1c
--- /dev/null
+++ b/regression-test/data/query_p0/test_parquet_dict.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s3_tvf --
+68535cc98406454081424bf8247d783d
+
diff --git a/regression-test/suites/query_p0/test_parquet_dict.groovy b/regression-test/suites/query_p0/test_parquet_dict.groovy
new file mode 100644
index 00000000000..c50a89fde3f
--- /dev/null
+++ b/regression-test/suites/query_p0/test_parquet_dict.groovy
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_parquet_dict", "p0") {
+    try {
+        String ak = context.config.otherConfigs.get("ak")
+        String sk = context.config.otherConfigs.get("sk")
+        qt_s3_tvf """ SELECT * FROM FILE (
+            "uri" = "https://doris-regression-hk.oss-cn-hongkong.aliyuncs.com/regression/query_p0/test_page_v2.parquet",
+            "s3.access_key"= "${ak}",
+            "s3.secret_key" = "${sk}",
+            "format" = "parquet"
+        ) where user_id='68535cc98406454081424bf8247d783d' ;
+        """
+    } finally {
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-4.0 updated: [branch-4.0][fix](parquet) Fix wrong encoding for parquet page v2 #63305 (#63738)

Reply via email to