This is an automated email from the ASF dual-hosted git repository.

Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 65136532975 [fix](parquet) Fix wrong encoding for parquet page v2 (#63305)
65136532975 is described below

commit 6513653297559274f3e42c57457d1be8c31acc35
Author: Gabriel <[email protected]>
AuthorDate: Tue May 19 09:40:35 2026 +0800

    [fix](parquet) Fix wrong encoding for parquet page v2 (#63305)
---
 be/src/format/parquet/vparquet_group_reader.cpp    |  3 ++-
 be/test/format/parquet/parquet_thrift_test.cpp     | 31 ++++++++++++++++++++++
 .../data/query_p0/test_parquet_dict.out            |  4 +++
 .../suites/query_p0/test_parquet_dict.groovy       | 31 ++++++++++++++++++++++
 4 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp
index 6863926ddba..7fb2c6fe67c 100644
--- a/be/src/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/format/parquet/vparquet_group_reader.cpp
@@ -277,7 +277,8 @@ bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& colum
     if (column_metadata.__isset.encoding_stats) {
         // Condition #1 above
         for (const tparquet::PageEncodingStats& enc_stat : 
column_metadata.encoding_stats) {
-            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+            if ((enc_stat.page_type == tparquet::PageType::DATA_PAGE ||
+                 enc_stat.page_type == tparquet::PageType::DATA_PAGE_V2) &&
                 (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
                  enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
                 enc_stat.count > 0) {
diff --git a/be/test/format/parquet/parquet_thrift_test.cpp b/be/test/format/parquet/parquet_thrift_test.cpp
index 2253b6c12cc..7171fe3b63c 100644
--- a/be/test/format/parquet/parquet_thrift_test.cpp
+++ b/be/test/format/parquet/parquet_thrift_test.cpp
@@ -30,6 +30,7 @@
 #include <memory>
 #include <new>
 #include <ostream>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -457,4 +458,34 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
     
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
                                 
"./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
 }
+
+TEST_F(ParquetThriftReaderTest, 
is_dictionary_encoded_rejects_plain_data_page_v2) { // A column whose DATA_PAGE_V2 stats mix dictionary and PLAIN pages must not be reported as dictionary encoded.
+    tparquet::ColumnMetaData column_metadata;
+    column_metadata.type = tparquet::Type::BYTE_ARRAY;
+    column_metadata.__isset.encoding_stats = true; // mark encoding_stats as present so the encoding_stats branch is taken
+
+    tparquet::PageEncodingStats dict_page; // stats entry: 2 DATA_PAGE_V2 pages encoded with RLE_DICTIONARY
+    dict_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
+    dict_page.count = 2;
+
+    tparquet::PageEncodingStats plain_page; // stats entry: 1 DATA_PAGE_V2 page encoded with PLAIN (non-dictionary)
+    plain_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    plain_page.encoding = tparquet::Encoding::PLAIN;
+    plain_page.count = 1;
+
+    column_metadata.encoding_stats = {dict_page, plain_page};
+
+    tparquet::RowGroup row_group; // placeholder inputs: the reader built below is only used to call is_dictionary_encoded()
+    row_group.num_rows = 0;
+    RowGroupReader::PositionDeleteContext 
position_delete_ctx(row_group.num_rows, 0);
+    RowGroupReader::LazyReadContext lazy_read_ctx;
+    std::set<uint64_t> column_ids;
+    std::set<uint64_t> filter_column_ids;
+    RowGroupReader row_group_reader(nullptr, {}, 0, row_group, nullptr, 
nullptr,
+                                    position_delete_ctx, lazy_read_ctx, 
nullptr, column_ids,
+                                    filter_column_ids);
+
+    EXPECT_FALSE(row_group_reader.is_dictionary_encoded(column_metadata)); // the PLAIN v2 entry has count > 0, so the answer must be false
+}
 } // namespace doris
diff --git a/regression-test/data/query_p0/test_parquet_dict.out b/regression-test/data/query_p0/test_parquet_dict.out
new file mode 100644
index 00000000000..7bbcaf3bf1c
--- /dev/null
+++ b/regression-test/data/query_p0/test_parquet_dict.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s3_tvf --
+68535cc98406454081424bf8247d783d
+
diff --git a/regression-test/suites/query_p0/test_parquet_dict.groovy b/regression-test/suites/query_p0/test_parquet_dict.groovy
new file mode 100644
index 00000000000..c50a89fde3f
--- /dev/null
+++ b/regression-test/suites/query_p0/test_parquet_dict.groovy
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_parquet_dict", "p0") { // p0 regression case: query test_page_v2.parquet through the FILE table function
+    try {
+        String ak = context.config.otherConfigs.get("ak") // object-store credentials are read from the regression config
+        String sk = context.config.otherConfigs.get("sk")
+        qt_s3_tvf """ SELECT * FROM FILE (
+            "uri" = "https://doris-regression-hk.oss-cn-hongkong.aliyuncs.com/regression/query_p0/test_page_v2.parquet",
+            "s3.access_key"= "${ak}",
+            "s3.secret_key" = "${sk}",
+            "format" = "parquet"
+        ) where user_id='68535cc98406454081424bf8247d783d' ;
+        """
+    } finally {
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to