This is an automated email from the ASF dual-hosted git repository.
Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 65136532975 [fix](parquet) Fix wrong encoding for parquet page v2 (#63305)
65136532975 is described below
commit 6513653297559274f3e42c57457d1be8c31acc35
Author: Gabriel <[email protected]>
AuthorDate: Tue May 19 09:40:35 2026 +0800
[fix](parquet) Fix wrong encoding for parquet page v2 (#63305)
---
be/src/format/parquet/vparquet_group_reader.cpp | 3 ++-
be/test/format/parquet/parquet_thrift_test.cpp | 31 ++++++++++++++++++++++
.../data/query_p0/test_parquet_dict.out | 4 +++
.../suites/query_p0/test_parquet_dict.groovy | 31 ++++++++++++++++++++++
4 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp
index 6863926ddba..7fb2c6fe67c 100644
--- a/be/src/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/format/parquet/vparquet_group_reader.cpp
@@ -277,7 +277,8 @@ bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& colum
if (column_metadata.__isset.encoding_stats) {
// Condition #1 above
for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
- if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+ if ((enc_stat.page_type == tparquet::PageType::DATA_PAGE ||
+ enc_stat.page_type == tparquet::PageType::DATA_PAGE_V2) &&
(enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
enc_stat.count > 0) {
diff --git a/be/test/format/parquet/parquet_thrift_test.cpp b/be/test/format/parquet/parquet_thrift_test.cpp
index 2253b6c12cc..7171fe3b63c 100644
--- a/be/test/format/parquet/parquet_thrift_test.cpp
+++ b/be/test/format/parquet/parquet_thrift_test.cpp
@@ -30,6 +30,7 @@
#include <memory>
#include <new>
#include <ostream>
+#include <set>
#include <string>
#include <utility>
#include <vector>
@@ -457,4 +458,34 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
"./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
}
+
+TEST_F(ParquetThriftReaderTest, is_dictionary_encoded_rejects_plain_data_page_v2) {
+ tparquet::ColumnMetaData column_metadata;
+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
+ column_metadata.__isset.encoding_stats = true;
+
+ tparquet::PageEncodingStats dict_page;
+ dict_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+ dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
+ dict_page.count = 2;
+
+ tparquet::PageEncodingStats plain_page;
+ plain_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+ plain_page.encoding = tparquet::Encoding::PLAIN;
+ plain_page.count = 1;
+
+ column_metadata.encoding_stats = {dict_page, plain_page};
+
+ tparquet::RowGroup row_group;
+ row_group.num_rows = 0;
+ RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0);
+ RowGroupReader::LazyReadContext lazy_read_ctx;
+ std::set<uint64_t> column_ids;
+ std::set<uint64_t> filter_column_ids;
+ RowGroupReader row_group_reader(nullptr, {}, 0, row_group, nullptr, nullptr,
+ position_delete_ctx, lazy_read_ctx, nullptr, column_ids,
+ filter_column_ids);
+
+ EXPECT_FALSE(row_group_reader.is_dictionary_encoded(column_metadata));
+}
} // namespace doris
diff --git a/regression-test/data/query_p0/test_parquet_dict.out b/regression-test/data/query_p0/test_parquet_dict.out
new file mode 100644
index 00000000000..7bbcaf3bf1c
--- /dev/null
+++ b/regression-test/data/query_p0/test_parquet_dict.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s3_tvf --
+68535cc98406454081424bf8247d783d
+
diff --git a/regression-test/suites/query_p0/test_parquet_dict.groovy b/regression-test/suites/query_p0/test_parquet_dict.groovy
new file mode 100644
index 00000000000..c50a89fde3f
--- /dev/null
+++ b/regression-test/suites/query_p0/test_parquet_dict.groovy
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_parquet_dict", "p0") {
+ try {
+ String ak = context.config.otherConfigs.get("ak")
+ String sk = context.config.otherConfigs.get("sk")
+ qt_s3_tvf """ SELECT * FROM FILE (
+ "uri" = "https://doris-regression-hk.oss-cn-hongkong.aliyuncs.com/regression/query_p0/test_page_v2.parquet",
+ "s3.access_key"= "${ak}",
+ "s3.secret_key" = "${sk}",
+ "format" = "parquet"
+ ) where user_id='68535cc98406454081424bf8247d783d' ;
+ """
+ } finally {
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]