This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new adb7730b64b [fix](parquet) the end offset of column chunk may be wrong in parquet metadata #28891 (#28893)
adb7730b64b is described below
commit adb7730b64b0a0d882fec6517af9c2dc73cdb7b5
Author: Ashin Gau <[email protected]>
AuthorDate: Sat Dec 23 22:23:07 2023 +0800
[fix](parquet) the end offset of column chunk may be wrong in parquet metadata #28891 (#28893)
backport: #28891
---
.../vec/exec/format/parquet/vparquet_column_chunk_reader.cpp | 2 ++
.../vec/exec/format/parquet/vparquet_column_chunk_reader.h | 11 ++++++++++-
be/src/vec/exec/format/parquet/vparquet_page_reader.h | 5 +++++
regression-test/data/external_table_p2/tvf/test_tvf_p2.out | 12 ++++++++++++
.../suites/external_table_p2/tvf/test_tvf_p2.groovy | 6 ++++++
5 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
index aca5ae96e81..579bfa5a4ae 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -98,10 +98,12 @@ Status ColumnChunkReader::next_page() {
return next_page();
} else if (_page_reader->get_page_header()->type == tparquet::PageType::DATA_PAGE_V2) {
_remaining_num_values = _page_reader->get_page_header()->data_page_header_v2.num_values;
+ _chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
} else {
_remaining_num_values = _page_reader->get_page_header()->data_page_header.num_values;
+ _chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
index 9f62fddce70..aa30af1e94c 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
@@ -91,9 +91,17 @@ public:
Status init();
// Whether the chunk reader has a more page to read.
- bool has_next_page() { return _page_reader->has_next_page(); }
+ bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; }
+ // Deprecated
// Seek to the specific page, page_header_offset must be the start offset of the page header.
+ // _end_offset may exceed the actual data area, so we can only use the number of parsed values
+ // to determine whether there are remaining pages to read. That's to say we can't use the
+ // PageLocation in parquet metadata to seek to the specified page. We should call next_page()
+ // and skip_page() to skip pages one by one.
+ // todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values)
+ // and set _chunk_parsed_values = num_parsed_values
+ // [[deprecated]]
void seek_to_page(int64_t page_header_offset) {
_remaining_num_values = 0;
_page_reader->seek_to_page(page_header_offset);
@@ -201,6 +209,7 @@ private:
LevelDecoder _rep_level_decoder;
LevelDecoder _def_level_decoder;
+ size_t _chunk_parsed_values = 0;
uint32_t _remaining_num_values = 0;
Slice _page_data;
std::unique_ptr<uint8_t[]> _decompress_buf;
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
index 089409786f4..3b816629c6e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
@@ -45,6 +45,11 @@ public:
uint64_t length);
~PageReader() = default;
+ // Deprecated
+ // Parquet file may not be standardized,
+ // _end_offset may exceed the actual data area.
+ // ColumnChunkReader::has_next_page() use the number of parsed values for judgment
+ // [[deprecated]]
bool has_next_page() const { return _offset < _end_offset; }
Status next_page_header();
diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
index 6a44b7322dc..53b454df858 100644
--- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
+++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
@@ -50,6 +50,18 @@
32.024 64.0000 128.901468 32.024 64.0000 128.901468 2023-07-07
2023-07-07 2021-07-07T19:15:31.123456 2023-07-07 2023-07-07
2021-07-07T19:15:31.123456
32.689 64.2580 128.745382 32.689 64.2580 128.745382 2023-11-11
2023-11-11 2022-11-11T16:35:37.123456 2023-11-11 2023-11-11
2022-11-11T16:35:37.123456
+-- !wrong_page_header --
+53587 38687 2689589 99480 2971 1999262 218 386 5265 86
33.14 56.33 40.55 0.00 3487.30 2850.04 4844.38 69.74 0.00 3487.30
3557.04 637.26
+53587 47417 2689589 99480 2971 1999262 218 1216 5265 75
36.98 54.36 46.74 736.15 3505.50 2773.50 4077.00 138.46 736.15 2769.35
2907.81 -4.15
+53587 72713 2689589 99480 2971 1999262 218 1250 5265 15
54.97 85.75 75.46 0.00 1131.90 824.55 1286.25 56.59 0.00 1131.90
1188.49 307.35
+\N 124637 2689589 99480 \N \N 218 196 5265 \N
17.56 33.01 \N \N \N \N 3069.93 17.18 \N 245.52
\N \N
+\N 132020 2689589 \N 2971 \N 218 \N 5265 \N
\N 132.06 \N \N \N 8965.20 12281.58 255.41 \N
\N 3448.10 \N
+53587 132623 2689589 99480 2971 1999262 218 31 5265 74
62.58 90.74 77.12 0.00 5706.88 4630.92 6714.76 57.06 0.00 5706.88
5763.94 1075.96
+53587 298010 2689589 99480 2971 1999262 218 802 5265 59
83.41 159.31 70.09 0.00 4135.31 4921.19 9399.29 289.47 0.00 4135.31
4424.78 -785.88
+47821 4657 8720303 1194037 2971 4208705 574 656 5960 55
68.94 91.69 82.52 0.00 4538.60 3791.70 5042.95 408.47 0.00 4538.60
4947.07 746.90
+47821 50869 8720303 1194037 2971 4208705 574 1284 5960 77
2.87 2.92 0.37 0.00 28.49 220.99 224.84 0.00 0.00 28.49
28.49 -192.50
+47821 51494 8720303 1194037 2971 4208705 574 145 5960 18
69.14 94.72 14.20 0.00 255.60 1244.52 1704.96 7.66 0.00 255.60
263.26 -988.92
+
-- !viewfs --
25001 25001 25001
diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
index 6564d074097..4f8aa482450 100644
--- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
+++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
@@ -60,6 +60,12 @@ suite("test_tvf_p2", "p2") {
"format" = "parquet");
"""
+ // test for wrong page header
+ qt_wrong_page_header """select * from hdfs(
+ "uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/wrong_page_header.parquet",
+ "format" = "parquet") order by ss_ticket_number,ss_item_sk limit 10;
+ """
+
// viewfs
qt_viewfs """select count(id), count(m1), count(m2)
from hdfs(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]