This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
new 63a11f9d90 [fix](multi-catalog) get dictionary-encode from parquet
metadata (#15524)
63a11f9d90 is described below
commit 63a11f9d90ee6f0e9b25d4506247b75d502e3ef9
Author: Ashin Gau <[email protected]>
AuthorDate: Fri Dec 30 19:42:46 2022 +0800
[fix](multi-catalog) get dictionary-encode from parquet metadata (#15524)
Check whether a parquet file is dictionary-encoded by inspecting the file
metadata instead of reading the first block.
Some versions of the parquet writer produce files that do not follow the
standard format: the column is dictionary-encoded, but that information
cannot be read from the file metadata.
In this case, the performance of reading delete files will be degraded.
cherry-pick #15525
---
be/src/vec/exec/format/parquet/vparquet_reader.h | 2 ++
be/src/vec/exec/format/table/iceberg_reader.cpp | 35 +++++++++++++++---------
2 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 19a7b2534d..5ab9f19024 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -93,6 +93,8 @@ public:
Statistics& statistics() { return _statistics; }
+ const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; }
+
Status set_fill_columns(
const std::unordered_map<std::string, std::tuple<std::string,
const SlotDescriptor*>>&
partition_columns,
diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp
b/be/src/vec/exec/format/table/iceberg_reader.cpp
index 1bdae3a6e2..a4e5bbd618 100644
--- a/be/src/vec/exec/format/table/iceberg_reader.cpp
+++ b/be/src/vec/exec/format/table/iceberg_reader.cpp
@@ -115,17 +115,33 @@ Status IcebergTableReader::init_row_filters(const
TFileRangeDesc& range) {
data_file_path = fs_name + data_file_path;
}
}
- RETURN_IF_ERROR(
- delete_reader.init_reader(delete_file_col_names, nullptr,
nullptr, false));
+ Status init_st =
+ delete_reader.init_reader(delete_file_col_names, nullptr,
nullptr, false);
+ if (init_st.is_end_of_file()) {
+ continue;
+ } else if (!init_st.ok()) {
+ return init_st;
+ }
std::unordered_map<std::string, std::tuple<std::string, const
SlotDescriptor*>>
partition_columns;
std::unordered_map<std::string, VExprContext*> missing_columns;
delete_reader.set_fill_columns(partition_columns, missing_columns);
+ bool dictionary_coded = false;
+ const tparquet::FileMetaData* meta_data =
delete_reader.get_meta_data();
+ for (int i = 0; i < delete_file_col_names.size(); ++i) {
+ if (delete_file_col_names[i] == ICEBERG_FILE_PATH) {
+ // ParquetReader wil return EndOfFile if there's no row
group
+ auto& column_chunk = meta_data->row_groups[0].columns[i];
+ if (column_chunk.__isset.meta_data &&
+ column_chunk.meta_data.__isset.dictionary_page_offset)
{
+ dictionary_coded = true;
+ }
+ break;
+ }
+ }
+
bool eof = false;
- // We can only know whether a parquet file is encoded in
dictionary after reading the first block,
- // so we assume it dictionary encoded first, and reset it false if
error thrown.
- bool dictionary_coded = true;
while (!eof) {
Block block = Block();
for (int i = 0; i < delete_file_col_names.size(); ++i) {
@@ -146,14 +162,7 @@ Status IcebergTableReader::init_row_filters(const
TFileRangeDesc& range) {
}
eof = false;
size_t read_rows = 0;
- Status st = delete_reader.get_next_block(&block, &read_rows,
&eof);
- if (!st.ok()) {
- if (st.to_string() == "[IO_ERROR]Not dictionary coded") {
- dictionary_coded = false;
- continue;
- }
- return st;
- }
+ RETURN_IF_ERROR(delete_reader.get_next_block(&block,
&read_rows, &eof));
if (read_rows > 0) {
ColumnPtr path_column =
block.get_by_name(ICEBERG_FILE_PATH).column;
DCHECK_EQ(path_column->size(), read_rows);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]