This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 50f1931f96 [fix](multi-catalog) get dictionary-encode from parquet
metadata (#15525)
50f1931f96 is described below
commit 50f1931f96d5e96e4a1ad26bb394fb190fdaa875
Author: Ashin Gau <[email protected]>
AuthorDate: Sat Dec 31 19:08:10 2022 +0800
[fix](multi-catalog) get dictionary-encode from parquet metadata (#15525)
---
be/src/vec/exec/format/parquet/vparquet_reader.h | 2 ++
be/src/vec/exec/format/table/iceberg_reader.cpp | 36 +++++++++++++++---------
2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index ac8eb7fa22..9871b512bd 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -94,6 +94,8 @@ public:
Statistics& statistics() { return _statistics; }
+ const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; }
+
Status set_fill_columns(
const std::unordered_map<std::string, std::tuple<std::string,
const SlotDescriptor*>>&
partition_columns,
diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp
b/be/src/vec/exec/format/table/iceberg_reader.cpp
index 1bdae3a6e2..6094d57c92 100644
--- a/be/src/vec/exec/format/table/iceberg_reader.cpp
+++ b/be/src/vec/exec/format/table/iceberg_reader.cpp
@@ -17,6 +17,7 @@
#include "iceberg_reader.h"
+#include "common/status.h"
#include "vec/common/assert_cast.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/data_types/data_type_factory.hpp"
@@ -115,17 +116,33 @@ Status IcebergTableReader::init_row_filters(const
TFileRangeDesc& range) {
data_file_path = fs_name + data_file_path;
}
}
- RETURN_IF_ERROR(
- delete_reader.init_reader(delete_file_col_names, nullptr,
nullptr, false));
+ Status init_st =
+ delete_reader.init_reader(delete_file_col_names, nullptr,
nullptr, false);
+ if (init_st.is<ErrorCode::END_OF_FILE>()) {
+ continue;
+ } else if (!init_st.ok()) {
+ return init_st;
+ }
std::unordered_map<std::string, std::tuple<std::string, const
SlotDescriptor*>>
partition_columns;
std::unordered_map<std::string, VExprContext*> missing_columns;
delete_reader.set_fill_columns(partition_columns, missing_columns);
+ bool dictionary_coded = false;
+ const tparquet::FileMetaData* meta_data =
delete_reader.get_meta_data();
+ for (int i = 0; i < delete_file_col_names.size(); ++i) {
+ if (delete_file_col_names[i] == ICEBERG_FILE_PATH) {
+ // ParquetReader will return EndOfFile if there's no row
group
+ auto& column_chunk = meta_data->row_groups[0].columns[i];
+ if (column_chunk.__isset.meta_data &&
+ column_chunk.meta_data.__isset.dictionary_page_offset)
{
+ dictionary_coded = true;
+ }
+ break;
+ }
+ }
+
bool eof = false;
- // We can only know whether a parquet file is encoded in
dictionary after reading the first block,
- // so we assume it dictionary encoded first, and reset it false if
error thrown.
- bool dictionary_coded = true;
while (!eof) {
Block block = Block();
for (int i = 0; i < delete_file_col_names.size(); ++i) {
@@ -146,14 +163,7 @@ Status IcebergTableReader::init_row_filters(const
TFileRangeDesc& range) {
}
eof = false;
size_t read_rows = 0;
- Status st = delete_reader.get_next_block(&block, &read_rows,
&eof);
- if (!st.ok()) {
- if (st.to_string() == "[IO_ERROR]Not dictionary coded") {
- dictionary_coded = false;
- continue;
- }
- return st;
- }
+ RETURN_IF_ERROR(delete_reader.get_next_block(&block,
&read_rows, &eof));
if (read_rows > 0) {
ColumnPtr path_column =
block.get_by_name(ICEBERG_FILE_PATH).column;
DCHECK_EQ(path_column->size(), read_rows);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]