[doris] branch master updated: [fix](multi-catalog) get dictionary-encode from parquet metadata (#15525)

yiguolei Sat, 31 Dec 2022 03:08:25 -0800

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 50f1931f96 [fix](multi-catalog) get dictionary-encode from parquet 
metadata (#15525)
50f1931f96 is described below

commit 50f1931f96d5e96e4a1ad26bb394fb190fdaa875
Author: Ashin Gau <[email protected]>
AuthorDate: Sat Dec 31 19:08:10 2022 +0800

    [fix](multi-catalog) get dictionary-encode from parquet metadata (#15525)
---
 be/src/vec/exec/format/parquet/vparquet_reader.h |  2 ++
 be/src/vec/exec/format/table/iceberg_reader.cpp  | 36 +++++++++++++++---------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index ac8eb7fa22..9871b512bd 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -94,6 +94,8 @@ public:
 
     Statistics& statistics() { return _statistics; }
 
+    const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; }
+
     Status set_fill_columns(
             const std::unordered_map<std::string, std::tuple<std::string, 
const SlotDescriptor*>>&
                     partition_columns,
diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp 
b/be/src/vec/exec/format/table/iceberg_reader.cpp
index 1bdae3a6e2..6094d57c92 100644
--- a/be/src/vec/exec/format/table/iceberg_reader.cpp
+++ b/be/src/vec/exec/format/table/iceberg_reader.cpp
@@ -17,6 +17,7 @@
 
 #include "iceberg_reader.h"
 
+#include "common/status.h"
 #include "vec/common/assert_cast.h"
 #include "vec/core/column_with_type_and_name.h"
 #include "vec/data_types/data_type_factory.hpp"
@@ -115,17 +116,33 @@ Status IcebergTableReader::init_row_filters(const 
TFileRangeDesc& range) {
                     data_file_path = fs_name + data_file_path;
                 }
             }
-            RETURN_IF_ERROR(
-                    delete_reader.init_reader(delete_file_col_names, nullptr, 
nullptr, false));
+            Status init_st =
+                    delete_reader.init_reader(delete_file_col_names, nullptr, 
nullptr, false);
+            if (init_st.is<ErrorCode::END_OF_FILE>()) {
+                continue;
+            } else if (!init_st.ok()) {
+                return init_st;
+            }
             std::unordered_map<std::string, std::tuple<std::string, const 
SlotDescriptor*>>
                     partition_columns;
             std::unordered_map<std::string, VExprContext*> missing_columns;
             delete_reader.set_fill_columns(partition_columns, missing_columns);
 
+            bool dictionary_coded = false;
+            const tparquet::FileMetaData* meta_data = 
delete_reader.get_meta_data();
+            for (int i = 0; i < delete_file_col_names.size(); ++i) {
+                if (delete_file_col_names[i] == ICEBERG_FILE_PATH) {
+                    // ParquetReader wil return EndOfFile if there's no row 
group
+                    auto& column_chunk = meta_data->row_groups[0].columns[i];
+                    if (column_chunk.__isset.meta_data &&
+                        column_chunk.meta_data.__isset.dictionary_page_offset) 
{
+                        dictionary_coded = true;
+                    }
+                    break;
+                }
+            }
+
             bool eof = false;
-            // We can only know whether a parquet file is encoded in 
dictionary after reading the first block,
-            // so we assume it dictionary encoded first, and reset it false if 
error thrown.
-            bool dictionary_coded = true;
             while (!eof) {
                 Block block = Block();
                 for (int i = 0; i < delete_file_col_names.size(); ++i) {
@@ -146,14 +163,7 @@ Status IcebergTableReader::init_row_filters(const 
TFileRangeDesc& range) {
                 }
                 eof = false;
                 size_t read_rows = 0;
-                Status st = delete_reader.get_next_block(&block, &read_rows, 
&eof);
-                if (!st.ok()) {
-                    if (st.to_string() == "[IO_ERROR]Not dictionary coded") {
-                        dictionary_coded = false;
-                        continue;
-                    }
-                    return st;
-                }
+                RETURN_IF_ERROR(delete_reader.get_next_block(&block, 
&read_rows, &eof));
                 if (read_rows > 0) {
                     ColumnPtr path_column = 
block.get_by_name(ICEBERG_FILE_PATH).column;
                     DCHECK_EQ(path_column->size(), read_rows);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [fix](multi-catalog) get dictionary-encode from parquet metadata (#15525)

Reply via email to