This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a0661ed9d28 [Fix](multi-catalog) Fix complex type crash when using dict filter facility in the parquet-reader. (#27151)
a0661ed9d28 is described below

commit a0661ed9d285b7f2692e5f004987251a8a81d844
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Fri Nov 17 13:43:58 2023 +0800

    [Fix](multi-catalog) Fix complex type crash when using dict filter facility in the parquet-reader. (#27151)
    
    - Fix complex type crash when using the dict filter facility in the parquet-reader by turning off the dict filter facility in this case.
    - Add ORC complex-type regression tests.
---
 .../exec/format/parquet/vparquet_group_reader.cpp  |  3 +-
 .../exec/format/parquet/vparquet_group_reader.h    |  2 ++
 be/src/vec/exec/format/parquet/vparquet_reader.cpp |  6 ++--
 be/src/vec/exec/format/parquet/vparquet_reader.h   |  2 --
 .../external_table_p2/hive/test_complex_types.out  | 34 ++++++++++++++++++----
 .../hive/test_complex_types.groovy                 | 16 ++++++++++
 6 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index abb37a8bfc7..8360ab23e07 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -146,7 +146,8 @@ Status RowGroupReader::init(
         const string& predicate_col_name = predicate_col_names[i];
         int slot_id = predicate_col_slot_ids[i];
         auto field = 
const_cast<FieldSchema*>(schema.get_column(predicate_col_name));
-        if (_can_filter_by_dict(slot_id,
+        if (!_lazy_read_ctx.has_complex_type &&
+            _can_filter_by_dict(slot_id,
                                 
_row_group_meta.columns[field->physical_column_index].meta_data)) {
             _dict_filter_cols.emplace_back(std::make_pair(predicate_col_name, 
slot_id));
         } else {
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
index aeb1404b12c..2a7e163ed0a 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -97,6 +97,8 @@ public:
         std::unordered_map<std::string, VExprContextSPtr> 
predicate_missing_columns;
         // lazy read missing columns or all missing columns
         std::unordered_map<std::string, VExprContextSPtr> missing_columns;
+        // should turn off filtering by page index, lazy read and dict filter 
if having complex type
+        bool has_complex_type = false;
     };
 
     /**
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 810f5a1c8a6..6eb139a7b0e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -416,7 +416,7 @@ Status ParquetReader::set_fill_columns(
         _lazy_read_ctx.all_read_columns.emplace_back(read_col);
         PrimitiveType column_type = schema.get_column(read_col)->type.type;
         if (column_type == TYPE_ARRAY || column_type == TYPE_MAP || 
column_type == TYPE_STRUCT) {
-            _has_complex_type = true;
+            _lazy_read_ctx.has_complex_type = true;
         }
         if (predicate_columns.size() > 0) {
             auto iter = predicate_columns.find(read_col);
@@ -450,7 +450,7 @@ Status ParquetReader::set_fill_columns(
         }
     }
 
-    if (!_has_complex_type && _enable_lazy_mat &&
+    if (!_lazy_read_ctx.has_complex_type && _enable_lazy_mat &&
         _lazy_read_ctx.predicate_columns.first.size() > 0 &&
         _lazy_read_ctx.lazy_read_columns.size() > 0) {
         _lazy_read_ctx.can_lazy_read = true;
@@ -736,7 +736,7 @@ Status ParquetReader::_process_page_index(const 
tparquet::RowGroup& row_group,
         _statistics.read_rows += row_group.num_rows;
     };
 
-    if (_has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
+    if (_lazy_read_ctx.has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
         _colname_to_value_range == nullptr || 
_colname_to_value_range->empty()) {
         read_whole_row_group();
         return Status::OK();
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 8c763b236fb..0e9bf4907f2 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -239,8 +239,6 @@ private:
     RowRange _whole_range = RowRange(0, 0);
     const std::vector<int64_t>* _delete_rows = nullptr;
     int64_t _delete_rows_index = 0;
-    // should turn off filtering by page index and lazy read if having complex 
type
-    bool _has_complex_type = false;
 
     // Used for column lazy read.
     RowGroupReader::LazyReadContext _lazy_read_ctx;
diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out 
b/regression-test/data/external_table_p2/hive/test_complex_types.out
index 1bfb858fae2..fb712c63d30 100644
--- a/regression-test/data/external_table_p2/hive/test_complex_types.out
+++ b/regression-test/data/external_table_p2/hive/test_complex_types.out
@@ -12,7 +12,7 @@
 [0.98055020292316664, 0.53302915957540542, 0.30024744873379805, 
0.48563601750302665, 0.76871064251586241, 0.69935066449251015, 
0.28493548088258069, 0.34734174551861408, 0.13500129443045072, 
0.97081321037009394, 0.18583042639943448, 0.48863372645520731, 
0.36354741695157655, 0.56408452689711752, 0.1374134087807577, 
0.77665476474516226, 0.58353232966683177, 0.36544595471103491, 
0.54797767099937644, 0.83799325421171922, 0.15665046278350814, 
0.03371222042250388, 0.1699781825927229, 0.35796304 [...]
 
 -- !map_contains_key --
-1077   [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 
0.52781260099838434, 0.52875058412167075, 0.426116738236779, 
0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 
0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 
0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 
0.51451366528053577]  {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, 
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
+1077   [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 
0.52781260099838434, 0.52875058412167075, 0.426116738236779, 
0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 
0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 
0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 
0.51451366528053577]  {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, 
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
 
 -- !array_max --
 11028
@@ -23,14 +23,38 @@
 -- !array_last --
 0.9899828598260161
 
+-- !null_struct_element_orc --
+0
+
+-- !map_key_select_orc --
+38111  0.770169659057425
+
+-- !map_keys_orc --
+["9wXr9n-TBm9Wyt-r8H-SkAq", "CPDH4G-ZXGPkku-3wY-ktaQ", 
"RvNlMt-HHjHN5M-VjP-xHAI", "qKIhKy-Ws344os-haX-2pmT", 
"DOJJ5l-UEkwVMs-x9F-HifD", "m871g8-1eFi7jt-oBq-S0yc", 
"wXugVP-v2fc6IF-DeU-On3T", "B0mXFX-QvgUgo7-Dih-6rDu", 
"E9zv3F-xMqSbMa-il4-FuDg", "msuFIN-ZkKO8TY-tu4-veH0", 
"0rSUyl-Un07aIW-KAx-WHnX", "XvbmO8-WA6oAqc-ihc-s8IL", 
"G6B6RD-AicAlZb-16u-Pn1I", "coDK0Q-tMg1294-JMQ-ZWQu", 
"4c0aWh-yhL6BOX-rRu-1n0r", "G4iUcG-ZhWw62v-VLt-n6lH", 
"IIB7qD-WQistwT-Vux-0c9B", "7cTyuR-5ssXm2S-sJR-JTIZ", "3KPh [...]
+
+-- !map_values_orc --
+[0.98055020292316664, 0.53302915957540542, 0.30024744873379805, 
0.48563601750302665, 0.76871064251586241, 0.69935066449251015, 
0.28493548088258069, 0.34734174551861408, 0.13500129443045072, 
0.97081321037009394, 0.18583042639943448, 0.48863372645520731, 
0.36354741695157655, 0.56408452689711752, 0.1374134087807577, 
0.77665476474516226, 0.58353232966683177, 0.36544595471103491, 
0.54797767099937644, 0.83799325421171922, 0.15665046278350814, 
0.03371222042250388, 0.1699781825927229, 0.35796304 [...]
+
+-- !map_contains_key_orc --
+1077   [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 
0.52781260099838434, 0.52875058412167075, 0.426116738236779, 
0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 
0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 
0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 
0.51451366528053577]  {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, 
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
+
+-- !array_max_orc --
+11028
+
+-- !array_filter_orc --
+11028
+
+-- !array_last_orc --
+0.9899828598260161
+
 -- !offsets_check --
-0      [1, 2]  [[], [3], NULL] {"a":1, "b":2}  {"e", NULL}
+0      [1, 2]  [[], [3], null] {"a":1, "b":2}  {"s1": "e", "s2": null}
 1      []      []      {}      \N
-2      \N      \N      \N      {"h", 10}
-3      [5, NULL]       [[6, 7], [8, NULL], NULL]       {"f":1, "g":NULL}       
{NULL, 9}
+2      \N      \N      \N      {"s1": "h", "s2": 10}
+3      [5, null]       [[6, 7], [8, null], null]       {"f":1, "g":null}       
{"s1": null, "s2": 9}
 
 -- !map_with_nullable_key --
-\N     \N      \N      \N      \N      \N      \N      \N      \N              
test            test    
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 [...]
+\N     \N      \N      \N      \N      \N      \N      \N      \N              
test            test    
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 [...]
 
 -- !date_dict --
 2036-12-28     1898-12-28      2539-12-28
diff --git 
a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy 
b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
index 32ef138c4bd..1a90570ff42 100644
--- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
@@ -51,6 +51,22 @@ suite("test_complex_types", 
"p2,external,hive,external_remote,external_remote_hi
 
         qt_array_last """select max(array_last(i -> i > 0, capacity)) from byd 
where array_last(i -> i > 0, capacity) < 0.99"""
 
+        qt_null_struct_element_orc """select count(struct_element(favor, 
'tip')) from byd where id % 13 = 0"""
+
+        qt_map_key_select_orc """select id, singles["p0X72J-mkMe40O-vOa-opfI"] 
as map_key from byd where singles["p0X72J-mkMe40O-vOa-opfI"] is not null"""
+
+        qt_map_keys_orc """select map_keys(singles) from byd where id = 1077"""
+
+        qt_map_values_orc """select map_values(singles) from byd where id = 
1433"""
+
+        qt_map_contains_key_orc """select * from byd where 
map_contains_key(singles, 'B0mXFX-QvgUgo7-Dih-6rDu') = 1"""
+
+        qt_array_max_orc """select count(array_max(capacity)) from byd where 
array_max(capacity) > 0.99"""
+
+        qt_array_filter_orc """select count(array_size(array_filter(i -> (i > 
0.99), capacity))) from byd where array_size(array_filter(i -> (i > 0.99), 
capacity))"""
+
+        qt_array_last_orc """select max(array_last(i -> i > 0, capacity)) from 
byd where array_last(i -> i > 0, capacity) < 0.99"""
+
         qt_offsets_check """select * from complex_offsets_check order by id"""
 
         qt_map_with_nullable_key """select * from parquet_all_types limit 1"""


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to