This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new a0661ed9d28 [Fix](multi-catalog) Fix complex type crash when using dict filter facility in the parquet-reader. (#27151) a0661ed9d28 is described below commit a0661ed9d285b7f2692e5f004987251a8a81d844 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Fri Nov 17 13:43:58 2023 +0800 [Fix](multi-catalog) Fix complex type crash when using dict filter facility in the parquet-reader. (#27151) - Fix complex type crash when using the dict filter facility in the parquet-reader by turning off the dict filter facility in this case. - Add orc complex types regression test. --- .../exec/format/parquet/vparquet_group_reader.cpp | 3 +- .../exec/format/parquet/vparquet_group_reader.h | 2 ++ be/src/vec/exec/format/parquet/vparquet_reader.cpp | 6 ++-- be/src/vec/exec/format/parquet/vparquet_reader.h | 2 -- .../external_table_p2/hive/test_complex_types.out | 34 ++++++++++++++++++---- .../hive/test_complex_types.groovy | 16 ++++++++++ 6 files changed, 52 insertions(+), 11 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index abb37a8bfc7..8360ab23e07 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -146,7 +146,8 @@ Status RowGroupReader::init( const string& predicate_col_name = predicate_col_names[i]; int slot_id = predicate_col_slot_ids[i]; auto field = const_cast<FieldSchema*>(schema.get_column(predicate_col_name)); - if (_can_filter_by_dict(slot_id, + if (!_lazy_read_ctx.has_complex_type && + _can_filter_by_dict(slot_id, _row_group_meta.columns[field->physical_column_index].meta_data)) { _dict_filter_cols.emplace_back(std::make_pair(predicate_col_name, slot_id)); } else { diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index aeb1404b12c..2a7e163ed0a 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -97,6 +97,8 @@ public: std::unordered_map<std::string, VExprContextSPtr> predicate_missing_columns; // lazy read missing columns or all missing columns std::unordered_map<std::string, VExprContextSPtr> missing_columns; + // should turn off filtering by page index, lazy read and dict filter if having complex type + bool has_complex_type = false; }; /** diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 810f5a1c8a6..6eb139a7b0e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -416,7 +416,7 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.all_read_columns.emplace_back(read_col); PrimitiveType column_type = schema.get_column(read_col)->type.type; if (column_type == TYPE_ARRAY || column_type == TYPE_MAP || column_type == TYPE_STRUCT) { - _has_complex_type = true; + _lazy_read_ctx.has_complex_type = true; } if (predicate_columns.size() > 0) { auto iter = predicate_columns.find(read_col); @@ -450,7 +450,7 @@ Status ParquetReader::set_fill_columns( } } - if (!_has_complex_type && _enable_lazy_mat && + if (!_lazy_read_ctx.has_complex_type && _enable_lazy_mat && _lazy_read_ctx.predicate_columns.first.size() > 0 && _lazy_read_ctx.lazy_read_columns.size() > 0) { _lazy_read_ctx.can_lazy_read = true; @@ -736,7 +736,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group, _statistics.read_rows += row_group.num_rows; }; - if (_has_complex_type || _lazy_read_ctx.conjuncts.empty() || + if (_lazy_read_ctx.has_complex_type || _lazy_read_ctx.conjuncts.empty() || _colname_to_value_range == nullptr || _colname_to_value_range->empty()) { read_whole_row_group(); return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 8c763b236fb..0e9bf4907f2 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -239,8 +239,6 @@ private: RowRange _whole_range = RowRange(0, 0); const std::vector<int64_t>* _delete_rows = nullptr; int64_t _delete_rows_index = 0; - // should turn off filtering by page index and lazy read if having complex type - bool _has_complex_type = false; // Used for column lazy read. RowGroupReader::LazyReadContext _lazy_read_ctx; diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out b/regression-test/data/external_table_p2/hive/test_complex_types.out index 1bfb858fae2..fb712c63d30 100644 --- a/regression-test/data/external_table_p2/hive/test_complex_types.out +++ b/regression-test/data/external_table_p2/hive/test_complex_types.out @@ -12,7 +12,7 @@ [0.98055020292316664, 0.53302915957540542, 0.30024744873379805, 0.48563601750302665, 0.76871064251586241, 0.69935066449251015, 0.28493548088258069, 0.34734174551861408, 0.13500129443045072, 0.97081321037009394, 0.18583042639943448, 0.48863372645520731, 0.36354741695157655, 0.56408452689711752, 0.1374134087807577, 0.77665476474516226, 0.58353232966683177, 0.36544595471103491, 0.54797767099937644, 0.83799325421171922, 0.15665046278350814, 0.03371222042250388, 0.1699781825927229, 0.35796304 [...] -- !map_contains_key -- -1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 0.52781260099838434, 0.52875058412167075, 0.426116738236779, 0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, "CPDH4G-ZXGPkku-3wY-ktaQ [...] +1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 0.52781260099838434, 0.52875058412167075, 0.426116738236779, 0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, "CPDH4G-ZXGPkku-3wY-ktaQ [...] -- !array_max -- 11028 @@ -23,14 +23,38 @@ -- !array_last -- 0.9899828598260161 +-- !null_struct_element_orc -- +0 + +-- !map_key_select_orc -- +38111 0.770169659057425 + +-- !map_keys_orc -- +["9wXr9n-TBm9Wyt-r8H-SkAq", "CPDH4G-ZXGPkku-3wY-ktaQ", "RvNlMt-HHjHN5M-VjP-xHAI", "qKIhKy-Ws344os-haX-2pmT", "DOJJ5l-UEkwVMs-x9F-HifD", "m871g8-1eFi7jt-oBq-S0yc", "wXugVP-v2fc6IF-DeU-On3T", "B0mXFX-QvgUgo7-Dih-6rDu", "E9zv3F-xMqSbMa-il4-FuDg", "msuFIN-ZkKO8TY-tu4-veH0", "0rSUyl-Un07aIW-KAx-WHnX", "XvbmO8-WA6oAqc-ihc-s8IL", "G6B6RD-AicAlZb-16u-Pn1I", "coDK0Q-tMg1294-JMQ-ZWQu", "4c0aWh-yhL6BOX-rRu-1n0r", "G4iUcG-ZhWw62v-VLt-n6lH", "IIB7qD-WQistwT-Vux-0c9B", "7cTyuR-5ssXm2S-sJR-JTIZ", "3KPh [...] + +-- !map_values_orc -- +[0.98055020292316664, 0.53302915957540542, 0.30024744873379805, 0.48563601750302665, 0.76871064251586241, 0.69935066449251015, 0.28493548088258069, 0.34734174551861408, 0.13500129443045072, 0.97081321037009394, 0.18583042639943448, 0.48863372645520731, 0.36354741695157655, 0.56408452689711752, 0.1374134087807577, 0.77665476474516226, 0.58353232966683177, 0.36544595471103491, 0.54797767099937644, 0.83799325421171922, 0.15665046278350814, 0.03371222042250388, 0.1699781825927229, 0.35796304 [...] + +-- !map_contains_key_orc -- +1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769, 0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736, 0.52781260099838434, 0.52875058412167075, 0.426116738236779, 0.42300502393871175, 0.53270263300536513, 0.60254817779426029, 0.27107336472576271, 0.613792118138183, 0.0021003027835629906, 0.32006750487285818, 0.54856110146602044, 0.51215105813137074, 0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946, "CPDH4G-ZXGPkku-3wY-ktaQ [...] + +-- !array_max_orc -- +11028 + +-- !array_filter_orc -- +11028 + +-- !array_last_orc -- +0.9899828598260161 + -- !offsets_check -- -0 [1, 2] [[], [3], NULL] {"a":1, "b":2} {"e", NULL} +0 [1, 2] [[], [3], null] {"a":1, "b":2} {"s1": "e", "s2": null} 1 [] [] {} \N -2 \N \N \N {"h", 10} -3 [5, NULL] [[6, 7], [8, NULL], NULL] {"f":1, "g":NULL} {NULL, 9} +2 \N \N \N {"s1": "h", "s2": 10} +3 [5, null] [[6, 7], [8, null], null] {"f":1, "g":null} {"s1": null, "s2": 9} -- !map_with_nullable_key -- -\N \N \N \N \N \N \N \N \N test test aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...] +\N \N \N \N \N \N \N \N \N test test aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...] -- !date_dict -- 2036-12-28 1898-12-28 2539-12-28 diff --git a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy index 32ef138c4bd..1a90570ff42 100644 --- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy +++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy @@ -51,6 +51,22 @@ suite("test_complex_types", "p2,external,hive,external_remote,external_remote_hi qt_array_last """select max(array_last(i -> i > 0, capacity)) from byd where array_last(i -> i > 0, capacity) < 0.99""" + qt_null_struct_element_orc """select count(struct_element(favor, 'tip')) from byd where id % 13 = 0""" + + qt_map_key_select_orc """select id, singles["p0X72J-mkMe40O-vOa-opfI"] as map_key from byd where singles["p0X72J-mkMe40O-vOa-opfI"] is not null""" + + qt_map_keys_orc """select map_keys(singles) from byd where id = 1077""" + + qt_map_values_orc """select map_values(singles) from byd where id = 1433""" + + qt_map_contains_key_orc """select * from byd where map_contains_key(singles, 'B0mXFX-QvgUgo7-Dih-6rDu') = 1""" + + qt_array_max_orc """select count(array_max(capacity)) from byd where array_max(capacity) > 0.99""" + + qt_array_filter_orc """select count(array_size(array_filter(i -> (i > 0.99), capacity))) from byd where array_size(array_filter(i -> (i > 0.99), capacity))""" + + qt_array_last_orc """select max(array_last(i -> i > 0, capacity)) from byd where array_last(i -> i > 0, capacity) < 0.99""" + qt_offsets_check """select * from complex_offsets_check order by id""" qt_map_with_nullable_key """select * from parquet_all_types limit 1""" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org