This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 6ced1524ddd [Fix](multi-catalog) Fix complex type crash when using
dict filter facility in the parquet-reader. (#27151) (#27187)
6ced1524ddd is described below
commit 6ced1524ddd99d007004ce247490a5b2ba41e693
Author: Qi Chen <[email protected]>
AuthorDate: Fri Nov 17 22:21:20 2023 +0800
[Fix](multi-catalog) Fix complex type crash when using dict filter facility
in the parquet-reader. (#27151) (#27187)
- Fix a crash in the parquet-reader when the dict filter facility is used
while complex types (ARRAY, MAP, STRUCT) are being read, by turning off the
dict filter facility in that case.
- Add ORC complex-type regression tests.
---
.../exec/format/parquet/vparquet_group_reader.cpp | 3 ++-
.../exec/format/parquet/vparquet_group_reader.h | 2 ++
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 6 +++---
be/src/vec/exec/format/parquet/vparquet_reader.h | 2 --
.../external_table_p2/hive/test_complex_types.out | 24 ++++++++++++++++++++++
.../hive/test_complex_types.groovy | 16 +++++++++++++++
6 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index a4f77e2c056..f1e5dc42801 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -147,7 +147,8 @@ Status RowGroupReader::init(
const string& predicate_col_name = predicate_col_names[i];
int slot_id = predicate_col_slot_ids[i];
auto field =
const_cast<FieldSchema*>(schema.get_column(predicate_col_name));
- if (_can_filter_by_dict(slot_id,
+ if (!_lazy_read_ctx.has_complex_type &&
+ _can_filter_by_dict(slot_id,
_row_group_meta.columns[field->physical_column_index].meta_data)) {
_dict_filter_cols.emplace_back(std::make_pair(predicate_col_name,
slot_id));
} else {
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
index c44899b5836..0063fca768d 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -98,6 +98,8 @@ public:
std::unordered_map<std::string, VExprContextSPtr>
predicate_missing_columns;
// lazy read missing columns or all missing columns
std::unordered_map<std::string, VExprContextSPtr> missing_columns;
+ // should turn off filtering by page index, lazy read and dict filter if having complex type
+ bool has_complex_type = false;
};
/**
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 124f623f2e9..53655a187e7 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -410,7 +410,7 @@ Status ParquetReader::set_fill_columns(
_lazy_read_ctx.all_read_columns.emplace_back(read_col);
PrimitiveType column_type = schema.get_column(read_col)->type.type;
if (column_type == TYPE_ARRAY || column_type == TYPE_MAP ||
column_type == TYPE_STRUCT) {
- _has_complex_type = true;
+ _lazy_read_ctx.has_complex_type = true;
}
if (predicate_columns.size() > 0) {
auto iter = predicate_columns.find(read_col);
@@ -444,7 +444,7 @@ Status ParquetReader::set_fill_columns(
}
}
- if (!_has_complex_type && _enable_lazy_mat &&
+ if (!_lazy_read_ctx.has_complex_type && _enable_lazy_mat &&
_lazy_read_ctx.predicate_columns.first.size() > 0 &&
_lazy_read_ctx.lazy_read_columns.size() > 0) {
_lazy_read_ctx.can_lazy_read = true;
@@ -743,7 +743,7 @@ Status ParquetReader::_process_page_index(const
tparquet::RowGroup& row_group,
_statistics.read_rows += row_group.num_rows;
};
- if (_has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
+ if (_lazy_read_ctx.has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
_colname_to_value_range == nullptr ||
_colname_to_value_range->empty()) {
read_whole_row_group();
return Status::OK();
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 6efd0bd7237..b46dae1e29c 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -243,8 +243,6 @@ private:
RowRange _whole_range = RowRange(0, 0);
const std::vector<int64_t>* _delete_rows = nullptr;
int64_t _delete_rows_index = 0;
- // should turn off filtering by page index and lazy read if having complex type
- bool _has_complex_type = false;
// Used for column lazy read.
RowGroupReader::LazyReadContext _lazy_read_ctx;
diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out
b/regression-test/data/external_table_p2/hive/test_complex_types.out
index c103dab0387..fb712c63d30 100644
--- a/regression-test/data/external_table_p2/hive/test_complex_types.out
+++ b/regression-test/data/external_table_p2/hive/test_complex_types.out
@@ -23,6 +23,30 @@
-- !array_last --
0.9899828598260161
+-- !null_struct_element_orc --
+0
+
+-- !map_key_select_orc --
+38111 0.770169659057425
+
+-- !map_keys_orc --
+["9wXr9n-TBm9Wyt-r8H-SkAq", "CPDH4G-ZXGPkku-3wY-ktaQ",
"RvNlMt-HHjHN5M-VjP-xHAI", "qKIhKy-Ws344os-haX-2pmT",
"DOJJ5l-UEkwVMs-x9F-HifD", "m871g8-1eFi7jt-oBq-S0yc",
"wXugVP-v2fc6IF-DeU-On3T", "B0mXFX-QvgUgo7-Dih-6rDu",
"E9zv3F-xMqSbMa-il4-FuDg", "msuFIN-ZkKO8TY-tu4-veH0",
"0rSUyl-Un07aIW-KAx-WHnX", "XvbmO8-WA6oAqc-ihc-s8IL",
"G6B6RD-AicAlZb-16u-Pn1I", "coDK0Q-tMg1294-JMQ-ZWQu",
"4c0aWh-yhL6BOX-rRu-1n0r", "G4iUcG-ZhWw62v-VLt-n6lH",
"IIB7qD-WQistwT-Vux-0c9B", "7cTyuR-5ssXm2S-sJR-JTIZ", "3KPh [...]
+
+-- !map_values_orc --
+[0.98055020292316664, 0.53302915957540542, 0.30024744873379805,
0.48563601750302665, 0.76871064251586241, 0.69935066449251015,
0.28493548088258069, 0.34734174551861408, 0.13500129443045072,
0.97081321037009394, 0.18583042639943448, 0.48863372645520731,
0.36354741695157655, 0.56408452689711752, 0.1374134087807577,
0.77665476474516226, 0.58353232966683177, 0.36544595471103491,
0.54797767099937644, 0.83799325421171922, 0.15665046278350814,
0.03371222042250388, 0.1699781825927229, 0.35796304 [...]
+
+-- !map_contains_key_orc --
+1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769,
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736,
0.52781260099838434, 0.52875058412167075, 0.426116738236779,
0.42300502393871175, 0.53270263300536513, 0.60254817779426029,
0.27107336472576271, 0.613792118138183, 0.0021003027835629906,
0.32006750487285818, 0.54856110146602044, 0.51215105813137074,
0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946,
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
+
+-- !array_max_orc --
+11028
+
+-- !array_filter_orc --
+11028
+
+-- !array_last_orc --
+0.9899828598260161
+
-- !offsets_check --
0 [1, 2] [[], [3], null] {"a":1, "b":2} {"s1": "e", "s2": null}
1 [] [] {} \N
diff --git
a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
index c1f540e6ca1..d050d9a80a4 100644
--- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
@@ -51,6 +51,22 @@ suite("test_complex_types", "p2") {
qt_array_last """select max(array_last(i -> i > 0, capacity)) from byd
where array_last(i -> i > 0, capacity) < 0.99"""
+ qt_null_struct_element_orc """select count(struct_element(favor,
'tip')) from byd where id % 13 = 0"""
+
+ qt_map_key_select_orc """select id, singles["p0X72J-mkMe40O-vOa-opfI"]
as map_key from byd where singles["p0X72J-mkMe40O-vOa-opfI"] is not null"""
+
+ qt_map_keys_orc """select map_keys(singles) from byd where id = 1077"""
+
+ qt_map_values_orc """select map_values(singles) from byd where id =
1433"""
+
+ qt_map_contains_key_orc """select * from byd where
map_contains_key(singles, 'B0mXFX-QvgUgo7-Dih-6rDu') = 1"""
+
+ qt_array_max_orc """select count(array_max(capacity)) from byd where
array_max(capacity) > 0.99"""
+
+ qt_array_filter_orc """select count(array_size(array_filter(i -> (i >
0.99), capacity))) from byd where array_size(array_filter(i -> (i > 0.99),
capacity))"""
+
+ qt_array_last_orc """select max(array_last(i -> i > 0, capacity)) from
byd where array_last(i -> i > 0, capacity) < 0.99"""
+
qt_offsets_check """select * from complex_offsets_check order by id"""
qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]