This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a0661ed9d28 [Fix](multi-catalog) Fix complex type crash when using
dict filter facility in the parquet-reader. (#27151)
a0661ed9d28 is described below
commit a0661ed9d285b7f2692e5f004987251a8a81d844
Author: Qi Chen <[email protected]>
AuthorDate: Fri Nov 17 13:43:58 2023 +0800
[Fix](multi-catalog) Fix complex type crash when using dict filter facility
in the parquet-reader. (#27151)
- Fix a crash in the parquet-reader when the dict filter facility is used
together with complex types (array, map or struct) by turning the dict
filter off whenever a complex type column is being read.
- Add regression tests for ORC complex types.
---
.../exec/format/parquet/vparquet_group_reader.cpp | 3 +-
.../exec/format/parquet/vparquet_group_reader.h | 2 ++
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 6 ++--
be/src/vec/exec/format/parquet/vparquet_reader.h | 2 --
.../external_table_p2/hive/test_complex_types.out | 34 ++++++++++++++++++----
.../hive/test_complex_types.groovy | 16 ++++++++++
6 files changed, 52 insertions(+), 11 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index abb37a8bfc7..8360ab23e07 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -146,7 +146,8 @@ Status RowGroupReader::init(
const string& predicate_col_name = predicate_col_names[i];
int slot_id = predicate_col_slot_ids[i];
auto field =
const_cast<FieldSchema*>(schema.get_column(predicate_col_name));
- if (_can_filter_by_dict(slot_id,
+ if (!_lazy_read_ctx.has_complex_type &&
+ _can_filter_by_dict(slot_id,
_row_group_meta.columns[field->physical_column_index].meta_data)) {
_dict_filter_cols.emplace_back(std::make_pair(predicate_col_name,
slot_id));
} else {
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
index aeb1404b12c..2a7e163ed0a 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -97,6 +97,8 @@ public:
std::unordered_map<std::string, VExprContextSPtr>
predicate_missing_columns;
// lazy read missing columns or all missing columns
std::unordered_map<std::string, VExprContextSPtr> missing_columns;
+ // should turn off filtering by page index, lazy read and dict filter if having complex type
+ bool has_complex_type = false;
};
/**
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 810f5a1c8a6..6eb139a7b0e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -416,7 +416,7 @@ Status ParquetReader::set_fill_columns(
_lazy_read_ctx.all_read_columns.emplace_back(read_col);
PrimitiveType column_type = schema.get_column(read_col)->type.type;
if (column_type == TYPE_ARRAY || column_type == TYPE_MAP ||
column_type == TYPE_STRUCT) {
- _has_complex_type = true;
+ _lazy_read_ctx.has_complex_type = true;
}
if (predicate_columns.size() > 0) {
auto iter = predicate_columns.find(read_col);
@@ -450,7 +450,7 @@ Status ParquetReader::set_fill_columns(
}
}
- if (!_has_complex_type && _enable_lazy_mat &&
+ if (!_lazy_read_ctx.has_complex_type && _enable_lazy_mat &&
_lazy_read_ctx.predicate_columns.first.size() > 0 &&
_lazy_read_ctx.lazy_read_columns.size() > 0) {
_lazy_read_ctx.can_lazy_read = true;
@@ -736,7 +736,7 @@ Status ParquetReader::_process_page_index(const
tparquet::RowGroup& row_group,
_statistics.read_rows += row_group.num_rows;
};
- if (_has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
+ if (_lazy_read_ctx.has_complex_type || _lazy_read_ctx.conjuncts.empty() ||
_colname_to_value_range == nullptr ||
_colname_to_value_range->empty()) {
read_whole_row_group();
return Status::OK();
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 8c763b236fb..0e9bf4907f2 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -239,8 +239,6 @@ private:
RowRange _whole_range = RowRange(0, 0);
const std::vector<int64_t>* _delete_rows = nullptr;
int64_t _delete_rows_index = 0;
- // should turn off filtering by page index and lazy read if having complex type
- bool _has_complex_type = false;
// Used for column lazy read.
RowGroupReader::LazyReadContext _lazy_read_ctx;
diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out
b/regression-test/data/external_table_p2/hive/test_complex_types.out
index 1bfb858fae2..fb712c63d30 100644
--- a/regression-test/data/external_table_p2/hive/test_complex_types.out
+++ b/regression-test/data/external_table_p2/hive/test_complex_types.out
@@ -12,7 +12,7 @@
[0.98055020292316664, 0.53302915957540542, 0.30024744873379805,
0.48563601750302665, 0.76871064251586241, 0.69935066449251015,
0.28493548088258069, 0.34734174551861408, 0.13500129443045072,
0.97081321037009394, 0.18583042639943448, 0.48863372645520731,
0.36354741695157655, 0.56408452689711752, 0.1374134087807577,
0.77665476474516226, 0.58353232966683177, 0.36544595471103491,
0.54797767099937644, 0.83799325421171922, 0.15665046278350814,
0.03371222042250388, 0.1699781825927229, 0.35796304 [...]
-- !map_contains_key --
-1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769,
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736,
0.52781260099838434, 0.52875058412167075, 0.426116738236779,
0.42300502393871175, 0.53270263300536513, 0.60254817779426029,
0.27107336472576271, 0.613792118138183, 0.0021003027835629906,
0.32006750487285818, 0.54856110146602044, 0.51215105813137074,
0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946,
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
+1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769,
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736,
0.52781260099838434, 0.52875058412167075, 0.426116738236779,
0.42300502393871175, 0.53270263300536513, 0.60254817779426029,
0.27107336472576271, 0.613792118138183, 0.0021003027835629906,
0.32006750487285818, 0.54856110146602044, 0.51215105813137074,
0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946,
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
-- !array_max --
11028
@@ -23,14 +23,38 @@
-- !array_last --
0.9899828598260161
+-- !null_struct_element_orc --
+0
+
+-- !map_key_select_orc --
+38111 0.770169659057425
+
+-- !map_keys_orc --
+["9wXr9n-TBm9Wyt-r8H-SkAq", "CPDH4G-ZXGPkku-3wY-ktaQ",
"RvNlMt-HHjHN5M-VjP-xHAI", "qKIhKy-Ws344os-haX-2pmT",
"DOJJ5l-UEkwVMs-x9F-HifD", "m871g8-1eFi7jt-oBq-S0yc",
"wXugVP-v2fc6IF-DeU-On3T", "B0mXFX-QvgUgo7-Dih-6rDu",
"E9zv3F-xMqSbMa-il4-FuDg", "msuFIN-ZkKO8TY-tu4-veH0",
"0rSUyl-Un07aIW-KAx-WHnX", "XvbmO8-WA6oAqc-ihc-s8IL",
"G6B6RD-AicAlZb-16u-Pn1I", "coDK0Q-tMg1294-JMQ-ZWQu",
"4c0aWh-yhL6BOX-rRu-1n0r", "G4iUcG-ZhWw62v-VLt-n6lH",
"IIB7qD-WQistwT-Vux-0c9B", "7cTyuR-5ssXm2S-sJR-JTIZ", "3KPh [...]
+
+-- !map_values_orc --
+[0.98055020292316664, 0.53302915957540542, 0.30024744873379805,
0.48563601750302665, 0.76871064251586241, 0.69935066449251015,
0.28493548088258069, 0.34734174551861408, 0.13500129443045072,
0.97081321037009394, 0.18583042639943448, 0.48863372645520731,
0.36354741695157655, 0.56408452689711752, 0.1374134087807577,
0.77665476474516226, 0.58353232966683177, 0.36544595471103491,
0.54797767099937644, 0.83799325421171922, 0.15665046278350814,
0.03371222042250388, 0.1699781825927229, 0.35796304 [...]
+
+-- !map_contains_key_orc --
+1077 [0.78055609958738448, 0.93034890022695593, 0.25295229975218769,
0.662270811026298, 0.664725297532439, 0.10194410917644769, 0.96140593006881736,
0.52781260099838434, 0.52875058412167075, 0.426116738236779,
0.42300502393871175, 0.53270263300536513, 0.60254817779426029,
0.27107336472576271, 0.613792118138183, 0.0021003027835629906,
0.32006750487285818, 0.54856110146602044, 0.51215105813137074,
0.51451366528053577] {"9wXr9n-TBm9Wyt-r8H-SkAq":0.93383290104809946,
"CPDH4G-ZXGPkku-3wY-ktaQ [...]
+
+-- !array_max_orc --
+11028
+
+-- !array_filter_orc --
+11028
+
+-- !array_last_orc --
+0.9899828598260161
+
-- !offsets_check --
-0 [1, 2] [[], [3], NULL] {"a":1, "b":2} {"e", NULL}
+0 [1, 2] [[], [3], null] {"a":1, "b":2} {"s1": "e", "s2": null}
1 [] [] {} \N
-2 \N \N \N {"h", 10}
-3 [5, NULL] [[6, 7], [8, NULL], NULL] {"f":1, "g":NULL}
{NULL, 9}
+2 \N \N \N {"s1": "h", "s2": 10}
+3 [5, null] [[6, 7], [8, null], null] {"f":1, "g":null}
{"s1": null, "s2": 9}
-- !map_with_nullable_key --
-\N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
+\N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
-- !date_dict --
2036-12-28 1898-12-28 2539-12-28
diff --git
a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
index 32ef138c4bd..1a90570ff42 100644
--- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
@@ -51,6 +51,22 @@ suite("test_complex_types",
"p2,external,hive,external_remote,external_remote_hi
qt_array_last """select max(array_last(i -> i > 0, capacity)) from byd
where array_last(i -> i > 0, capacity) < 0.99"""
+ qt_null_struct_element_orc """select count(struct_element(favor,
'tip')) from byd where id % 13 = 0"""
+
+ qt_map_key_select_orc """select id, singles["p0X72J-mkMe40O-vOa-opfI"]
as map_key from byd where singles["p0X72J-mkMe40O-vOa-opfI"] is not null"""
+
+ qt_map_keys_orc """select map_keys(singles) from byd where id = 1077"""
+
+ qt_map_values_orc """select map_values(singles) from byd where id =
1433"""
+
+ qt_map_contains_key_orc """select * from byd where
map_contains_key(singles, 'B0mXFX-QvgUgo7-Dih-6rDu') = 1"""
+
+ qt_array_max_orc """select count(array_max(capacity)) from byd where
array_max(capacity) > 0.99"""
+
+ qt_array_filter_orc """select count(array_size(array_filter(i -> (i >
0.99), capacity))) from byd where array_size(array_filter(i -> (i > 0.99),
capacity))"""
+
+ qt_array_last_orc """select max(array_last(i -> i > 0, capacity)) from
byd where array_last(i -> i > 0, capacity) < 0.99"""
+
qt_offsets_check """select * from complex_offsets_check order by id"""
qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]