This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 8af1e7f27f [Fix](orc-reader) Fix incorrect result if null partition fields in orc file. (#23369) 8af1e7f27f is described below commit 8af1e7f27fd33b90352aabb7e325edfa20c87eb0 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Sat Aug 26 00:13:11 2023 +0800 [Fix](orc-reader) Fix incorrect result if null partition fields in orc file. (#23369) Fix incorrect result if null partition fields in orc file. ### Root Cause Theoretically, the underlying file of the hive partition table should not contain partition fields. But we found that in some user scenarios, the partition field will exist in the underlying orc/parquet file and are null values. As a result, the pushed down partition field which are null values. filter incorrectly. ### Solution we handle this case by only reading non-partition fields. The parquet reader is already handled this way, this PR handles the orc reader. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 29 +++++++++-------- .../hive/test_external_catalog_hive.out | 36 ++++++++++++++++++++++ .../hive/test_external_catalog_hive.groovy | 11 +++++++ 3 files changed, 64 insertions(+), 12 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 362157d41b..61418f8e40 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -632,19 +632,24 @@ bool OrcReader::_init_search_argument( for (int i = 0; i < root_type.getSubtypeCount(); ++i) { type_map.emplace(_get_field_name_lower_case(&root_type, i), root_type.getSubtype(i)); } - for (auto it = colname_to_value_range->begin(); it != colname_to_value_range->end(); ++it) { - auto type_it = type_map.find(it->first); - if (type_it != type_map.end()) { - std::visit( - [&](auto& range) { - std::vector<OrcPredicate> value_predicates = - value_range_to_predicate(range, type_it->second); - for (auto& range_predicate : value_predicates) { - predicates.emplace_back(range_predicate); - } - }, - it->second); + for (auto& col_name : _lazy_read_ctx.all_read_columns) { + auto iter = colname_to_value_range->find(col_name); + if (iter == colname_to_value_range->end()) { + continue; } + auto type_it = type_map.find(col_name); + if (type_it == type_map.end()) { + continue; + } + std::visit( + [&](auto& range) { + std::vector<OrcPredicate> value_predicates = + value_range_to_predicate(range, type_it->second); + for (auto& range_predicate : value_predicates) { + predicates.emplace_back(range_predicate); + } + }, + iter->second); } if (predicates.empty()) { return false; diff --git a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out index 0cb7db4732..124239d174 100644 --- a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out +++ b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out @@ -126,3 +126,39 @@ Z6n2t4XA2n7CXTECJ,PE,iBbsCh0RE1Dd2A,z48 -- !null_expr_dict_filter_parquet -- 4844 4363 +-- !par_fields_in_file_orc1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc5 -- + +-- !par_fields_in_file_parquet5 -- + diff --git a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy index 4ca7c8d417..7a9a80aff0 100644 --- a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy +++ b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy @@ -97,6 +97,17 @@ suite("test_external_catalog_hive", "p2,external,hive,external_remote,external_r qt_null_expr_dict_filter_orc """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_orc WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ qt_null_expr_dict_filter_parquet """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_parquet WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ + // test par fields in file + qt_par_fields_in_file_orc1 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_parquet1 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_orc2 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 order by id; """ + qt_par_fields_in_file_parquet2 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 order by id; """ + qt_par_fields_in_file_orc3 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 order by id; """ + qt_par_fields_in_file_parquet3 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 order by id; """ + qt_par_fields_in_file_orc4 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_parquet4 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_orc5 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year = 2022 order by id; """ + qt_par_fields_in_file_parquet5 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year = 2022 order by id; """ // test remember last used database after switch / rename catalog sql """switch ${catalog_name};""" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org