This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1-lakehouse in repository https://gitbox.apache.org/repos/asf/doris.git
commit 88df781e2bc2c5c91d00da6d0808c9ffc34e8c8d Author: Socrates <[email protected]> AuthorDate: Wed Dec 18 15:32:17 2024 +0800 [fix](orc) ignore null values when the literals of in_predicate contains (#45104) ### What problem does this PR solve? Related PR: #43255 Problem Summary: Should ignore null values when the literals of in_predicate contains null value, like `in (1, null)` For example, init table in hive: ```sql CREATE TABLE sample_orc_table ( id INT, name STRING, age INT ) STORED AS ORC; INSERT INTO TABLE sample_orc_table VALUES (1, 'Alice', 25), (2, NULL, NULL); ``` select result in Doris should be: ```sql mysql> select * from sample_orc_table where age in (null,25); +------+-------+------+ | id | name | age | +------+-------+------+ | 1 | Alice | 25 | +------+-------+------+ 1 row in set (0.30 sec) mysql> select * from sample_orc_table where age in (25); +------+-------+------+ | id | name | age | +------+-------+------+ | 1 | Alice | 25 | +------+-------+------+ 1 row in set (0.27 sec) mysql> select * from sample_orc_table where age in (null); Empty set (0.01 sec) mysql> select * from sample_orc_table where age is null; +------+------+------+ | id | name | age | +------+------+------+ | 2 | NULL | NULL | +------+------+------+ 1 row in set (0.11 sec) ``` --- be/src/apache-orc | 2 +- be/src/vec/exec/format/orc/vorc_reader.cpp | 24 +++++++++++++++------ be/test/vec/exec/orc_reader_test.cpp | 3 +++ .../data/external_table_p0/hive/test_hive_orc.out | Bin 91283 -> 92747 bytes .../external_table_p0/hive/test_hive_orc.groovy | 22 +++++++++++++++++++ 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/be/src/apache-orc b/be/src/apache-orc index db01184f765..2f937bdc764 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f +Subproject commit 2f937bdc76406f150b484b6e57629aa8a03d48b6 diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 765791daef8..fd3ca829517 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -587,6 +587,10 @@ std::tuple<bool, orc::Literal, orc::PredicateDataType> OrcReader::_make_orc_lite // only get the predicate_type return std::make_tuple(true, orc::Literal(true), predicate_type); } + // this only happens when the literals of in_predicate contains null value, like in (1, null) + if (literal->get_column_ptr()->is_null_at(0)) { + return std::make_tuple(false, orc::Literal(false), predicate_type); + } auto literal_data = literal->get_column_ptr()->get_data_at(0); auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()]; auto slot_type = slot->type(); @@ -674,12 +678,13 @@ bool OrcReader::_check_rest_children_can_push_down(const VExprSPtr& expr) { return false; } + bool at_least_one_child_can_push_down = false; for (size_t i = 1; i < expr->children().size(); ++i) { - if (!_check_literal_can_push_down(expr, i)) { - return false; + if (_check_literal_can_push_down(expr, i)) { + at_least_one_child_can_push_down = true; } } - return true; + return at_least_one_child_can_push_down; } // check if the expr can be pushed down to orc reader @@ -782,12 +787,17 @@ void OrcReader::_build_filter_in(const VExprSPtr& expr, for (size_t i = 1; i < expr->children().size(); ++i) { DCHECK(expr->children()[i]->is_literal()); const auto* literal = static_cast<const VLiteral*>(expr->children()[i].get()); - DCHECK(_vliteral_to_orc_literal.contains(literal)); - auto orc_literal = _vliteral_to_orc_literal.find(literal)->second; - literals.emplace_back(orc_literal); + if (_vliteral_to_orc_literal.contains(literal)) { + auto orc_literal = _vliteral_to_orc_literal.find(literal)->second; + literals.emplace_back(orc_literal); + } } DCHECK(!literals.empty()); - builder->in(slot_ref->expr_name(), predicate_type, literals); + if (literals.size() == 1) { + builder->equals(slot_ref->expr_name(), predicate_type, literals[0]); + } else { + builder->in(slot_ref->expr_name(), predicate_type, literals); + } } void OrcReader::_build_is_null(const VExprSPtr& expr, diff --git a/be/test/vec/exec/orc_reader_test.cpp b/be/test/vec/exec/orc_reader_test.cpp index f2bba434368..ff7452ae625 100644 --- a/be/test/vec/exec/orc_reader_test.cpp +++ b/be/test/vec/exec/orc_reader_test.cpp @@ -111,6 +111,8 @@ TEST_F(OrcReaderTest, test_build_search_argument) { R"|({"1":{"lst":["rec",11,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0}, [...] // select count(o_orderkey) from tpch1_orc.orders where o_orderkey < 1 + 1; R"|({"1":{"lst":["rec",3,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}]},"4":{"rec":{"1":{"lst":["rec",1,{"1":{ [...] + // select count(o_orderkey) from tpch1_orc.orders where o_orderkey in (null, 25); + R"|({"1":{"lst":["rec",4,{"1":{"i32":11},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":5},"4":{"i32":3},"11":{"rec":{"1":{"tf":0}}},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":16},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}},"4":{"i32":0},"15":{"rec":{"1":{"i32":0},"2":{"i32":0},"3":{"i32":-1}}},"20":{"i32":-1},"29":{"tf":1},"36":{"str":"o_orderkey"}},{ [...] // SELECT count(o_orderkey) FROM tpch1_orc.orders WHERE o_comment LIKE '%delayed%' OR o_orderpriority = '1-URGENT'; R"|({"1":{"lst":["rec",7,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"like"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{" [...] // select count(o_orderkey) from tpch1_orc.orders where o_orderkey between 1 and 100 or random() > 0.5; @@ -143,6 +145,7 @@ TEST_F(OrcReaderTest, test_build_search_argument) { "leaf-0 = (o_orderkey < 2), leaf-1 = (o_custkey < 36901), expr = (or leaf-0 (not " "leaf-1))", "leaf-0 = (o_orderkey < 2), expr = leaf-0", + "leaf-0 = (o_orderkey = 25), expr = leaf-0", CANNOT_PUSH_DOWN_ERROR, CANNOT_PUSH_DOWN_ERROR, CANNOT_PUSH_DOWN_ERROR, diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc.out b/regression-test/data/external_table_p0/hive/test_hive_orc.out index b34f276020c..03942dbe9fb 100644 Binary files a/regression-test/data/external_table_p0/hive/test_hive_orc.out and b/regression-test/data/external_table_p0/hive/test_hive_orc.out differ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy index 6457d2b3edd..8d85feaa77a 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy @@ -90,6 +90,28 @@ suite("test_hive_orc", "all_types,p0,external,hive,external_docker,external_dock qt_predicate_pushdown6 """ SELECT count(o_orderkey) FROM tpch1_orc.orders WHERE o_orderstatus <> 'F' AND o_custkey < 54321; """ qt_predicate_pushdown7 """ SELECT count(o_orderkey) FROM tpch1_orc.orders WHERE o_comment LIKE '%delayed%' OR o_orderpriority = '1-URGENT'; """ qt_predicate_pushdown8 """ SELECT count(o_orderkey) FROM tpch1_orc.orders WHERE o_orderkey IN (1000000, 2000000, 3000000) OR o_clerk = 'Clerk#000000470'; """ + + qt_predicate_pushdown_in1 """ select count(*) from orc_all_types where boolean_col in (null); """ + qt_predicate_pushdown_in2 """ select count(*) from orc_all_types where boolean_col in (null, 0); """ + qt_predicate_pushdown_in3 """ select count(*) from orc_all_types where boolean_col in (null, 1); """ + + def test_col_is_null = { String col -> + "qt_orc_all_types_${col}_is_null" """ select count(*) from orc_all_types where ${col} is null; """ + } + test_col_is_null("tinyint_col") + test_col_is_null("smallint_col") + test_col_is_null("int_col") + test_col_is_null("bigint_col") + test_col_is_null("boolean_col") + test_col_is_null("float_col") + test_col_is_null("double_col") + test_col_is_null("string_col") + test_col_is_null("binary_col") + test_col_is_null("timestamp_col") + test_col_is_null("decimal_col") + test_col_is_null("char_col") + test_col_is_null("varchar_col") + test_col_is_null("date_col") } String enabled = context.config.otherConfigs.get("enableHiveTest") --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
