This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 69ddefe2752 [Fix](multi-catalog) Fix string dictionary filtering when
using null related functions in parquet and orc reader by disabling dictionary
filtering when predicates contain functions #35335 (#35514)
69ddefe2752 is described below
commit 69ddefe27528f67ef66cec60e51a3aff9c0eac81
Author: Qi Chen <[email protected]>
AuthorDate: Tue May 28 23:30:13 2024 +0800
[Fix](multi-catalog) Fix string dictionary filtering when using null
related functions in parquet and orc reader by disabling dictionary filtering
when predicates contain functions #35335 (#35514)
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 21 ++--
.../exec/format/parquet/vparquet_group_reader.cpp | 21 ++--
.../hive/scripts/create_preinstalled_table.hql | 42 +++++++
.../test_string_dict_filter.orc | Bin 0 -> 1652 bytes
.../test_string_dict_filter.parquet | Bin 0 -> 2292 bytes
.../hive/test_string_dict_filter.out | 115 ++++++++++++++++++
.../hive/test_string_dict_filter.groovy | 129 +++++++++++++++++++++
7 files changed, 306 insertions(+), 22 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index a188c2e3eb5..4a7944defee 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1830,21 +1830,20 @@ bool OrcReader::_can_filter_by_dict(int slot_id) {
return false;
}
- // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by
dict.
std::function<bool(const VExpr* expr)> visit_function_call = [&](const
VExpr* expr) {
+ // TODO: The current implementation of dictionary filtering does not
take into account
+ // the implementation of NULL values because the dictionary itself
does not contain
+ // NULL value encoding. As a result, many NULL-related functions or
expressions
+ // cannot work properly, such as is null, is not null, coalesce, etc.
+ // Here we first disable dictionary filtering when predicate contains
functions.
+ // Implementation of NULL value dictionary filtering will be carried
out later.
if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
- std::string is_null_str;
- std::string function_name = expr->fn().name.function_name;
- if (function_name.compare("is_null_pred") == 0 ||
- function_name.compare("is_not_null_pred") == 0) {
+ return false;
+ }
+ for (auto& child : expr->children()) {
+ if (!visit_function_call(child.get())) {
return false;
}
- } else {
- for (auto& child : expr->children()) {
- if (!visit_function_call(child.get())) {
- return false;
- }
- }
}
return true;
};
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index f1e5dc42801..2d8f9cbb48e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -197,21 +197,20 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id,
return false;
}
- // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by
dict.
std::function<bool(const VExpr* expr)> visit_function_call = [&](const
VExpr* expr) {
+ // TODO: The current implementation of dictionary filtering does not
take into account
+ // the implementation of NULL values because the dictionary itself
does not contain
+ // NULL value encoding. As a result, many NULL-related functions or
expressions
+ // cannot work properly, such as is null, is not null, coalesce, etc.
+ // Here we first disable dictionary filtering when predicate contains
functions.
+ // Implementation of NULL value dictionary filtering will be carried
out later.
if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
- std::string is_null_str;
- std::string function_name = expr->fn().name.function_name;
- if (function_name.compare("is_null_pred") == 0 ||
- function_name.compare("is_not_null_pred") == 0) {
+ return false;
+ }
+ for (auto& child : expr->children()) {
+ if (!visit_function_call(child.get())) {
return false;
}
- } else {
- for (auto& child : expr->children()) {
- if (!visit_function_call(child.get())) {
- return false;
- }
- }
}
return true;
};
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
index e88516ffb6f..765958527aa 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
@@ -1808,6 +1808,48 @@ TBLPROPERTIES (
msck repair table string_col_dict_plain_mixed_orc;
+CREATE TABLE `test_string_dict_filter_parquet`(
+ `o_orderkey` int,
+ `o_custkey` int,
+ `o_orderstatus` string,
+ `o_totalprice` decimal(15,2),
+ `o_orderdate` date,
+ `o_orderpriority` string,
+ `o_clerk` string,
+ `o_shippriority` int,
+ `o_comment` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+LOCATION
+
'/user/doris/preinstalled_data/parquet_table/test_string_dict_filter_parquet';
+
+msck repair table test_string_dict_filter_parquet;
+
+CREATE TABLE `test_string_dict_filter_orc`(
+ `o_orderkey` int,
+ `o_custkey` int,
+ `o_orderstatus` string,
+ `o_totalprice` decimal(15,2),
+ `o_orderdate` date,
+ `o_orderpriority` string,
+ `o_clerk` string,
+ `o_shippriority` int,
+ `o_comment` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+LOCATION
+ '/user/doris/preinstalled_data/orc_table/test_string_dict_filter_orc';
+
+msck repair table test_string_dict_filter_orc;
+
show tables;
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc
new file mode 100644
index 00000000000..30638e4117c
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/test_string_dict_filter_orc/test_string_dict_filter.orc
differ
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet
new file mode 100644
index 00000000000..f3e189da7e9
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/test_string_dict_filter_parquet/test_string_dict_filter.parquet
differ
diff --git
a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
new file mode 100644
index 00000000000..f8f59b864cc
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
@@ -0,0 +1,115 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !q01 --
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q02 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+
+-- !q03 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q04 --
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+
+-- !q05 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q06 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q07 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q08 --
+
+-- !q09 --
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q10 --
+null
+
+-- !q11 --
+null
+
+-- !q12 --
+null
+
+-- !q13 --
+null
+
+-- !q14 --
+null
+
+-- !q01 --
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q02 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+
+-- !q03 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q04 --
+4 136777 O 32151.78 1995-10-11 \N Clerk#000000124
0 sits. slyly regular warthogs cajole. regular, regular theodolites acro
+
+-- !q05 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q06 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q07 --
+1 36901 O 173665.47 1996-01-02 5-LOW Clerk#000000951
0 nstructions sleep furiously among
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q08 --
+
+-- !q09 --
+2 78002 O 46929.18 1996-12-01 1-URGENT
Clerk#000000880 0 foxes. pending accounts at the pending, silent asymptot
+3 123314 F 193846.25 1993-10-14 5-LOW Clerk#000000955
0 sly final accounts boost. carefully regular ideas cajole carefully.
depos
+5 44485 F 144659.20 1994-07-30 5-LOW Clerk#000000925
0 quickly. bold deposits sleep slyly. packages use slyly
+
+-- !q10 --
+null
+
+-- !q11 --
+null
+
+-- !q12 --
+null
+
+-- !q13 --
+null
+
+-- !q14 --
+null
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
new file mode 100644
index 00000000000..e213aabbdc8
--- /dev/null
+++
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_string_dict_filter",
"p0,external,hive,external_docker,external_docker_hive") {
+ def q_parquet = {
+ qt_q01 """
+ select * from test_string_dict_filter_parquet where o_orderstatus =
'F';
+ """
+ qt_q02 """
+ select * from test_string_dict_filter_parquet where o_orderstatus =
'O';
+ """
+ qt_q03 """
+ select * from test_string_dict_filter_parquet where o_orderstatus in
('O', 'F');
+ """
+ qt_q04 """
+ select * from test_string_dict_filter_parquet where o_orderpriority is
null;
+ """
+ qt_q05 """
+ select * from test_string_dict_filter_parquet where o_orderpriority is
not null;
+ """
+ qt_q06 """
+ select * from test_string_dict_filter_parquet where o_orderpriority in
('5-LOW', NULL);
+ """
+ qt_q07 """
+ select * from test_string_dict_filter_parquet where o_orderpriority in
('5-LOW') and o_orderstatus in ('O', 'F');
+ """
+ qt_q08 """
+ select * from test_string_dict_filter_parquet where o_orderpriority in
('1-URGENT') and o_orderstatus in ('F');
+ """
+ qt_q09 """
+ select * from test_string_dict_filter_parquet where o_orderpriority in
('1-URGENT') or o_orderstatus in ('F');
+ """
+ qt_q10 """
+ select * from ( select IF(o_orderpriority IS NULL, 'null',
o_orderpriority) AS o_orderpriority from test_string_dict_filter_parquet ) as A
where o_orderpriority = 'null';
+ """
+ qt_q11 """
+ select * from ( select IF(o_orderpriority IS NOT NULL,
o_orderpriority, 'null') AS o_orderpriority from
test_string_dict_filter_parquet ) as A where o_orderpriority = 'null';
+ """
+ qt_q12 """
+ select * from ( select IFNULL(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_parquet ) as A where
o_orderpriority = 'null';
+ """
+ qt_q13 """
+ select * from ( select IFNULL(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_parquet ) as A where
o_orderpriority = 'null';
+ """
+ qt_q14 """
+ select * from ( select COALESCE(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_parquet ) as A where
o_orderpriority = 'null';
+ """
+ }
+ def q_orc = {
+ qt_q01 """
+ select * from test_string_dict_filter_orc where o_orderstatus = 'F';
+ """
+ qt_q02 """
+ select * from test_string_dict_filter_orc where o_orderstatus = 'O';
+ """
+ qt_q03 """
+ select * from test_string_dict_filter_orc where o_orderstatus in ('O',
'F');
+ """
+ qt_q04 """
+ select * from test_string_dict_filter_orc where o_orderpriority is
null;
+ """
+ qt_q05 """
+ select * from test_string_dict_filter_orc where o_orderpriority is not
null;
+ """
+ qt_q06 """
+ select * from test_string_dict_filter_orc where o_orderpriority in
('5-LOW', NULL);
+ """
+ qt_q07 """
+ select * from test_string_dict_filter_orc where o_orderpriority in
('5-LOW') and o_orderstatus in ('O', 'F');
+ """
+ qt_q08 """
+ select * from test_string_dict_filter_orc where o_orderpriority in
('1-URGENT') and o_orderstatus in ('F');
+ """
+ qt_q09 """
+ select * from test_string_dict_filter_orc where o_orderpriority in
('1-URGENT') or o_orderstatus in ('F');
+ """
+ qt_q10 """
+ select * from ( select IF(o_orderpriority IS NULL, 'null',
o_orderpriority) AS o_orderpriority from test_string_dict_filter_orc ) as A
where o_orderpriority = 'null';
+ """
+ qt_q11 """
+ select * from ( select IF(o_orderpriority IS NOT NULL,
o_orderpriority, 'null') AS o_orderpriority from test_string_dict_filter_orc )
as A where o_orderpriority = 'null';
+ """
+ qt_q12 """
+ select * from ( select IFNULL(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority =
'null';
+ """
+ qt_q13 """
+ select * from ( select IFNULL(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority =
'null';
+ """
+ qt_q14 """
+ select * from ( select COALESCE(o_orderpriority, 'null') AS
o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority =
'null';
+ """
+ }
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ try {
+ String hms_port = context.config.otherConfigs.get("hms_port")
+ String catalog_name = "test_string_dict_filter"
+ String externalEnvIp =
context.config.otherConfigs.get("externalEnvIp")
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`default`"""
+
+ q_parquet()
+ q_orc()
+
+ sql """drop catalog if exists ${catalog_name}"""
+ } finally {
+ }
+ }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]