This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a7df843bfda [fix](hive)fix select count(*) hive full acid tb opt
error. (#46732)
a7df843bfda is described below
commit a7df843bfdaf02bb4b8c110dc00058f7e480b43f
Author: daidai <[email protected]>
AuthorDate: Fri Jan 10 18:18:27 2025 +0800
[fix](hive)fix select count(*) hive full acid tb opt error. (#46732)
### What problem does this PR solve?
Problem Summary:
Before PR: #44038
In the previous PR, the way splits are generated in the count( * )
scenario was optimized.
However, there were some problems with Hive ACID tables. This PR
mainly fixes those problems and adds tests.
In the count( * ) scenario, reading a Hive full ACID table cannot be
optimized: the file still needs to be split, because merge-on-read is
required. A Hive insert-only ACID table, by contrast, does not need to
be split.
---
.../vec/exec/format/table/transactional_hive_reader.cpp | 1 +
.../apache/doris/datasource/hive/source/HiveScanNode.java | 4 ++--
.../external_table_p0/hive/test_transactional_hive.out | 15 +++++++++++++++
.../hive/test_hive_translation_insert_only.out | 9 +++++++++
.../external_table_p0/hive/test_transactional_hive.groovy | 12 ++++++++++++
.../hive/test_hive_translation_insert_only.groovy | 5 +++++
6 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/be/src/vec/exec/format/table/transactional_hive_reader.cpp
b/be/src/vec/exec/format/table/transactional_hive_reader.cpp
index 18642ab1218..caf24270018 100644
--- a/be/src/vec/exec/format/table/transactional_hive_reader.cpp
+++ b/be/src/vec/exec/format/table/transactional_hive_reader.cpp
@@ -205,6 +205,7 @@ Status TransactionalHiveReader::init_row_filters(const
TFileRangeDesc& range,
++num_delete_files;
}
if (num_delete_rows > 0) {
+ orc_reader->set_push_down_agg_type(TPushAggOp::NONE);
orc_reader->set_delete_rows(&_delete_rows);
COUNTER_UPDATE(_transactional_orc_profile.num_delete_files,
num_delete_files);
COUNTER_UPDATE(_transactional_orc_profile.num_delete_rows,
num_delete_rows);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index c559570432f..08cf6582447 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -289,12 +289,12 @@ public class HiveScanNode extends FileQueryScanNode {
* we don't need to split the file because for parquet/orc format,
only metadata is read.
* If we split the file, we will read metadata of a file multiple
times, which is not efficient.
*
- * - Hive Transactional Table may need merge on read, so do not apply
this optimization.
+ * - Hive Full Acid Transactional Table may need merge on read, so do
not apply this optimization.
* - If the file format is not parquet/orc, eg, text, we need to split
the file to increase the parallelism.
*/
boolean needSplit = true;
if (getPushDownAggNoGroupingOp() == TPushAggOp.COUNT
- && hiveTransaction != null) {
+ && !(hmsTable.isHiveTransactionalTable() &&
hmsTable.isFullAcidTable())) {
int totalFileNum = 0;
for (FileCacheValue fileCacheValue : fileCaches) {
if (fileCacheValue.getFiles() != null) {
diff --git
a/regression-test/data/external_table_p0/hive/test_transactional_hive.out
b/regression-test/data/external_table_p0/hive/test_transactional_hive.out
index 060fa8c048e..94e32a43db7 100644
--- a/regression-test/data/external_table_p0/hive/test_transactional_hive.out
+++ b/regression-test/data/external_table_p0/hive/test_transactional_hive.out
@@ -122,3 +122,18 @@ F
-- !16 --
4 DD
+-- !count_1 --
+3
+
+-- !count_2 --
+6
+
+-- !count_3 --
+4
+
+-- !count_4 --
+3
+
+-- !count_5 --
+3
+
diff --git
a/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out
b/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out
index e4bdb3fe32d..f43a630f4a3 100644
---
a/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out
+++
b/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out
@@ -19,3 +19,12 @@
4 D
5 E
+-- !count_1 --
+4
+
+-- !count_2 --
+5
+
+-- !count_3 --
+5
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
index 4f7008ec172..a12ab8a4f78 100644
---
a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
@@ -114,6 +114,14 @@ suite("test_transactional_hive",
"p0,external,hive,external_docker,external_dock
}
}
+
+ def test_acid_count = {
+ qt_count_1 """ select count(*) from orc_full_acid; """ // 3
+ qt_count_2 """ select count(*) from orc_full_acid_par; """ // 6
+ qt_count_3 """ select count(*) from orc_to_acid_compacted_tb; """ //4
+ qt_count_4 """ select count(*) from orc_acid_minor; """ //3
+ qt_count_5 """ select count(*) from orc_acid_major; """ //3
+ }
String enabled = context.config.otherConfigs.get("enableHiveTest")
@@ -148,6 +156,10 @@ suite("test_transactional_hive",
"p0,external,hive,external_docker,external_dock
test_acid()
test_acid_write()
+
+ test_acid_count()
+
+
sql """drop catalog if exists ${catalog_name}"""
} finally {
}
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
index 758417c3237..f7135175152 100644
---
a/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
+++
b/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
@@ -45,6 +45,11 @@ suite("test_hive_translation_insert_only",
"p2,external,hive,external_remote,ext
qt_2 """ select * from parquet_insert_only_major order by id """
qt_3 """ select * from orc_insert_only_minor order by id """
+ qt_count_1 """ select count(*) from text_insert_only """ //4
+ qt_count_2 """ select count(*) from parquet_insert_only_major """ //5
+ qt_count_3 """ select count(*) from orc_insert_only_minor """ //5
+
+
sql """drop catalog ${hms_catalog_name};"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]