This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4dad7c94da [fix](orc) fix the count(*) pushdown issue in orc format
(#24446)
4dad7c94da is described below
commit 4dad7c94da364498759b620798e0723c7ff788bf
Author: Mingyu Chen <[email protected]>
AuthorDate: Sat Sep 16 09:57:39 2023 +0800
[fix](orc) fix the count(*) pushdown issue in orc format (#24446)
Previously, when querying a Hive table in ORC format whose file is
split,
the result of select count(*) could be a multiple of the real row count.
This is because the number of rows must be obtained after ORC stripe pruning;
otherwise, it may return a wrong result.
---
be/src/apache-orc | 2 +-
be/src/vec/exec/format/orc/vorc_reader.cpp | 5 +-
be/src/vec/exec/format/orc/vorc_reader.h | 2 +
.../tablefunction/HdfsTableValuedFunction.java | 2 +-
.../tvf/test_hdfs_tvf_compression.out | 18 +++++++
.../tvf/test_hdfs_tvf_compression.groovy | 63 ++++++++++++++++++++++
6 files changed, 88 insertions(+), 4 deletions(-)
diff --git a/be/src/apache-orc b/be/src/apache-orc
index 78bbe2e41f..a7c0af50f8 160000
--- a/be/src/apache-orc
+++ b/be/src/apache-orc
@@ -1 +1 @@
-Subproject commit 78bbe2e41f2140b803855d683fae5e1a4b734a37
+Subproject commit a7c0af50f8ca8ff7cddaf8675473a037f8b13143
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index deb48bcc0b..06f41a2edc 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -245,8 +245,6 @@ Status OrcReader::_create_file_reader() {
}
return Status::InternalError("Init OrcReader failed. reason = {}",
_err_msg);
}
- _remaining_rows = _reader->getNumberOfRows();
-
return Status::OK();
}
@@ -789,6 +787,9 @@ Status OrcReader::set_fill_columns(
auto& selected_type = _row_reader->getSelectedType();
int idx = 0;
_init_select_types(selected_type, idx);
+
+ _remaining_rows = _row_reader->getNumberOfRows();
+
} catch (std::exception& e) {
return Status::InternalError("Failed to create orc row reader. reason
= {}", e.what());
}
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h
b/be/src/vec/exec/format/orc/vorc_reader.h
index a9b564f560..133c92e7d1 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -489,6 +489,8 @@ private:
void set_remaining_rows(int64_t rows) { _remaining_rows = rows; }
private:
+ // This is only for count(*) short circuit read.
+ // save the total number of rows in range
int64_t _remaining_rows = 0;
RuntimeProfile* _profile = nullptr;
RuntimeState* _state = nullptr;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
index 385d9d11ad..55c898b29f 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
@@ -70,7 +70,7 @@ public class HdfsTableValuedFunction extends
ExternalFileTableValuedFunction {
// because HADOOP_FS_NAME contains upper and lower case
locationProperties.put(HdfsResource.HADOOP_FS_NAME,
params.get(key));
} else {
- throw new AnalysisException(key + " is invalid property");
+ locationProperties.put(key, params.get(key));
}
}
diff --git
a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
index a92e6f28cb..6d92ffffc2 100644
--- a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
+++ b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
@@ -248,3 +248,21 @@ c133 TEXT Yes false \N NONE
-- !plain_2 --
+-- !count_parquet_0 --
+1062734
+
+-- !count_parquet_1 --
+1062734
+
+-- !count_orc_0 --
+2777636
+
+-- !count_orc_1 --
+2777636
+
+-- !count_text_0 --
+144730
+
+-- !count_text_1 --
+144730
+
diff --git
a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
index 2f07106957..40dc3c2440 100644
---
a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
+++
b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
@@ -105,7 +105,70 @@ suite("test_hdfs_tvf_compression",
"p2,external,tvf,external_remote,external_rem
"column_separator" = '\001',
"compress_type" = "plain") where c2="abc" order by c3,c4,c10 limit
5;
"""
+
+ // test count(*) push down
+ def test_data_dir = "hdfs://${nameNodeHost}:${hdfsPort}"
+ // parquet
+ sql """set file_split_size=0;"""
+ qt_count_parquet_0 """
+ select count(*) from
+ HDFS(
+ "uri" =
"${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "parquet"
+ );
+ """
+
+ sql """set file_split_size=388608;"""
+ qt_count_parquet_1 """
+ select count(*) from
+ HDFS(
+ "uri" =
"${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "parquet"
+ );
+ """
+
+ // orc
+ sql """set file_split_size=0;"""
+ qt_count_orc_0 """
+ select count(*) from
+ HDFS(
+ "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "orc"
+ );
+ """
+
+ sql """set file_split_size=388608;"""
+ qt_count_orc_1 """
+ select count(*) from
+ HDFS(
+ "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "orc"
+ );
+ """
+ // text
+ sql """set file_split_size=0;"""
+ qt_count_text_0 """
+ select count(*) from
+ HDFS(
+ "uri" =
"${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "csv"
+ );
+ """
+ sql """set file_split_size=388608;"""
+ qt_count_text_1 """
+ select count(*) from
+ HDFS(
+ "uri" =
"${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
+ "fs.defaultFS" = "${baseFs}",
+ "format" = "csv"
+ );
+ """
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]