This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6fd8f5cd2f [Fix](parquet-reader) Fix parquet string column min/max statistics issue which caused incorrect query results. (#21675)
6fd8f5cd2f is described below
commit 6fd8f5cd2f1a45aa42776283f25d650572e0ffd4
Author: Qi Chen <[email protected]>
AuthorDate: Fri Jul 14 00:09:41 2023 +0800
[Fix](parquet-reader) Fix parquet string column min/max statistics issue which caused incorrect query results. (#21675)
In parquet, the legacy min and max statistics may not handle UTF-8 strings correctly.
This fix uses the min_value and max_value statistics introduced by PARQUET-1025 when they are present.
When they are not, string min/max filtering is temporarily skipped. A better approach would be to fall back
to the legacy min and max statistics when they contain only ASCII characters; I will improve this in a future PR.
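
A minimal sketch of that ASCII-only fallback, assuming a hypothetical helper named is_all_ascii (none of this is part of the commit): the legacy statistics can only be trusted for strings whose bytes all fall in 0x00-0x7F, where signed and unsigned byte orderings agree.

    #include <string>

    // Hypothetical helper, not in this patch: legacy string min/max can only be
    // trusted when every byte is ASCII, because signed and unsigned byte
    // orderings agree on the 0x00-0x7F range.
    static bool is_all_ascii(const std::string& encoded) {
        for (unsigned char c : encoded) {
            if (c > 0x7F) {
                return false;
            }
        }
        return true;
    }

    // Possible future use inside the TYPE_STRING branch of _filter_by_min_max:
    //     if (!use_min_max_value &&
    //         (!is_all_ascii(encoded_min) || !is_all_ascii(encoded_max))) {
    //         return false; // cannot trust legacy min/max for non-ASCII values
    //     }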
---
be/src/vec/exec/format/parquet/parquet_pred_cmp.h  |  14 +-
be/src/vec/exec/format/parquet/vparquet_reader.cpp |  15 ++-
.../hive/test_multi_langs.out                      | 148 +++++++++++++++++++++
.../hive/test_multi_langs.groovy                   |  61 +++++++++
4 files changed, 231 insertions(+), 7 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
index 8d3057312b..c76fa95f4a 100644
--- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
+++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
@@ -120,7 +120,7 @@ private:
     static bool _filter_by_min_max(const ColumnValueRange<primitive_type>& col_val_range,
                                    const ScanPredicate& predicate, const FieldSchema* col_schema,
                                    const std::string& encoded_min, const std::string& encoded_max,
-                                   const cctz::time_zone& ctz) {
+                                   const cctz::time_zone& ctz, bool use_min_max_value = false) {
         using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType;
         std::vector<CppType> predicate_values;
         for (const void* v : predicate.values) {
@@ -144,6 +144,13 @@ private:
case TYPE_CHAR:
[[fallthrough]];
case TYPE_STRING:
+            // TODO: In parquet, the legacy min and max statistics may not handle UTF-8 strings correctly.
+            // For now, only the min_value and max_value statistics introduced by PARQUET-1025 are used;
+            // otherwise string min/max filtering is skipped. A better approach would be to fall back to
+            // the legacy min and max statistics when they contain only ASCII characters.
+            if (!use_min_max_value) {
+                return false;
+            }
if constexpr (std::is_same_v<CppType, StringRef>) {
min_value = StringRef(encoded_min);
max_value = StringRef(encoded_max);
@@ -372,7 +379,8 @@ public:
     static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
                                 const FieldSchema* col_schema, bool is_set_min_max,
                                 const std::string& encoded_min, const std::string& encoded_max,
-                                bool is_all_null, const cctz::time_zone& ctz) {
+                                bool is_all_null, const cctz::time_zone& ctz,
+                                bool use_min_max_value = false) {
bool need_filter = false;
std::visit(
[&](auto&& range) {
@@ -387,7 +395,7 @@ public:
}
for (auto& filter : filters) {
                         need_filter |= _filter_by_min_max(range, filter, col_schema, encoded_min,
-                                                          encoded_max, ctz);
+                                                          encoded_max, ctz, use_min_max_value);
if (need_filter) {
break;
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 9b179384e2..fed33b6d28 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -836,15 +836,22 @@ Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::Co
auto& statistic = meta_data.statistics;
bool is_all_null =
                 (statistic.__isset.null_count && statistic.null_count == meta_data.num_values);
-        bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min);
+        bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min) ||
+                              (statistic.__isset.max_value && statistic.__isset.min_value);
if ((!is_set_min_max) && (!is_all_null)) {
continue;
}
const FieldSchema* col_schema = schema_desc.get_column(col_name);
// Min-max of statistic is plain-encoded value
-        *filter_group =
-                ParquetPredicate::filter_by_stats(slot_iter->second, col_schema, is_set_min_max,
-                                                  statistic.min, statistic.max, is_all_null, *_ctz);
+        if (statistic.__isset.min_value) {
+            *filter_group = ParquetPredicate::filter_by_stats(
+                    slot_iter->second, col_schema, is_set_min_max, statistic.min_value,
+                    statistic.max_value, is_all_null, *_ctz, true);
+        } else {
+            *filter_group = ParquetPredicate::filter_by_stats(
+                    slot_iter->second, col_schema, is_set_min_max, statistic.min, statistic.max,
+                    is_all_null, *_ctz, false);
+        }
if (*filter_group) {
break;
}
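
For background on why the legacy min/max fields are distrusted for strings: some older parquet writers computed them with signed byte comparison, which orders multi-byte UTF-8 values before ASCII ones, so a row group could be pruned even though it contains matching rows. The standalone example below is not from this patch, and the signed-ordering writer behaviour is an assumption about older writers rather than something this commit states; it only shows the two orderings disagreeing.

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Byte-wise comparison using *signed* chars, the ordering some older
    // parquet writers effectively used for the legacy min/max fields.
    static int signed_byte_compare(const std::string& a, const std::string& b) {
        size_t n = std::min(a.size(), b.size());
        for (size_t i = 0; i < n; ++i) {
            auto ca = static_cast<signed char>(a[i]);
            auto cb = static_cast<signed char>(b[i]);
            if (ca != cb) {
                return ca < cb ? -1 : 1;
            }
        }
        return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1);
    }

    int main() {
        std::string ascii = "a";             // single byte 0x61
        std::string hanzi = "\xE6\x98\xAF";  // UTF-8 encoding of "是", leading byte 0xE6
        // Unsigned (UTF-8 lexicographic) order, which the reader assumes: "a" < "是"
        std::cout << (ascii.compare(hanzi) < 0) << std::endl;              // prints 1
        // Signed-byte order: 0xE6 is negative, so "是" sorts before "a"
        std::cout << (signed_byte_compare(ascii, hanzi) < 0) << std::endl; // prints 0
        return 0;
    }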
diff --git a/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out b/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out
new file mode 100644
index 0000000000..bebfc26854
--- /dev/null
+++ b/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out
@@ -0,0 +1,148 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !01 --
+2 是
+
+-- !02 --
+1
+2 是
+3 III类户
+
+-- !03 --
+2 1
+
+-- !04 --
+5 ありがとう
+
+-- !05 --
+1 你好
+2 谢谢
+3 再见
+4 こんにちは
+5 ありがとう
+6 さようなら
+7 안녕하세요
+8 감사합니다
+9 안녕히 가세요
+10 Hola
+11 Gracias
+12 Adiós
+13 Hallo
+14 Danke
+15 Auf Wiedersehen
+16 مرحبا
+17 شكرًا
+18 مع السلامة
+19 Bonjour
+20 Merci
+21 Au revoir
+22 Ciao
+23 Grazie
+24 Arrivederci
+25 Olá
+26 Obrigado
+27 Adeus
+28 Hello
+29 Thank you
+30 Goodbye
+
+-- !06 --
+5 1
+
+-- !01 --
+2 是
+
+-- !02 --
+1
+2 是
+3 III类户
+
+-- !03 --
+2 1
+
+-- !04 --
+5 ありがとう
+
+-- !05 --
+1 你好
+2 谢谢
+3 再见
+4 こんにちは
+5 ありがとう
+6 さようなら
+7 안녕하세요
+8 감사합니다
+9 안녕히 가세요
+10 Hola
+11 Gracias
+12 Adiós
+13 Hallo
+14 Danke
+15 Auf Wiedersehen
+16 مرحبا
+17 شكرًا
+18 مع السلامة
+19 Bonjour
+20 Merci
+21 Au revoir
+22 Ciao
+23 Grazie
+24 Arrivederci
+25 Olá
+26 Obrigado
+27 Adeus
+28 Hello
+29 Thank you
+30 Goodbye
+
+-- !06 --
+5 1
+
+-- !01 --
+2 是
+
+-- !02 --
+1
+2 是
+3 III类户
+
+-- !03 --
+2 1
+
+-- !04 --
+5 ありがとう
+
+-- !05 --
+1 你好
+2 谢谢
+3 再见
+4 こんにちは
+5 ありがとう
+6 さようなら
+7 안녕하세요
+8 감사합니다
+9 안녕히 가세요
+10 Hola
+11 Gracias
+12 Adiós
+13 Hallo
+14 Danke
+15 Auf Wiedersehen
+16 مرحبا
+17 شكرًا
+18 مع السلامة
+19 Bonjour
+20 Merci
+21 Au revoir
+22 Ciao
+23 Grazie
+24 Arrivederci
+25 Olá
+26 Obrigado
+27 Adeus
+28 Hello
+29 Thank you
+30 Goodbye
+
+-- !06 --
+5 1
+
diff --git a/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy b/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy
new file mode 100644
index 0000000000..937fd9039a
--- /dev/null
+++ b/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_multi_langs", "p2") {
+
+ def formats = ["_parquet", "_orc", "_text"]
+ def q1 = """select * from test_chineseSUFFIX where col1='是' order by id"""
+ def q2 = """select * from test_chineseSUFFIX order by id"""
+    def q3 = """select id, count(col1) from test_chineseSUFFIX where col1='是' group by id order by id"""
+    def q4 = """select * from test_multi_langsSUFFIX where col1='ありがとう' order by id"""
+    def q5 = """select * from test_multi_langsSUFFIX order by id"""
+    def q6 = """select id, count(col1) from test_multi_langsSUFFIX where col1='ありがとう' group by id order by id"""
+
+ String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ try {
+            String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+            String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+ String catalog_name = "test_multi_langs"
+
+ sql """drop catalog if exists ${catalog_name};"""
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+ sql """use multi_catalog;"""
+ logger.info("use multi_catalog")
+
+ for (String format in formats) {
+ logger.info("Process format " + format)
+ qt_01 q1.replace("SUFFIX", format)
+ qt_02 q2.replace("SUFFIX", format)
+ qt_03 q3.replace("SUFFIX", format)
+ qt_04 q4.replace("SUFFIX", format)
+ qt_05 q5.replace("SUFFIX", format)
+ qt_06 q6.replace("SUFFIX", format)
+ }
+ sql """drop catalog if exists ${catalog_name}"""
+ } finally {
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]