This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 8b14577be5f [Branch3.0](Serde) Support hive compatible output format (#50469)
8b14577be5f is described below
commit 8b14577be5fbe558abba5744e7ac32e6b52489c7
Author: Tiewei Fang <[email protected]>
AuthorDate: Tue May 6 09:58:25 2025 +0800
[Branch3.0](Serde) Support hive compatible output format (#50469)
picked from: #49036
---
.../vec/data_types/serde/data_type_array_serde.cpp | 5 +-
.../vec/data_types/serde/data_type_map_serde.cpp | 7 +-
.../data_types/serde/data_type_number_serde.cpp | 9 +-
be/src/vec/data_types/serde/data_type_serde.h | 20 ++++
.../data_types/serde/data_type_struct_serde.cpp | 5 +-
be/src/vec/sink/vmysql_result_writer.cpp | 16 +++
.../org/apache/doris/nereids/NereidsPlanner.java | 1 +
.../java/org/apache/doris/qe/SessionVariable.java | 10 +-
gensrc/thrift/PaloInternalService.thrift | 3 +-
.../datatype_p0/serde/test_serde_dialect_hive.out | Bin 0 -> 2029 bytes
.../serde/test_serde_dialect_hive.groovy | 107 +++++++++++++++++++++
11 files changed, 175 insertions(+), 8 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp
b/be/src/vec/data_types/serde/data_type_array_serde.cpp
index 872dd84d8c7..e5fc7461e45 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp
@@ -336,7 +336,8 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const
IColumn& column,
const auto end_arr_element = offsets[row_idx_of_col_arr];
for (int j = begin_arr_element; j < end_arr_element; ++j) {
if (j != begin_arr_element) {
- if (0 != result.push_string(", ", 2)) {
+ if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+ options.mysql_collection_delim.size())) {
return Status::InternalError("pack mysql buffer failed.");
}
}
@@ -345,6 +346,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const
IColumn& column,
return Status::InternalError("pack mysql buffer failed.");
}
} else {
+ ++options.level;
if (is_nested_string && options.wrapper_len > 0) {
if (0 != result.push_string(options.nested_string_wrapper,
options.wrapper_len)) {
return Status::InternalError("pack mysql buffer failed.");
@@ -358,6 +360,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const
IColumn& column,
RETURN_IF_ERROR(
nested_serde->write_column_to_mysql(data, result, j,
false, options));
}
+ --options.level;
}
}
if (0 != result.push_string("]", 1)) {
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index 2140885942d..bf018ce3a80 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -418,7 +418,8 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const
IColumn& column,
auto& offsets = map_column.get_offsets();
for (auto j = offsets[col_index - 1]; j < offsets[col_index]; ++j) {
if (j != offsets[col_index - 1]) {
- if (0 != result.push_string(", ", 2)) {
+ if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+ options.mysql_collection_delim.size())) {
return Status::InternalError("pack mysql buffer failed.");
}
}
@@ -427,6 +428,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const
IColumn& column,
return Status::InternalError("pack mysql buffer failed.");
}
} else {
+ ++options.level;
if (is_key_string && options.wrapper_len > 0) {
if (0 != result.push_string(options.nested_string_wrapper,
options.wrapper_len)) {
return Status::InternalError("pack mysql buffer failed.");
@@ -440,6 +442,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const
IColumn& column,
RETURN_IF_ERROR(key_serde->write_column_to_mysql(nested_keys_column, result, j,
false,
options));
}
+ --options.level;
}
if (0 != result.push_string(&options.map_key_delim, 1)) {
return Status::InternalError("pack mysql buffer failed.");
@@ -449,6 +452,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const
IColumn& column,
return Status::InternalError("pack mysql buffer failed.");
}
} else {
+ ++options.level;
if (is_val_string && options.wrapper_len > 0) {
if (0 != result.push_string(options.nested_string_wrapper,
options.wrapper_len)) {
return Status::InternalError("pack mysql buffer failed.");
@@ -462,6 +466,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const
IColumn& column,
RETURN_IF_ERROR(value_serde->write_column_to_mysql(nested_values_column,
result, j,
false,
options));
}
+ --options.level;
}
}
if (0 != result.push_string("}", 1)) {
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index 972668e65fd..e81343c9ede 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -271,8 +271,15 @@ Status
DataTypeNumberSerDe<T>::_write_column_to_mysql(const IColumn& column,
int buf_ret = 0;
auto& data = assert_cast<const ColumnType&>(column).get_data();
const auto col_index = index_check_const(row_idx, col_const);
- if constexpr (std::is_same_v<T, Int8> || std::is_same_v<T, UInt8>) {
+ if constexpr (std::is_same_v<T, Int8>) {
buf_ret = result.push_tinyint(data[col_index]);
+ } else if constexpr (std::is_same_v<T, UInt8>) {
+ if (options.level > 0 && !options.is_bool_value_num) {
+ std::string bool_value = data[col_index] ? "true" : "false";
+ result.push_string(bool_value.c_str(), bool_value.size());
+ } else {
+ buf_ret = result.push_tinyint(data[col_index]);
+ }
} else if constexpr (std::is_same_v<T, Int16> || std::is_same_v<T,
UInt16>) {
buf_ret = result.push_smallint(data[col_index]);
} else if constexpr (std::is_same_v<T, Int32> || std::is_same_v<T,
UInt32>) {
diff --git a/be/src/vec/data_types/serde/data_type_serde.h
b/be/src/vec/data_types/serde/data_type_serde.h
index 105b1bbaedd..c46b3f311a2 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -181,6 +181,26 @@ public:
const char* nested_string_wrapper;
int wrapper_len;
+ /**
+ * mysql_collection_delim is used to separate elements in collection, such as array, map, struct
+ * It is used to write to mysql.
+ */
+ std::string mysql_collection_delim = ", ";
+
+ /**
+ * is_bool_value_num is used to display bool value in collection, such as array, map, struct
+ * eg, if set to true, the array<true> will be:
+ * [1]
+ * if set to false, the array<true> will be:
+ * [true]
+ */
+ bool is_bool_value_num = true;
+
+ /**
+ * Indicate the nested level of column. It is used to control some behavior of serde
+ */
+ mutable int level = 0;
+
[[nodiscard]] char get_collection_delimiter(
int hive_text_complex_type_delimiter_level) const {
CHECK(0 <= hive_text_complex_type_delimiter_level &&
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index aefea80f0c7..0b1bb025482 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -352,7 +352,8 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const
IColumn& column,
bool begin = true;
for (size_t j = 0; j < elem_serdes_ptrs.size(); ++j) {
if (!begin) {
- if (0 != result.push_string(", ", 2)) {
+ if (0 != result.push_string(options.mysql_collection_delim.c_str(),
+ options.mysql_collection_delim.size())) {
return Status::InternalError("pack mysql buffer failed.");
}
}
@@ -376,6 +377,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const
IColumn& column,
return Status::InternalError("pack mysql buffer failed.");
}
} else {
+ ++options.level;
if (remove_nullable(col.get_column_ptr(j))->is_column_string() &&
options.wrapper_len > 0) {
if (0 != result.push_string(options.nested_string_wrapper,
options.wrapper_len)) {
@@ -390,6 +392,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const
IColumn& column,
RETURN_IF_ERROR(elem_serdes_ptrs[j]->write_column_to_mysql(
col.get_column(j), result, col_index, false, options));
}
+ --options.level;
}
begin = false;
}
diff --git a/be/src/vec/sink/vmysql_result_writer.cpp
b/be/src/vec/sink/vmysql_result_writer.cpp
index 932ee955590..0f69fe9f5b9 100644
--- a/be/src/vec/sink/vmysql_result_writer.cpp
+++ b/be/src/vec/sink/vmysql_result_writer.cpp
@@ -123,6 +123,8 @@ Status VMysqlResultWriter<is_binary_format>::_set_options(
_options.map_key_delim = ':';
_options.null_format = "null";
_options.null_len = 4;
+ _options.mysql_collection_delim = ", ";
+ _options.is_bool_value_num = true;
break;
case TSerdeDialect::PRESTO:
// eg:
@@ -133,6 +135,20 @@ Status VMysqlResultWriter<is_binary_format>::_set_options(
_options.map_key_delim = '=';
_options.null_format = "NULL";
_options.null_len = 4;
+ _options.mysql_collection_delim = ", ";
+ _options.is_bool_value_num = true;
+ break;
+ case TSerdeDialect::HIVE:
+ // eg:
+ // array: ["abc","def","",null]
+ // map: {"k1":null,"k2":"v3"}
+ _options.nested_string_wrapper = "\"";
+ _options.wrapper_len = 1;
+ _options.map_key_delim = ':';
+ _options.null_format = "null";
+ _options.null_len = 4;
+ _options.mysql_collection_delim = ",";
+ _options.is_bool_value_num = false;
break;
default:
return Status::InternalError("unknown serde dialect: {}",
serde_dialect);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
index 885a32c70b2..3a987a9443a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
@@ -785,6 +785,7 @@ public class NereidsPlanner extends Planner {
statementContext.setFormatOptions(FormatOptions.getForPresto());
break;
case "doris":
+ case "hive":
statementContext.setFormatOptions(FormatOptions.getDefault());
break;
default:
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 69f9e978044..f4faf33b90f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -4483,9 +4483,11 @@ public class SessionVariable implements Serializable,
Writable {
throw new UnsupportedOperationException("serdeDialect value is
empty");
}
- if (!serdeDialect.equalsIgnoreCase("doris") && !serdeDialect.equalsIgnoreCase("presto")
- && !serdeDialect.equalsIgnoreCase("trino")) {
- LOG.warn("serdeDialect value is invalid, the invalid value is {}", serdeDialect);
+ if (!serdeDialect.equalsIgnoreCase("doris")
+ && !serdeDialect.equalsIgnoreCase("presto")
+ && !serdeDialect.equalsIgnoreCase("trino")
+ && !serdeDialect.equalsIgnoreCase("hive")) {
+ LOG.warn("serde dialect value is invalid, the invalid value is {}", serdeDialect);
throw new UnsupportedOperationException(
"sqlDialect value is invalid, the invalid value is " +
serdeDialect);
}
@@ -4657,6 +4659,8 @@ public class SessionVariable implements Serializable,
Writable {
case "presto":
case "trino":
return TSerdeDialect.PRESTO;
+ case "hive":
+ return TSerdeDialect.HIVE;
default:
throw new IllegalArgumentException("Unknown serde dialect: " +
serdeDialect);
}
diff --git a/gensrc/thrift/PaloInternalService.thrift
b/gensrc/thrift/PaloInternalService.thrift
index 21b1898d029..a15195d4cbd 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -83,7 +83,8 @@ struct TResourceLimit {
enum TSerdeDialect {
DORIS = 0,
- PRESTO = 1
+ PRESTO = 1,
+ HIVE = 2
}
// Query options that correspond to PaloService.PaloQueryOptions,
diff --git a/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out
b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out
new file mode 100644
index 00000000000..3ea1043cdf6
Binary files /dev/null and
b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out differ
diff --git
a/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy
b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy
new file mode 100644
index 00000000000..b8e3037d770
--- /dev/null
+++ b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_serde_dialect_hive", "p0") {
+
+ sql """create database if not exists test_serde_dialect_hive;"""
+ sql """use test_serde_dialect_hive;"""
+ sql """drop table if exists test_serde_dialect_hive_tbl"""
+ sql """
+ create table if not exists test_serde_dialect_hive_tbl (
+ c1 tinyint,
+ c2 smallint,
+ c3 int,
+ c4 bigint,
+ c5 largeint,
+ c6 float,
+ c7 double,
+ c8 decimal(27, 9),
+ c9 date,
+ c10 datetime,
+ c11 datetime(6),
+ c12 ipv4,
+ c13 ipv6,
+ c14 string,
+ c15 char(6),
+ c16 varchar(1024),
+ c17 boolean,
+ c18 json,
+ c19 array<int>,
+ c20 array<double>,
+ c21 array<decimal(10, 5)>,
+ c22 array<string>,
+ c23 array<map<string, string>>,
+ c24 array<array<string>>,
+ c25 array<struct<s_id:int(11), s_name:string, s_address:string>>,
+ c26 array<struct<s_id:struct<k1:string, k2:decimal(10,2)>, s_name:array<ipv4>, s_address:map<string, ipv6>>>,
+ c27 map<string, string>,
+ c28 map<string, array<array<string>>>,
+ c29 map<int, map<string, array<array<string>>>>,
+ c30 map<decimal(5, 3), array<struct<s_id:struct<k1:string, k2:decimal(10,2)>, s_name:array<string>, s_address:map<string, string>>>>,
+ c31 struct<s_id:int(11), s_name:string, s_address:string>,
+ c32 struct<s_id:int(11), s_name:array<string>, s_address:string>,
+ c33 array<date>,
+ c34 array<datetime(3)>,
+ c35 array<boolean>,
+ c36 struct<s_id:int(11), s_name:string, s_gender:boolean>,
+ c37 map<string, boolean>
+ )
+ distributed by random buckets 1
+ properties("replication_num" = "1");
+ """
+
+ sql """
+ insert into test_serde_dialect_hive_tbl
+ (c1, c2,c3, c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c27,c28,c29,c31,c32,c33,c34,c35,c36,c37)
+ values(
+ 1,2,3,4,5,1.1,2.0000,123456.123456789,"2024-06-30", "2024-06-30 10:10:11", "2024-06-30 10:10:11.123456",
+ '59.50.185.152',
+ 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff',
+ 'this is a string with , and "',
+ 'abc ef',
+ ' 123ndedwdw',
+ true,
+ '[1, 2, 3, 4, 5]',
+ [1,2,3,null,5],
+ [1.1,2.1,3.1,null,5.00],
+ [1.1,2.1,3.00000,null,5.12345],
+ ['abc', 'de, f"', null, ''],
+ [{'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, {'k1': 'v1', 'k2': null, 'k3 , "abc':'', 'k4':'a , "a'}],
+ [['abc', 'de, f"', null, ''],[],null],
+ {'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'},
+ {'k1': [['abc', 'de, f"', null, ''],[],null], 'k2': null},
+ {10: {'k1': [['abc', 'de, f"', null, ''],[],null]}, 11: null},
+ named_struct('s_id', 100, 's_name', 'abc , "', 's_address', null),
+ named_struct('s_id', null, 's_name', ['abc', 'de, f"', null, ''], 's_address', ''),
+ ['2024-06-01',null,'2024-06-03'],
+ ['2024-06-01 10:10:10',null,'2024-06-03 01:11:23.123'],
+ [true, true, false, false, true, false, false],
+ named_struct('s_id', 100, 's_name', 'abc , "', 's_gender', true),
+ {'k1': false, 'k2': true, 'k3':true, 'k4': false}
+ );
+ """
+
+ sql """set serde_dialect="doris";"""
+ qt_sql01 """select * from test_serde_dialect_hive_tbl"""
+ sql """set serde_dialect="hive";"""
+ qt_sql01 """select * from test_serde_dialect_hive_tbl"""
+
+ test {
+ sql """set serde_dialect="invalid""""
+ exception "sqlDialect value is invalid"
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]