This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f1db6bd8c1 [feature](hive) add support for struct and map column types on textfile format of hive table (#22347)
f1db6bd8c1 is described below
commit f1db6bd8c1a652bd5171de508ba8c9c41073d605
Author: daidai <[email protected]>
AuthorDate: Thu Aug 10 13:47:58 2023 +0800
[feature](hive) add support for struct and map column types on textfile format of hive table (#22347)
1. Add support for struct and map column types on the textfile format of Hive tables.
2. Optimize the code that handles the array column type.
For example, querying a map<string,string> column now returns:
```mysql
+------+------------------------------------+
| id | perf |
+------+------------------------------------+
| 1 | {"key1":"value1", "key2":"value2"} |
| 1 | {"key1":"value1", "key2":"value2"} |
| 2 | {"name":"John", "age":"30"} |
+------+------------------------------------+
```
And querying a struct column returns:
```mysql
+---------+------------------+
| column1 | column2 |
+---------+------------------+
| 1 | {10, "data1", 1} |
| 2 | {20, "data2", 0} |
| 3 | {30, "data3", 1} |
+---------+------------------+
```
Summary of supported complex types (a custom delimiter can be assigned):
1. array< primitive_type > and array< array< ... > >
2. map< primitive_type , primitive_type >
3. Struct< primitive_type , primitive_type ... >
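To make the delimiter handling concrete, here is a minimal standalone sketch of how a textfile map field is decoded. It is not code from this patch, and `parse_text_map` is a hypothetical helper; it only assumes the defaults this commit wires through: '\2' between collection elements and '\3' between a map key and its value, splitting each entry on the first key/value delimiter, as the BE code does.
```cpp
#include <iostream>
#include <map>
#include <string>

// Split one Hive textfile map field into key/value pairs.
// collection_delim separates entries; kv_delim separates key from value.
std::map<std::string, std::string> parse_text_map(const std::string& field,
                                                  char collection_delim = '\2',
                                                  char kv_delim = '\3') {
    std::map<std::string, std::string> result;
    size_t from = 0;
    for (size_t i = 0; i <= field.size(); i++) {
        if (i == field.size() || field[i] == collection_delim) {
            std::string entry = field.substr(from, i - from);
            // Split on the first kv delimiter, mirroring the 'kv <= from'
            // check in the patch.
            size_t kv = entry.find(kv_delim);
            if (kv != std::string::npos) {
                result[entry.substr(0, kv)] = entry.substr(kv + 1);
            }
            from = i + 1;
        }
    }
    return result;
}

int main() {
    // Encodes {"key1":"value1", "key2":"value2"} as Hive text would.
    std::string field = std::string("key1") + '\3' + "value1" + '\2' +
                        "key2" + '\3' + "value2";
    for (const auto& [k, v] : parse_text_map(field)) {
        std::cout << k << " -> " << v << "\n";
    }
    return 0;
}
```
With those defaults, the map rows in the first example above would be stored on disk as `key1\3value1\2key2\3value2`.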
---
be/src/exec/text_converter.cpp | 288 +++++++--------------
be/src/exec/text_converter.h | 18 +-
be/src/vec/exec/format/csv/csv_reader.cpp | 16 +-
be/src/vec/exec/format/csv/csv_reader.h | 6 +-
.../doris/planner/external/HiveScanNode.java | 68 ++++-
gensrc/thrift/PlanNodes.thrift | 3 +-
.../hive/test_hive_text_complex_type.out | 15 ++
.../hive/test_hive_text_complex_type.groovy | 46 ++++
8 files changed, 248 insertions(+), 212 deletions(-)
diff --git a/be/src/exec/text_converter.cpp b/be/src/exec/text_converter.cpp
index 5cfa078fb8..1346b14a7b 100644
--- a/be/src/exec/text_converter.cpp
+++ b/be/src/exec/text_converter.cpp
@@ -35,16 +35,20 @@
#include "util/string_parser.hpp"
#include "vec/columns/column_array.h"
#include "vec/columns/column_complex.h"
+#include "vec/columns/column_map.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
#include "vec/runtime/vdatetime_value.h"
namespace doris {
-TextConverter::TextConverter(char escape_char, char array_delimiter)
- : _escape_char(escape_char), _array_delimiter(array_delimiter) {}
+TextConverter::TextConverter(char escape_char, char collection_delimiter, char map_kv_delimiter)
+ : _escape_char(escape_char),
+ _collection_delimiter(collection_delimiter),
+ _map_kv_delimiter(map_kv_delimiter) {}
void TextConverter::write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
@@ -62,12 +66,15 @@ void TextConverter::write_string_column(const SlotDescriptor* slot_desc,
}
}
-bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
- vectorized::IColumn* nullable_col_ptr, const char* data,
- size_t len, bool copy_string, bool need_escape, size_t rows) {
+bool TextConverter::_write_data(const TypeDescriptor& type_desc,
+ vectorized::IColumn* nullable_col_ptr, const char* data, size_t len,
+ bool copy_string, bool need_escape, size_t rows,
+ char array_delimiter) {
vectorized::IColumn* col_ptr = nullable_col_ptr;
// \N means it's NULL
- if (slot_desc->is_nullable()) {
+ std::string col_type_name = col_ptr->get_name();
+ bool is_null_able = typeid(*nullable_col_ptr) == typeid(vectorized::ColumnNullable);
+ if (is_null_able) {
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr);
if ((len == 2 && data[0] == '\\' && data[1] == 'N') || len == SQL_NULL_DATA) {
nullable_column->insert_many_defaults(rows);
@@ -82,7 +89,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
size_t origin_size = col_ptr->size();
// Parse the raw-text data. Translate the text string to internal format.
- switch (slot_desc->type().type) {
+ switch (type_desc.type) {
case TYPE_HLL: {
HyperLogLog hyper_log_log(Slice(data, len));
auto& hyper_data = reinterpret_cast<vectorized::ColumnHLL*>(col_ptr)->get_data();
@@ -244,7 +251,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
case TYPE_DECIMAL32: {
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
int32_t value = StringParser::string_to_decimal<TYPE_DECIMAL32, int32_t>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -257,7 +264,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
case TYPE_DECIMAL64: {
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
int64_t value = StringParser::string_to_decimal<TYPE_DECIMAL64, int64_t>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -271,7 +278,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
vectorized::Int128 value =
StringParser::string_to_decimal<TYPE_DECIMAL128I, vectorized::Int128>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -282,200 +289,94 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
break;
}
case TYPE_ARRAY: {
- std::function<vectorized::Array(int, int, char, const TypeDescriptor&)> func =
- [&](int left, int right, char split,
- const TypeDescriptor& type) -> vectorized::Array {
- vectorized::Array array;
- int fr = left;
- for (int i = left; i <= right + 1; i++) {
- auto Sub_type = type.children[0];
- if (i <= right && data[i] != split && data[i] != _array_delimiter) {
- continue;
- }
- if (Sub_type.type == TYPE_ARRAY) {
- array.push_back(func(fr, i - 1, split + 1, Sub_type));
- } else {
- StringParser::ParseResult local_parse_result = StringParser::PARSE_SUCCESS;
- switch (Sub_type.type) {
- case TYPE_HLL: {
- DCHECK(false) << "not support type: "
- << "array<HyperLogLog>\n";
- break;
- }
- case TYPE_STRING:
- case TYPE_VARCHAR:
- case TYPE_CHAR: {
- size_t sz = i - fr;
- if (need_escape) {
- unescape_string_on_spot(data + fr, &sz);
- }
- array.push_back(std::string(data + fr, sz));
- break;
- }
- case TYPE_BOOLEAN: {
- bool num = StringParser::string_to_bool(data + fr, i - fr,
- &local_parse_result);
- array.push_back((uint8_t)num);
- break;
- }
- case TYPE_TINYINT: {
- int8_t num = StringParser::string_to_int<int8_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_SMALLINT: {
- int16_t num = StringParser::string_to_int<int16_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_INT: {
- int32_t num = StringParser::string_to_int<int32_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_BIGINT: {
- int64_t num = StringParser::string_to_int<int64_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_LARGEINT: {
- __int128 num = StringParser::string_to_int<__int128>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_FLOAT: {
- float num = StringParser::string_to_float<float>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_DOUBLE: {
- double num = StringParser::string_to_float<double>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_DATE: {
- vectorized::VecDateTimeValue ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- ts_slot.cast_to_date();
- array.push_back(*reinterpret_cast<int64_t*>(&ts_slot));
- break;
- }
- case TYPE_DATEV2: {
- vectorized::DateV2Value<vectorized::DateV2ValueType> ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- uint32_t int_val = ts_slot.to_date_int_val();
- array.push_back(int_val);
- break;
- }
- case TYPE_DATETIME: {
- vectorized::VecDateTimeValue ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- ts_slot.to_datetime();
- array.push_back((int64_t)ts_slot);
- break;
- }
- case TYPE_DATETIMEV2: {
- vectorized::DateV2Value<vectorized::DateTimeV2ValueType> ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- uint64_t int_val = ts_slot.to_date_int_val();
- array.push_back(int_val);
- break;
- }
- case TYPE_DECIMALV2: {
- DecimalV2Value decimal_slot;
- if (decimal_slot.parse_from_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(decimal_slot.value());
- break;
- }
- case TYPE_DECIMAL32: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- int32_t value = StringParser::string_to_decimal<TYPE_DECIMAL32, int32_t>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale, &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- case TYPE_DECIMAL64: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- int64_t value = StringParser::string_to_decimal<TYPE_DECIMAL64, int64_t>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale, &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- case TYPE_DECIMAL128I: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- vectorized::Int128 value =
- StringParser::string_to_decimal<TYPE_DECIMAL128I,
- vectorized::Int128>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale,
- &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- default: {
- DCHECK(false) << "bad slot type: array<" << Sub_type << ">";
- break;
- }
- }
+ auto col = reinterpret_cast<vectorized::ColumnArray*>(col_ptr);
+
+ std::vector<std::pair<size_t, size_t>> ranges;
+ for (size_t i = 0, from = 0; i <= len; i++) {
+ if (i < len && data[i] != array_delimiter && data[i] != _collection_delimiter) {
+ continue;
+ }
+ ranges.push_back({from, i - from});
+ from = i + 1;
+ }
+
+ auto sub_type = type_desc.children[0];
+ for (int i = 0; i < rows; i++) {
+ for (auto range : ranges) {
+ _write_data(sub_type, &col->get_data(), data + range.first, range.second,
+ copy_string, need_escape, 1, array_delimiter + 1);
+ }
+ col->get_offsets().push_back(col->get_offsets().back() + ranges.size());
+ }
+
+ break;
+ }
+ case TYPE_MAP: {
+ auto col = reinterpret_cast<vectorized::ColumnMap*>(col_ptr);
- if (local_parse_result != StringParser::PARSE_SUCCESS) {
- parse_result = local_parse_result;
- return array;
- }
- }
- fr = i + 1;
+ std::vector<std::array<size_t, 3>> ranges;
+ for (size_t i = 0, from = 0, kv = 0; i <= len; i++) {
+ /*
+ * In Hive, if you set ':' as the map key/value delimiter, a
+ * map<int,timestamp> column is read back correctly, but
+ * map<timestamp,int> and map<timestamp,timestamp> columns are not,
+ * because those fields contain '_map_kv_delimiter' more than once.
+ *
+ * So 'kv <= from' is used to take the first occurrence of _map_kv_delimiter.
+ */
+ if (i < len && data[i] == _map_kv_delimiter && kv <= from) {
+ kv = i;
+ continue;
+ }
+ if (i == len || data[i] == _collection_delimiter) {
+ ranges.push_back({from, kv, i - 1});
+ from = i + 1;
}
- return array;
- };
+ }
- auto array = func(0, len - 1, '\002', slot_desc->type());
+ auto key_type = type_desc.children[0];
+ auto value_type = type_desc.children[1];
for (int i = 0; i < rows; i++) {
- reinterpret_cast<vectorized::ColumnArray*>(col_ptr)->insert(array);
+ for (auto range : ranges) {
+ _write_data(key_type, &col->get_keys(), data + range[0], range[1] - range[0],
+ copy_string, need_escape, 1, array_delimiter + 1);
+
+ _write_data(value_type, &col->get_values(), data + range[1] + 1,
+ range[2] - range[1], copy_string, need_escape, 1, array_delimiter + 1);
+ }
+
+ col->get_offsets().push_back(col->get_offsets().back() + ranges.size());
}
break;
}
+ case TYPE_STRUCT: {
+ auto col = reinterpret_cast<vectorized::ColumnStruct*>(col_ptr);
+
+ std::vector<std::pair<size_t, size_t>> ranges;
+ for (size_t i = 0, from = 0; i <= len; i++) {
+ if (i == len || data[i] == _collection_delimiter) {
+ ranges.push_back({from, i - from});
+ from = i + 1;
+ }
+ }
+ for (int i = 0; i < rows; i++) {
+ for (size_t loc = 0; loc < col->get_columns().size(); loc++) {
+ _write_data(type_desc.children[loc], &col->get_column(loc),
+ data + ranges[loc].first, ranges[loc].second, copy_string, need_escape,
+ rows, array_delimiter + 1);
+ }
+ }
+ break;
+ }
default:
- DCHECK(false) << "bad slot type: " << slot_desc->type();
+ DCHECK(false) << "bad slot type: " << type_desc;
break;
}
if (UNLIKELY(parse_result == StringParser::PARSE_FAILURE)) {
- if (true == slot_desc->is_nullable()) {
+ if (is_null_able) {
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr);
size_t size = nullable_column->get_null_map_data().size();
doris::vectorized::NullMap& null_map_data = nullable_column->get_null_map_data();
@@ -489,6 +390,13 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
return true;
}
+bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
+ vectorized::IColumn* nullable_col_ptr, const char* data,
+ size_t len, bool copy_string, bool need_escape, size_t rows) {
+ return _write_data(slot_desc->type(), nullable_col_ptr, data, len, copy_string, need_escape,
+ rows, '\2');
+}
+
void TextConverter::unescape_string_on_spot(const char* src, size_t* len) {
const char* start = src;
char* dest_ptr = const_cast<char*>(src);
diff --git a/be/src/exec/text_converter.h b/be/src/exec/text_converter.h
index 083c7c6881..ef4e87f5a5 100644
--- a/be/src/exec/text_converter.h
+++ b/be/src/exec/text_converter.h
@@ -31,7 +31,7 @@ class TextConverter {
public:
static constexpr char NULL_STR[3] = {'\\', 'N', '\0'};
- TextConverter(char escape_char, char array_delimiter = '\2');
+ TextConverter(char escape_char, char collection_delimiter = '\2', char map_kv_delimiter = '\3');
void write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
@@ -57,11 +57,23 @@ public:
size_t rows);
void unescape_string_on_spot(const char* src, size_t* len);
- void set_array_delimiter(char array_delimiter) { _array_delimiter = array_delimiter; }
+ void set_collection_delimiter(char collection_delimiter) {
+ _collection_delimiter = collection_delimiter;
+ }
+ void set_map_kv_delimiter(char mapkv_delimiter) { _map_kv_delimiter = mapkv_delimiter; }
private:
+ bool _write_data(const TypeDescriptor& type_desc, vectorized::IColumn* nullable_col_ptr,
+ const char* data, size_t len, bool copy_string, bool need_escape, size_t rows,
+ char array_delimiter);
+ char array_delimiter);
+
char _escape_char;
- char _array_delimiter;
+
+ // struct, array and map delimiter
+ char _collection_delimiter;
+
+ // map key and value delimiter
+ char _map_kv_delimiter;
};
} // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 59bae18dcc..93cc148248 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -200,9 +200,11 @@ Status CsvReader::init_reader(bool is_load) {
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
- //get array delimiter
- _array_delimiter = _params.file_attributes.text_params.array_delimiter;
- _text_converter->set_array_delimiter(_array_delimiter[0]);
+ _collection_delimiter = _params.file_attributes.text_params.collection_delimiter;
+ _text_converter->set_collection_delimiter(_collection_delimiter[0]);
+
+ _map_kv_delimiter = _params.file_attributes.text_params.mapkv_delimiter;
+ _text_converter->set_map_kv_delimiter(_map_kv_delimiter[0]);
if (_params.file_attributes.__isset.trim_double_quotes) {
_trim_double_quotes = _params.file_attributes.trim_double_quotes;
@@ -693,9 +695,11 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* is_parse_name) {
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
- //get array delimiter
- _array_delimiter = _params.file_attributes.text_params.array_delimiter;
- _text_converter->set_array_delimiter(_array_delimiter[0]);
+ _collection_delimiter = _params.file_attributes.text_params.collection_delimiter;
+ _text_converter->set_collection_delimiter(_collection_delimiter[0]);
+
+ _map_kv_delimiter = _params.file_attributes.text_params.mapkv_delimiter;
+ _text_converter->set_map_kv_delimiter(_map_kv_delimiter[0]);
// create decompressor.
// _decompressor may be nullptr if this is not a compressed file
diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h
index 42178846f1..a1577a638e 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -144,7 +144,11 @@ private:
std::string _value_separator;
std::string _line_delimiter;
- std::string _array_delimiter;
+
+ // struct, array and map delimiter
+ std::string _collection_delimiter;
+ // map key and value delimiter
+ std::string _map_kv_delimiter;
int _value_separator_length;
int _line_delimiter_length;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
index 211f6e8056..61a571358a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
@@ -20,11 +20,15 @@ package org.apache.doris.planner.external;
import org.apache.doris.analysis.FunctionCallExpr;
import org.apache.doris.analysis.SlotDescriptor;
import org.apache.doris.analysis.TupleDescriptor;
+import org.apache.doris.catalog.ArrayType;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ListPartitionItem;
+import org.apache.doris.catalog.MapType;
import org.apache.doris.catalog.PartitionItem;
+import org.apache.doris.catalog.StructField;
+import org.apache.doris.catalog.StructType;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.catalog.Type;
import org.apache.doris.catalog.external.HMSExternalTable;
@@ -71,9 +75,12 @@ public class HiveScanNode extends FileQueryScanNode {
public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";
- public static final String PROP_ARRAY_DELIMITER_HIVE2 = "colelction.delim";
- public static final String PROP_ARRAY_DELIMITER_HIVE3 = "collection.delim";
- public static final String DEFAULT_ARRAY_DELIMITER = "\2";
+ public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
+ public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
+ public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
+
+ public static final String PROP_MAP_KV_DELIMITER = "mapkey.delim";
+ public static final String DEFAULT_MAP_KV_DELIMITER = "\003";
protected final HMSExternalTable hmsTable;
private HiveTransaction hiveTransaction = null;
@@ -104,10 +111,46 @@ public class HiveScanNode extends FileQueryScanNode {
String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat();
if (inputFormat.contains("TextInputFormat")) {
for (SlotDescriptor slot : desc.getSlots()) {
- if (slot.getType().isMapType() || slot.getType().isStructType()) {
+ if (slot.getType().isScalarType()) {
+ continue;
+ }
+ boolean supported = true;
+
+ // support Array<primitive_type> and array<array<...>>
+ if (slot.getType().isArrayType()) {
+ ArrayType arraySubType = (ArrayType) slot.getType();
+ while (true) {
+ if (arraySubType.getItemType().isArrayType()) {
+ arraySubType = (ArrayType) arraySubType.getItemType();
+ continue;
+ }
+ if (!arraySubType.getItemType().isScalarType()) {
+ supported = false;
+ }
+ break;
+ }
+ } else if (slot.getType().isMapType()) { // support map<primitive_type,primitive_type>
+ if (!((MapType) slot.getType()).getValueType().isScalarType()) {
+ supported = false;
+ }
+ } else if (slot.getType().isStructType()) { // support Struct<primitive_type,primitive_type ... >
+ StructType structSubType = (StructType) slot.getType();
+ structSubType.getColumnSize();
+ for (StructField f : structSubType.getFields()) {
+ if (!f.getType().isScalarType()) {
+ supported = false;
+ }
+ }
+ }
+
+ if (supported == false) {
throw new UserException("For column `" + slot.getColumn().getName()
- + "`, The column types MAP/STRUCT are not supported yet"
- + " for text input format of Hive. ");
+ + "`, the column type is not supported yet"
+ + " for text input format of Hive.\n"
+ + "For complex types, now supported:\n"
+ + "\t1. array< primitive_type > and array< array< ... > >\n"
+ + "\t2. map< primitive_type , primitive_type >\n"
+ + "\t3. Struct< primitive_type , primitive_type ... >\n");
}
}
}
@@ -281,12 +324,15 @@ public class HiveScanNode extends FileQueryScanNode {
java.util.Map<String, String> delimiter = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters();
textParams.setColumnSeparator(delimiter.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER));
textParams.setLineDelimiter(delimiter.getOrDefault(PROP_LINE_DELIMITER, DEFAULT_LINE_DELIMITER));
- if (delimiter.get(PROP_ARRAY_DELIMITER_HIVE2) != null) {
- textParams.setArrayDelimiter(delimiter.get(PROP_ARRAY_DELIMITER_HIVE2));
- } else if (delimiter.get(PROP_ARRAY_DELIMITER_HIVE3) != null) {
- textParams.setArrayDelimiter(delimiter.get(PROP_ARRAY_DELIMITER_HIVE3));
+ textParams.setMapkvDelimiter(delimiter.getOrDefault(PROP_MAP_KV_DELIMITER, DEFAULT_MAP_KV_DELIMITER));
+
+ // textParams.collection_delimiter is the map, array and struct delimiter
+ if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2) != null) {
+ textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2));
+ } else if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3) != null) {
+ textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3));
} else {
- textParams.setArrayDelimiter(DEFAULT_ARRAY_DELIMITER);
+ textParams.setCollectionDelimiter(DEFAULT_COLLECTION_DELIMITER);
}
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index b0c2ae5b96..a36368ef7a 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -243,7 +243,8 @@ struct TEsScanRange {
struct TFileTextScanRangeParams {
1: optional string column_separator;
2: optional string line_delimiter;
- 3: optional string array_delimiter;
+ 3: optional string collection_delimiter; // array, map and struct delimiter
+ 4: optional string mapkv_delimiter;
}
struct TFileScanSlotInfo {
diff --git a/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out b/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out
new file mode 100644
index 0000000000..a04b9c1def
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+1 {101:1} {102:10} {"field1":100} {"field2":2000000} {"field3":300000000} {"field4":3.14} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+2 {201:1} {202:11} {"field1":200} {"field2":9000000} {"field3":8000000000} {"field4":9.13321} {"field5":322.14159} {203:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 201, 300011000, 44444444444, 3.14, 3.14159, "world", 2023-07-28 12:34:56.000000, 2023-06-28}
+3 {201:1} {202:10} {"field1":120} {"field2":44440000} {"field3":700000000} {"field4":3.100004} {"field5":3.00014159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 700, 300011000, 3333333334, 3.00014, 3.3314159, "hello world", 2023-07-28 01:34:56.000000, 2023-07-27}
+10 {101:1, 102:1, 103:1} {102:10, 104:1, 105:2} {"field1":100, "field0":100} {"field2":3000000} {"field3":300000000} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000, "field000006":2023-07-08 12:34:57.000000, "field2432456":2023-07-28 12:34:50.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+11 {101:1, 102:1, 13:1, 12:1} {102:10, 14:1, 15:2, 12:10} {"field1":100, "fie88ld0":100, "fieweld0":100, "fieeeld1":100, "fieeeld0":100, "feeield0":100, "feeield1":100, "firreld0":100, "field0":100} {"field2":3000000, "abcd":4000000, "1231":3000000} {"fi7eld3":300000000, "field30":300000000, "fielwwd3":300000000, "fi055":300000000, "field7":300000121323} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello", 0:"hello"} {"field6":2023-07-28 12:34:56.000000, " [...]
+
+-- !sql2 --
+1 {101:1} {102:10} {"field1":100} {"field2":2000000} {"field3":300000000} {"field4":3.14} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+2 {201:1} {202:11} {"field1":200} {"field2":9000000} {"field3":8000000000} {"field4":9.13321} {"field5":322.14159} {203:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 201, 300011000, 44444444444, 3.14, 3.14159, "world", 2023-07-28 12:34:56.000000, 2023-06-28}
+3 {201:1} {202:10} {"field1":120} {"field2":44440000} {"field3":700000000} {"field4":3.100004} {"field5":3.00014159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 700, 300011000, 3333333334, 3.00014, 3.3314159, "hello world", 2023-07-28 01:34:56.000000, 2023-07-27}
+10 {101:1, 102:1, 103:1} {102:10, 104:1, 105:2} {"field1":100, "field0":100} {"field2":3000000} {"field3":300000000} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000, "field000006":2023-07-08 12:34:57.000000, "field2432456":2023-07-28 12:34:50.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+11 {101:1, 102:1, 13:1, 12:1} {102:10, 14:1, 15:2, 12:10} {"field1":100, "fie88ld0":100, "fieweld0":100, "fieeeld1":100, "fieeeld0":100, "feeield0":100, "feeield1":100, "firreld0":100, "field0":100} {"field2":3000000, "abcd":4000000, "1231":3000000} {"fi7eld3":300000000, "field30":300000000, "fielwwd3":300000000, "fi055":300000000, "field7":300000121323} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello", 0:"hello"} {"field6":2023-07-28 12:34:56.000000, " [...]
+
diff --git a/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy b/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy
new file mode 100644
index 0000000000..8ea9f74135
--- /dev/null
+++ b/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_text_complex_type",
"p2,external,hive,external_remote,external_remote_hive") {
+ String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+ String catalog_name = "test_hive_text_complex_type"
+
+ sql """drop catalog if exists ${catalog_name};"""
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+ 'hadoop.username' = 'hadoop',
+ 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """ use multi_catalog """
+
+ qt_sql1 """ select * from hive_text_complex_type order by
column1; """
+
+ qt_sql2 """ select * from hive_text_complex_type_delimiter order by
column1; """
+
+
+ }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]