This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f1db6bd8c1 [feature](hive) add support for struct and map column types on textfile format of hive table (#22347)
f1db6bd8c1 is described below
commit f1db6bd8c1a652bd5171de508ba8c9c41073d605
Author: daidai <[email protected]>
AuthorDate: Thu Aug 10 13:47:58 2023 +0800
[feature](hive) add support for struct and map column types on textfile format of hive table (#22347)
1. Add support for struct and map column types on the textfile format of Hive tables.
2. Optimize the code that handles the array column type.
For example, querying a map<string,string> column now returns:
```mysql
+------+------------------------------------+
| id | perf |
+------+------------------------------------+
| 1 | {"key1":"value1", "key2":"value2"} |
| 1 | {"key1":"value1", "key2":"value2"} |
| 2 | {"name":"John", "age":"30"} |
+------+------------------------------------+
```
And querying a struct column returns:
```mysql
+---------+------------------+
| column1 | column2 |
+---------+------------------+
| 1 | {10, "data1", 1} |
| 2 | {20, "data2", 0} |
| 3 | {30, "data3", 1} |
+---------+------------------+
```
Summary of supported complex types (a custom delimiter can be assigned):
1. array< primitive_type > and array< array< ... > >
2. map< primitive_type , primitive_type >
3. Struct< primitive_type , primitive_type ... >
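To make the delimiter handling concrete, here is a minimal standalone sketch of how a textfile map field is decoded. It is not code from this patch, and `parse_text_map` is a hypothetical helper; it only assumes the defaults this commit wires through: '\2' between collection elements and '\3' between a map key and its value, splitting each entry on the first key/value delimiter, as the BE code does.
```cpp
#include <iostream>
#include <map>
#include <string>

// Split one Hive textfile map field into key/value pairs.
// collection_delim separates entries; kv_delim separates key from value.
std::map<std::string, std::string> parse_text_map(const std::string& field,
                                                  char collection_delim = '\2',
                                                  char kv_delim = '\3') {
    std::map<std::string, std::string> result;
    size_t from = 0;
    for (size_t i = 0; i <= field.size(); i++) {
        if (i == field.size() || field[i] == collection_delim) {
            std::string entry = field.substr(from, i - from);
            // Split on the first kv delimiter, mirroring the 'kv <= from'
            // check in the patch.
            size_t kv = entry.find(kv_delim);
            if (kv != std::string::npos) {
                result[entry.substr(0, kv)] = entry.substr(kv + 1);
            }
            from = i + 1;
        }
    }
    return result;
}

int main() {
    // Encodes {"key1":"value1", "key2":"value2"} as Hive text would.
    std::string field = std::string("key1") + '\3' + "value1" + '\2' +
                        "key2" + '\3' + "value2";
    for (const auto& [k, v] : parse_text_map(field)) {
        std::cout << k << " -> " << v << "\n";
    }
    return 0;
}
```
With those defaults, the map rows in the first example above would be stored on disk as `key1\3value1\2key2\3value2`.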
---
be/src/exec/text_converter.cpp | 288 +++++++--------------
be/src/exec/text_converter.h | 18 +-
be/src/vec/exec/format/csv/csv_reader.cpp | 16 +-
be/src/vec/exec/format/csv/csv_reader.h | 6 +-
.../doris/planner/external/HiveScanNode.java | 68 ++++-
gensrc/thrift/PlanNodes.thrift | 3 +-
.../hive/test_hive_text_complex_type.out | 15 ++
.../hive/test_hive_text_complex_type.groovy | 46 ++++
8 files changed, 248 insertions(+), 212 deletions(-)
diff --git a/be/src/exec/text_converter.cpp b/be/src/exec/text_converter.cpp
index 5cfa078fb8..1346b14a7b 100644
--- a/be/src/exec/text_converter.cpp
+++ b/be/src/exec/text_converter.cpp
@@ -35,16 +35,20 @@
#include "util/string_parser.hpp"
#include "vec/columns/column_array.h"
#include "vec/columns/column_complex.h"
+#include "vec/columns/column_map.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
#include "vec/runtime/vdatetime_value.h"
namespace doris {
-TextConverter::TextConverter(char escape_char, char array_delimiter)
- : _escape_char(escape_char), _array_delimiter(array_delimiter) {}
+TextConverter::TextConverter(char escape_char, char collection_delimiter, char map_kv_delimiter)
+ : _escape_char(escape_char),
+ _collection_delimiter(collection_delimiter),
+ _map_kv_delimiter(map_kv_delimiter) {}
void TextConverter::write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
@@ -62,12 +66,15 @@ void TextConverter::write_string_column(const SlotDescriptor* slot_desc,
}
}
-bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
- vectorized::IColumn* nullable_col_ptr, const char* data,
- size_t len, bool copy_string, bool need_escape, size_t rows) {
+bool TextConverter::_write_data(const TypeDescriptor& type_desc,
+ vectorized::IColumn* nullable_col_ptr, const char* data, size_t len,
+ bool copy_string, bool need_escape, size_t rows,
+ char array_delimiter) {
vectorized::IColumn* col_ptr = nullable_col_ptr;
// \N means it's NULL
- if (slot_desc->is_nullable()) {
+ std::string col_type_name = col_ptr->get_name();
+ bool is_null_able = typeid(*nullable_col_ptr) == typeid(vectorized::ColumnNullable);
+ if (is_null_able) {
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr);
if ((len == 2 && data[0] == '\\' && data[1] == 'N') || len == SQL_NULL_DATA) {
nullable_column->insert_many_defaults(rows);
@@ -82,7 +89,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
size_t origin_size = col_ptr->size();
// Parse the raw-text data. Translate the text string to internal format.
- switch (slot_desc->type().type) {
+ switch (type_desc.type) {
case TYPE_HLL: {
HyperLogLog hyper_log_log(Slice(data, len));
auto& hyper_data = reinterpret_cast<vectorized::ColumnHLL*>(col_ptr)->get_data();
@@ -244,7 +251,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
case TYPE_DECIMAL32: {
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
int32_t value = StringParser::string_to_decimal<TYPE_DECIMAL32, int32_t>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -257,7 +264,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
case TYPE_DECIMAL64: {
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
int64_t value = StringParser::string_to_decimal<TYPE_DECIMAL64, int64_t>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -271,7 +278,7 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
vectorized::Int128 value =
StringParser::string_to_decimal<TYPE_DECIMAL128I, vectorized::Int128>(
- data, len, slot_desc->type().precision, slot_desc->type().scale, &result);
+ data, len, type_desc.precision, type_desc.scale, &result);
if (result != StringParser::PARSE_SUCCESS) {
parse_result = StringParser::PARSE_FAILURE;
break;
@@ -282,200 +289,94 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
break;
}
case TYPE_ARRAY: {
- std::function<vectorized::Array(int, int, char, const TypeDescriptor&)> func =
- [&](int left, int right, char split,
- const TypeDescriptor& type) -> vectorized::Array {
- vectorized::Array array;
- int fr = left;
- for (int i = left; i <= right + 1; i++) {
- auto Sub_type = type.children[0];
- if (i <= right && data[i] != split && data[i] != _array_delimiter) {
- continue;
- }
- if (Sub_type.type == TYPE_ARRAY) {
- array.push_back(func(fr, i - 1, split + 1, Sub_type));
- } else {
- StringParser::ParseResult local_parse_result = StringParser::PARSE_SUCCESS;
- switch (Sub_type.type) {
- case TYPE_HLL: {
- DCHECK(false) << "not support type: "
- << "array<HyperLogLog>\n";
- break;
- }
- case TYPE_STRING:
- case TYPE_VARCHAR:
- case TYPE_CHAR: {
- size_t sz = i - fr;
- if (need_escape) {
- unescape_string_on_spot(data + fr, &sz);
- }
- array.push_back(std::string(data + fr, sz));
- break;
- }
- case TYPE_BOOLEAN: {
- bool num = StringParser::string_to_bool(data + fr, i - fr,
- &local_parse_result);
- array.push_back((uint8_t)num);
- break;
- }
- case TYPE_TINYINT: {
- int8_t num = StringParser::string_to_int<int8_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_SMALLINT: {
- int16_t num = StringParser::string_to_int<int16_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_INT: {
- int32_t num = StringParser::string_to_int<int32_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_BIGINT: {
- int64_t num = StringParser::string_to_int<int64_t>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_LARGEINT: {
- __int128 num = StringParser::string_to_int<__int128>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_FLOAT: {
- float num = StringParser::string_to_float<float>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_DOUBLE: {
- double num = StringParser::string_to_float<double>(data + fr, i - fr,
- &local_parse_result);
- array.push_back(num);
- break;
- }
- case TYPE_DATE: {
- vectorized::VecDateTimeValue ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- ts_slot.cast_to_date();
- array.push_back(*reinterpret_cast<int64_t*>(&ts_slot));
- break;
- }
- case TYPE_DATEV2: {
- vectorized::DateV2Value<vectorized::DateV2ValueType> ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- uint32_t int_val = ts_slot.to_date_int_val();
- array.push_back(int_val);
- break;
- }
- case TYPE_DATETIME: {
- vectorized::VecDateTimeValue ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- ts_slot.to_datetime();
- array.push_back((int64_t)ts_slot);
- break;
- }
- case TYPE_DATETIMEV2: {
- vectorized::DateV2Value<vectorized::DateTimeV2ValueType> ts_slot;
- if (!ts_slot.from_date_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- uint64_t int_val = ts_slot.to_date_int_val();
- array.push_back(int_val);
- break;
- }
- case TYPE_DECIMALV2: {
- DecimalV2Value decimal_slot;
- if (decimal_slot.parse_from_str(data + fr, i - fr)) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(decimal_slot.value());
- break;
- }
- case TYPE_DECIMAL32: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- int32_t value = StringParser::string_to_decimal<TYPE_DECIMAL32, int32_t>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale, &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- case TYPE_DECIMAL64: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- int64_t value = StringParser::string_to_decimal<TYPE_DECIMAL64, int64_t>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale, &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- case TYPE_DECIMAL128I: {
- StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
- vectorized::Int128 value =
- StringParser::string_to_decimal<TYPE_DECIMAL128I,
- vectorized::Int128>(
- data + fr, i - fr, Sub_type.precision, Sub_type.scale,
- &result);
- if (result != StringParser::PARSE_SUCCESS) {
- local_parse_result = StringParser::PARSE_FAILURE;
- break;
- }
- array.push_back(value);
- break;
- }
- default: {
- DCHECK(false) << "bad slot type: array<" << Sub_type << ">";
- break;
- }
- }
+ auto col = reinterpret_cast<vectorized::ColumnArray*>(col_ptr);
+
+ std::vector<std::pair<size_t, size_t>> ranges;
+ for (size_t i = 0, from = 0; i <= len; i++) {
+ if (i < len && data[i] != array_delimiter && data[i] != _collection_delimiter) {
+ continue;
+ }
+ ranges.push_back({from, i - from});
+ from = i + 1;
+ }
+
+ auto sub_type = type_desc.children[0];
+ for (int i = 0; i < rows; i++) {
+ for (auto range : ranges) {
+ _write_data(sub_type, &col->get_data(), data + range.first, range.second,
+ copy_string, need_escape, 1, array_delimiter + 1);
+ }
+ col->get_offsets().push_back(col->get_offsets().back() + ranges.size());
+ }
+
+ break;
+ }
+ case TYPE_MAP: {
+ auto col = reinterpret_cast<vectorized::ColumnMap*>(col_ptr);
- if (local_parse_result != StringParser::PARSE_SUCCESS) {
- parse_result = local_parse_result;
- return array;
- }
- }
- fr = i + 1;
+ std::vector<std::array<size_t, 3>> ranges;
+ for (size_t i = 0, from = 0, kv = 0; i <= len; i++) {
+ /*
+ * In Hive, if you set ':' as the map key/value delimiter, a
+ * map<int,timestamp> column is read back correctly, but
+ * map<timestamp,int> and map<timestamp,timestamp> columns are not,
+ * because those fields contain '_map_kv_delimiter' more than once.
+ *
+ * So 'kv <= from' is used to take the first occurrence of _map_kv_delimiter.
+ */
+ if (i < len && data[i] == _map_kv_delimiter && kv <= from) {
+ kv = i;
+ continue;
+ }
+ if (i == len || data[i] == _collection_delimiter) {
+ ranges.push_back({from, kv, i - 1});
+ from = i + 1;
}
- return array;
- };
+ }
- auto array = func(0, len - 1, '\002', slot_desc->type());
+ auto key_type = type_desc.children[0];
+ auto value_type = type_desc.children[1];
for (int i = 0; i < rows; i++) {
- reinterpret_cast<vectorized::ColumnArray*>(col_ptr)->insert(array);
+ for (auto range : ranges) {
+ _write_data(key_type, &col->get_keys(), data + range[0], range[1] - range[0],
+ copy_string, need_escape, 1, array_delimiter + 1);
+
+ _write_data(value_type, &col->get_values(), data + range[1] + 1,
+ range[2] - range[1], copy_string, need_escape, 1, array_delimiter + 1);
+ }
+
+ col->get_offsets().push_back(col->get_offsets().back() + ranges.size());
}
break;
}
+ case TYPE_STRUCT: {
+ auto col = reinterpret_cast<vectorized::ColumnStruct*>(col_ptr);
+
+ std::vector<std::pair<size_t, size_t>> ranges;
+ for (size_t i = 0, from = 0; i <= len; i++) {
+ if (i == len || data[i] == _collection_delimiter) {
+ ranges.push_back({from, i - from});
+ from = i + 1;
+ }
+ }
+ for (int i = 0; i < rows; i++) {
+ for (size_t loc = 0; loc < col->get_columns().size(); loc++) {
+ _write_data(type_desc.children[loc], &col->get_column(loc),
+ data + ranges[loc].first, ranges[loc].second, copy_string, need_escape,
+ rows, array_delimiter + 1);
+ }
+ }
+ break;
+ }
default:
- DCHECK(false) << "bad slot type: " << slot_desc->type();
+ DCHECK(false) << "bad slot type: " << type_desc;
break;
}
if (UNLIKELY(parse_result == StringParser::PARSE_FAILURE)) {
- if (true == slot_desc->is_nullable()) {
+ if (is_null_able) {
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(nullable_col_ptr);
size_t size = nullable_column->get_null_map_data().size();
doris::vectorized::NullMap& null_map_data = nullable_column->get_null_map_data();
@@ -489,6 +390,13 @@ bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
return true;
}
+bool TextConverter::write_vec_column(const SlotDescriptor* slot_desc,
+ vectorized::IColumn* nullable_col_ptr, const char* data,
+ size_t len, bool copy_string, bool need_escape, size_t rows) {
+ return _write_data(slot_desc->type(), nullable_col_ptr, data, len, copy_string, need_escape,
+ rows, '\2');
+}
+
void TextConverter::unescape_string_on_spot(const char* src, size_t* len) {
const char* start = src;
char* dest_ptr = const_cast<char*>(src);
diff --git a/be/src/exec/text_converter.h b/be/src/exec/text_converter.h
index 083c7c6881..ef4e87f5a5 100644
--- a/be/src/exec/text_converter.h
+++ b/be/src/exec/text_converter.h
@@ -31,7 +31,7 @@ class TextConverter {
public:
static constexpr char NULL_STR[3] = {'\\', 'N', '\0'};
- TextConverter(char escape_char, char array_delimiter = '\2');
+ TextConverter(char escape_char, char collection_delimiter = '\2', char map_kv_delimiter = '\3');
void write_string_column(const SlotDescriptor* slot_desc,
vectorized::MutableColumnPtr* column_ptr, const char* data,
@@ -57,11 +57,23 @@ public:
size_t rows);
void unescape_string_on_spot(const char* src, size_t* len);
- void set_array_delimiter(char array_delimiter) { _array_delimiter = array_delimiter; }
+ void set_collection_delimiter(char collection_delimiter) {
+ _collection_delimiter = collection_delimiter;
+ }
+ void set_map_kv_delimiter(char mapkv_delimiter) { _map_kv_delimiter = mapkv_delimiter; }
private:
+ bool _write_data(const TypeDescriptor& type_desc, vectorized::IColumn* nullable_col_ptr,
+ const char* data, size_t len, bool copy_string, bool need_escape, size_t rows,
+ char array_delimiter);
+ char array_delimiter);
+
char _escape_char;
- char _array_delimiter;
+
+ // struct, array and map delimiter
+ char _collection_delimiter;
+
+ // map key and value delimiter
+ char _map_kv_delimiter;
};
} // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 59bae18dcc..93cc148248 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -200,9 +200,11 @@ Status CsvReader::init_reader(bool is_load) {
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
- //get array delimiter
- _array_delimiter = _params.file_attributes.text_params.array_delimiter;
- _text_converter->set_array_delimiter(_array_delimiter[0]);
+ _collection_delimiter = _params.file_attributes.text_params.collection_delimiter;
+ _text_converter->set_collection_delimiter(_collection_delimiter[0]);
+
+ _map_kv_delimiter = _params.file_attributes.text_params.mapkv_delimiter;
+ _text_converter->set_map_kv_delimiter(_map_kv_delimiter[0]);
if (_params.file_attributes.__isset.trim_double_quotes) {
_trim_double_quotes = _params.file_attributes.trim_double_quotes;
@@ -693,9 +695,11 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* is_parse_name) {
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
- //get array delimiter
- _array_delimiter = _params.file_attributes.text_params.array_delimiter;
- _text_converter->set_array_delimiter(_array_delimiter[0]);
+ _collection_delimiter = _params.file_attributes.text_params.collection_delimiter;
+ _text_converter->set_collection_delimiter(_collection_delimiter[0]);
+
+ _map_kv_delimiter = _params.file_attributes.text_params.mapkv_delimiter;
+ _text_converter->set_map_kv_delimiter(_map_kv_delimiter[0]);
// create decompressor.
// _decompressor may be nullptr if this is not a compressed file
diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h
index 42178846f1..a1577a638e 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -144,7 +144,11 @@ private:
std::string _value_separator;
std::string _line_delimiter;
- std::string _array_delimiter;
+
+ // struct, array and map delimiter
+ std::string _collection_delimiter;
+ // map key and value delimiter
+ std::string _map_kv_delimiter;
int _value_separator_length;
int _line_delimiter_length;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
index 211f6e8056..61a571358a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
@@ -20,11 +20,15 @@ package org.apache.doris.planner.external;
import org.apache.doris.analysis.FunctionCallExpr;
import org.apache.doris.analysis.SlotDescriptor;
import org.apache.doris.analysis.TupleDescriptor;
+import org.apache.doris.catalog.ArrayType;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ListPartitionItem;
+import org.apache.doris.catalog.MapType;
import org.apache.doris.catalog.PartitionItem;
+import org.apache.doris.catalog.StructField;
+import org.apache.doris.catalog.StructType;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.catalog.Type;
import org.apache.doris.catalog.external.HMSExternalTable;
@@ -71,9 +75,12 @@ public class HiveScanNode extends FileQueryScanNode {
public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";
- public static final String PROP_ARRAY_DELIMITER_HIVE2 = "colelction.delim";
- public static final String PROP_ARRAY_DELIMITER_HIVE3 = "collection.delim";
- public static final String DEFAULT_ARRAY_DELIMITER = "\2";
+ public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
+ public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
+ public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
+
+ public static final String PROP_MAP_KV_DELIMITER = "mapkey.delim";
+ public static final String DEFAULT_MAP_KV_DELIMITER = "\003";
protected final HMSExternalTable hmsTable;
private HiveTransaction hiveTransaction = null;
@@ -104,10 +111,46 @@ public class HiveScanNode extends FileQueryScanNode {
String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat();
if (inputFormat.contains("TextInputFormat")) {
for (SlotDescriptor slot : desc.getSlots()) {
- if (slot.getType().isMapType() || slot.getType().isStructType()) {
+ if (slot.getType().isScalarType()) {
+ continue;
+ }
+ boolean supported = true;
+
+ // support Array<primitive_type> and array<array<...>>
+ if (slot.getType().isArrayType()) {
+ ArrayType arraySubType = (ArrayType) slot.getType();
+ while (true) {
+ if (arraySubType.getItemType().isArrayType()) {
+ arraySubType = (ArrayType) arraySubType.getItemType();
+ continue;
+ }
+ if (!arraySubType.getItemType().isScalarType()) {
+ supported = false;
+ }
+ break;
+ }
+ } else if (slot.getType().isMapType()) { // support map<primitive_type,primitive_type>
+ if (!((MapType) slot.getType()).getValueType().isScalarType()) {
+ supported = false;
+ }
+ } else if (slot.getType().isStructType()) { // support Struct<primitive_type,primitive_type ... >
+ StructType structSubType = (StructType) slot.getType();
+ structSubType.getColumnSize();
+ for (StructField f : structSubType.getFields()) {
+ if (!f.getType().isScalarType()) {
+ supported = false;
+ }
+ }
+ }
+
+ if (supported == false) {
throw new UserException("For column `" + slot.getColumn().getName()
- + "`, The column types MAP/STRUCT are not supported yet"
- + " for text input format of Hive. ");
+ + "`, the column type is not supported yet"
+ + " for text input format of Hive.\n"
+ + "For complex types, now supported:\n"
+ + "\t1. array< primitive_type > and array< array< ... > >\n"
+ + "\t2. map< primitive_type , primitive_type >\n"
+ + "\t3. Struct< primitive_type , primitive_type ... >\n");
}
}
}
@@ -281,12 +324,15 @@ public class HiveScanNode extends FileQueryScanNode {
java.util.Map<String, String> delimiter = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters();
textParams.setColumnSeparator(delimiter.getOrDefault(PROP_FIELD_DELIMITER, DEFAULT_FIELD_DELIMITER));
textParams.setLineDelimiter(delimiter.getOrDefault(PROP_LINE_DELIMITER, DEFAULT_LINE_DELIMITER));
- if (delimiter.get(PROP_ARRAY_DELIMITER_HIVE2) != null) {
- textParams.setArrayDelimiter(delimiter.get(PROP_ARRAY_DELIMITER_HIVE2));
- } else if (delimiter.get(PROP_ARRAY_DELIMITER_HIVE3) != null) {
- textParams.setArrayDelimiter(delimiter.get(PROP_ARRAY_DELIMITER_HIVE3));
+ textParams.setMapkvDelimiter(delimiter.getOrDefault(PROP_MAP_KV_DELIMITER, DEFAULT_MAP_KV_DELIMITER));
+
+ // textParams.collection_delimiter is the map, array and struct delimiter
+ if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2) != null) {
+ textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2));
+ } else if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3) != null) {
+ textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3));
} else {
- textParams.setArrayDelimiter(DEFAULT_ARRAY_DELIMITER);
+ textParams.setCollectionDelimiter(DEFAULT_COLLECTION_DELIMITER);
}
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index b0c2ae5b96..a36368ef7a 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -243,7 +243,8 @@ struct TEsScanRange {
struct TFileTextScanRangeParams {
1: optional string column_separator;
2: optional string line_delimiter;
- 3: optional string array_delimiter;
+ 3: optional string collection_delimiter; // array, map and struct delimiter
+ 4: optional string mapkv_delimiter;
}
struct TFileScanSlotInfo {
diff --git a/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out b/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out
new file mode 100644
index 0000000000..a04b9c1def
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_hive_text_complex_type.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+1 {101:1} {102:10} {"field1":100} {"field2":2000000} {"field3":300000000} {"field4":3.14} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+2 {201:1} {202:11} {"field1":200} {"field2":9000000} {"field3":8000000000} {"field4":9.13321} {"field5":322.14159} {203:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 201, 300011000, 44444444444, 3.14, 3.14159, "world", 2023-07-28 12:34:56.000000, 2023-06-28}
+3 {201:1} {202:10} {"field1":120} {"field2":44440000} {"field3":700000000} {"field4":3.100004} {"field5":3.00014159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 700, 300011000, 3333333334, 3.00014, 3.3314159, "hello world", 2023-07-28 01:34:56.000000, 2023-07-27}
+10 {101:1, 102:1, 103:1} {102:10, 104:1, 105:2} {"field1":100, "field0":100} {"field2":3000000} {"field3":300000000} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000, "field000006":2023-07-08 12:34:57.000000, "field2432456":2023-07-28 12:34:50.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+11 {101:1, 102:1, 13:1, 12:1} {102:10, 14:1, 15:2, 12:10} {"field1":100, "fie88ld0":100, "fieweld0":100, "fieeeld1":100, "fieeeld0":100, "feeield0":100, "feeield1":100, "firreld0":100, "field0":100} {"field2":3000000, "abcd":4000000, "1231":3000000} {"fi7eld3":300000000, "field30":300000000, "fielwwd3":300000000, "fi055":300000000, "field7":300000121323} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello", 0:"hello"} {"field6":2023-07-28 12:34:56.000000, " [...]
+
+-- !sql2 --
+1 {101:1} {102:10} {"field1":100} {"field2":2000000} {"field3":300000000} {"field4":3.14} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+2 {201:1} {202:11} {"field1":200} {"field2":9000000} {"field3":8000000000} {"field4":9.13321} {"field5":322.14159} {203:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 201, 300011000, 44444444444, 3.14, 3.14159, "world", 2023-07-28 12:34:56.000000, 2023-06-28}
+3 {201:1} {202:10} {"field1":120} {"field2":44440000} {"field3":700000000} {"field4":3.100004} {"field5":3.00014159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000} {"field7":2023-07-28} {1, 1, 700, 300011000, 3333333334, 3.00014, 3.3314159, "hello world", 2023-07-28 01:34:56.000000, 2023-07-27}
+10 {101:1, 102:1, 103:1} {102:10, 104:1, 105:2} {"field1":100, "field0":100} {"field2":3000000} {"field3":300000000} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello"} {"field6":2023-07-28 12:34:56.000000, "field000006":2023-07-08 12:34:57.000000, "field2432456":2023-07-28 12:34:50.000000} {"field7":2023-07-28} {1, 1, 20, 3000000, 44444444444, 3.14, 3.14159, "Hello", 2023-07-28 12:34:56.000000, 2023-07-28}
+11 {101:1, 102:1, 13:1, 12:1} {102:10, 14:1, 15:2, 12:10} {"field1":100, "fie88ld0":100, "fieweld0":100, "fieeeld1":100, "fieeeld0":100, "feeield0":100, "feeield1":100, "firreld0":100, "field0":100} {"field2":3000000, "abcd":4000000, "1231":3000000} {"fi7eld3":300000000, "field30":300000000, "fielwwd3":300000000, "fi055":300000000, "field7":300000121323} {"field4":3.14, "hello world":0.111, "hell0":7.001} {"field5":3.14159} {103:"Hello", 0:"hello"} {"field6":2023-07-28 12:34:56.000000, " [...]
+
diff --git a/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy b/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy
new file mode 100644
index 0000000000..8ea9f74135
--- /dev/null
+++ b/regression-test/suites/external_table_p2/hive/test_hive_text_complex_type.groovy
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_text_complex_type",
"p2,external,hive,external_remote,external_remote_hive") {
+ String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+ String catalog_name = "test_hive_text_complex_type"
+
+ sql """drop catalog if exists ${catalog_name};"""
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+ 'hadoop.username' = 'hadoop',
+ 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """ use multi_catalog """
+
+ qt_sql1 """ select * from hive_text_complex_type order by
column1; """
+
+ qt_sql2 """ select * from hive_text_complex_type_delimiter order by
column1; """
+
+
+ }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]