This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 8efb98e8518 [feature](hive)support hive catalog read json table. (#43469) (#44848)
8efb98e8518 is described below
commit 8efb98e8518aa34692314ba6a382852060c99116
Author: daidai <[email protected]>
AuthorDate: Thu Dec 5 00:18:23 2024 +0800
[feature](hive)support hive catalog read json table. (#43469) (#44848)
bp #43469
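
Note: a minimal illustration of what this enables (the Hive-side DDL mirrors
run71.hql added below; the Doris-side query is a sketch and assumes a hive
catalog named `hive` has already been created):

    -- Hive side: a JSON-formatted table using the hcatalog JsonSerDe
    CREATE TABLE json_load_data_table (
      `id` int,
      `col1` int,
      `col2` struct<col2a:int, col2b:string>,
      `col3` map<int,string>
    ) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';

    -- Doris side: the table can now be read through the hive catalog
    SELECT * FROM hive.`default`.json_load_data_table;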
---
.../vec/data_types/serde/data_type_array_serde.h | 2 +
be/src/vec/data_types/serde/data_type_map_serde.h | 4 +
.../data_types/serde/data_type_nullable_serde.h | 2 +
be/src/vec/data_types/serde/data_type_serde.h | 12 +-
.../vec/data_types/serde/data_type_struct_serde.h | 2 +
be/src/vec/exec/format/json/new_json_reader.cpp | 555 ++++++++++++++++-----
be/src/vec/exec/format/json/new_json_reader.h | 32 +-
be/src/vec/exec/scan/vfile_scanner.cpp | 4 +-
.../scripts/create_preinstalled_scripts/run69.hql | 35 ++
.../scripts/create_preinstalled_scripts/run70.hql | 73 +++
.../scripts/create_preinstalled_scripts/run71.hql | 13 +
.../json/json_all_complex_types/dt=dt1/000000_0 | 3 +
.../json/json_all_complex_types/dt=dt2/000000_0 | 1 +
.../json/json_all_complex_types/dt=dt3/000000_0 | 2 +
.../preinstalled_data/json/json_load_data_table/1 | 13 +
.../json/json_nested_complex_table/1 | 2 +
.../json/json_nested_complex_table/2 | 1 +
.../json/json_nested_complex_table/modify_2 | 2 +
.../datasource/hive/HiveMetaStoreClientHelper.java | 3 +
.../doris/datasource/hive/source/HiveScanNode.java | 41 +-
.../hive/hive_json_basic_test.out | 115 +++++
.../hive/hive_json_basic_test.groovy | 71 +++
22 files changed, 830 insertions(+), 158 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.h b/be/src/vec/data_types/serde/data_type_array_serde.h
index 13c40e60777..2798596c823 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.h
+++ b/be/src/vec/data_types/serde/data_type_array_serde.h
@@ -101,6 +101,8 @@ public:
         nested_serde->set_return_object_as_string(value);
     }
 
+    virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; }
+
 private:
     template <bool is_binary_format>
     Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.h b/be/src/vec/data_types/serde/data_type_map_serde.h
index 5e10a7ec3f2..d9572682470 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.h
+++ b/be/src/vec/data_types/serde/data_type_map_serde.h
@@ -95,6 +95,10 @@ public:
         value_serde->set_return_object_as_string(value);
     }
 
+    virtual DataTypeSerDeSPtrs get_nested_serdes() const override {
+        return {key_serde, value_serde};
+    }
+
 private:
     template <bool is_binary_format>
     Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h b/be/src/vec/data_types/serde/data_type_nullable_serde.h
index e9af344fb65..c7dac856621 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h
@@ -99,6 +99,8 @@ public:
                                    int row_num) const override;
     Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const override;
 
+    virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return {nested_serde}; }
+
 private:
     template <bool is_binary_format>
     Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,
diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h
index 46236faa926..6caa51d2663 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -98,6 +98,10 @@ class IColumn;
 class Arena;
 class IDataType;
 
+class DataTypeSerDe;
+using DataTypeSerDeSPtr = std::shared_ptr<DataTypeSerDe>;
+using DataTypeSerDeSPtrs = std::vector<DataTypeSerDeSPtr>;
+
 // Deserialize means read from different file format or memory format,
 // for example read from arrow, read from parquet.
 // Serialize means write the column cell or the total column into another
@@ -332,6 +336,11 @@ public:
                                            Arena& mem_pool, int row_num) const;
     virtual Status read_one_cell_from_json(IColumn& column, const rapidjson::Value& result) const;
 
+    virtual DataTypeSerDeSPtrs get_nested_serdes() const {
+        throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
+                               "Method get_nested_serdes is not supported for this serde");
+    }
+
 protected:
     bool _return_object_as_string = false;
     // This parameter indicates what level the serde belongs to and is mainly used for complex types
@@ -374,9 +383,6 @@ inline void checkArrowStatus(const arrow::Status& status, const std::string& col
 }
 }
 
-using DataTypeSerDeSPtr = std::shared_ptr<DataTypeSerDe>;
-using DataTypeSerDeSPtrs = std::vector<DataTypeSerDeSPtr>;
-
 DataTypeSerDeSPtrs create_data_type_serdes(
         const std::vector<std::shared_ptr<const IDataType>>& types);
 DataTypeSerDeSPtrs create_data_type_serdes(const std::vector<SlotDescriptor*>& slots);
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.h b/be/src/vec/data_types/serde/data_type_struct_serde.h
index 84e988e150b..5cd6f89e42f 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.h
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.h
@@ -171,6 +171,8 @@ public:
         }
     }
 
+    virtual DataTypeSerDeSPtrs get_nested_serdes() const override { return elem_serdes_ptrs; }
+
 private:
     std::optional<size_t> try_get_position_by_name(const String& name) const;
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index e3c2c1f332e..307edc265be 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -54,8 +54,11 @@
#include "util/slice.h"
#include "util/uid_util.h"
#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
#include "vec/common/assert_cast.h"
#include "vec/common/typeid_cast.h"
#include "vec/core/block.h"
@@ -164,10 +167,18 @@ void NewJsonReader::_init_file_description() {
}
Status NewJsonReader::init_reader(
-        const std::unordered_map<std::string, VExprContextSPtr>& col_default_value_ctx) {
+        const std::unordered_map<std::string, VExprContextSPtr>& col_default_value_ctx,
+        bool is_load) {
+    _is_load = is_load;
+
     // generate _col_default_value_map
     RETURN_IF_ERROR(_get_column_default_value(_file_slot_descs, col_default_value_ctx));
+    // Use serde to insert data into the columns.
+    for (auto* slot_desc : _file_slot_descs) {
+        _serdes.emplace_back(slot_desc->get_data_type_ptr()->get_serde());
+    }
+
// create decompressor.
// _decompressor may be nullptr if this is not a compressed file
     RETURN_IF_ERROR(Decompressor::create_decompressor(_file_compress_type, &_decompressor));
@@ -387,6 +398,9 @@ Status NewJsonReader::_get_range_params() {
if (_params.file_attributes.__isset.fuzzy_parse) {
_fuzzy_parse = _params.file_attributes.fuzzy_parse;
}
+ if (_range.table_format_params.table_format_type == "hive") {
+ _is_hive_table = true;
+ }
return Status::OK();
}
@@ -474,8 +488,8 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block
bool valid = false;
if (_next_row >= _total_rows) { // parse json and generic document
Status st = _parse_json(is_empty_row, eof);
-        if (st.is<DATA_QUALITY_ERROR>()) {
-            continue; // continue to read next
+        if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
+            continue; // continue to read next (for load, the error has already been appended to the error file)
}
RETURN_IF_ERROR(st);
if (*is_empty_row) {
@@ -752,7 +766,21 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
     int ctx_idx = 0;
     bool has_valid_value = false;
-    for (auto* slot_desc : slot_descs) {
+
+    if (_is_hive_table) {
+        // Unlike _fuzzy_parse, the name_map must be rebuilt for every line that is read in.
+
+        for (int i = 0; i < objectValue.MemberCount(); ++i) {
+            auto it = objectValue.MemberBegin() + i;
+            std::string field_name(it->name.GetString(), it->name.GetStringLength());
+            std::transform(field_name.begin(), field_name.end(), field_name.begin(), ::tolower);
+
+            // Use the last value with the same name.
+            _name_map.emplace(field_name, i);
+        }
+    }
+ }
+ for (size_t slot_idx = 0; slot_idx < slot_descs.size(); ++slot_idx) {
+ auto* slot_desc = slot_descs[slot_idx];
if (!slot_desc->is_materialized()) {
continue;
}
@@ -761,7 +789,7 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
         auto* column_ptr = block.get_by_position(dest_index).column->assume_mutable().get();
         rapidjson::Value::ConstMemberIterator it = objectValue.MemberEnd();
-        if (_fuzzy_parse) {
+        if (_fuzzy_parse || _is_hive_table) {
             auto idx_it = _name_map.find(slot_desc->col_name());
             if (idx_it != _name_map.end() && idx_it->second < objectValue.MemberCount()) {
                 it = objectValue.MemberBegin() + idx_it->second;
@@ -773,20 +801,21 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
         if (it != objectValue.MemberEnd()) {
             const rapidjson::Value& value = it->value;
-            RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc->type(), column_ptr,
+                                                  slot_desc->col_name(), _serdes[slot_idx], valid));
if (!(*valid)) {
return Status::OK();
}
has_valid_value = true;
} else {
// not found, filling with default value
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[slot_idx], column_ptr, valid));
if (!(*valid)) {
return Status::OK();
}
}
}
-    if (!has_valid_value) {
+    if (!has_valid_value && _is_load) {
         // there is no valid value in json line but has filled with default value before
         // so remove this line in block
         string col_names;
@@ -810,79 +839,188 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
}
 Status NewJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value,
-                                            SlotDescriptor* slot_desc, IColumn* column_ptr,
+                                            const TypeDescriptor& type_desc,
+                                            vectorized::IColumn* column_ptr,
+                                            const std::string& column_name, DataTypeSerDeSPtr serde,
                                             bool* valid) {
-    const char* str_value = nullptr;
-    char tmp_buf[128] = {0};
-    int32_t wbytes = 0;
-    std::string json_str;
-
     ColumnNullable* nullable_column = nullptr;
-    if (slot_desc->is_nullable()) {
+    vectorized::IColumn* data_column_ptr = column_ptr;
+    DataTypeSerDeSPtr data_serde = serde;
+
+    bool value_is_null = (value == nullptr) || (value->GetType() == rapidjson::Type::kNullType);
+
+    if (column_ptr->is_nullable()) {
         nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
-        // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType.
-        if (value->GetType() != rapidjson::Type::kNullType) {
+        data_column_ptr = nullable_column->get_nested_column().get_ptr();
+        data_serde = serde->get_nested_serdes()[0];
+
+        if (value_is_null) {
+            nullable_column->insert_default();
+            *valid = true;
+            return Status::OK();
+        } else {
             nullable_column->get_null_map_data().push_back(0);
+        }
+
+    } else if (value_is_null) [[unlikely]] {
+        if (_is_load) {
+            RETURN_IF_ERROR(_append_error_msg(
+                    *value, "Json value is null, but the column `{}` is not nullable.", column_name,
+                    valid));
+            return Status::OK();
+
         } else {
-            nullable_column->insert_default();
+            return Status::DataQualityError(
+                    "Json value is null, but the column `{}` is not nullable.", column_name);
         }
-        column_ptr = &nullable_column->get_nested_column();
     }
-    switch (value->GetType()) {
-    case rapidjson::Type::kStringType:
-        str_value = value->GetString();
-        wbytes = value->GetStringLength();
-        break;
-    case rapidjson::Type::kNumberType:
-        if (value->IsUint()) {
-            wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint());
-        } else if (value->IsInt()) {
-            wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt());
-        } else if (value->IsUint64()) {
-            wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64());
-        } else if (value->IsInt64()) {
-            wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64());
-        } else if (value->IsFloat() || value->IsDouble()) {
-            auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble());
-            wbytes = end - tmp_buf;
+    if (_is_load || !type_desc.is_complex_type()) {
+        if (value->IsString()) {
+            Slice slice {value->GetString(), value->GetStringLength()};
+            RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice,
+                                                                       _serde_options));
+
         } else {
-            return Status::InternalError<false>("It should not here.");
+            // We could `switch (value->GetType())` and handle kNumberType here.
+            // Note that the value may satisfy `value->IsInt()` while the column is a FloatColumn.
+            // Alternatively, for any type, use `NewJsonReader::_print_json_value(*value)`.
+
+            const char* str_value = nullptr;
+            char tmp_buf[128] = {0};
+            size_t wbytes = 0;
+            std::string json_str;
+
+            switch (value->GetType()) {
+            case rapidjson::Type::kStringType:
+                str_value = value->GetString();
+                wbytes = value->GetStringLength();
+                break;
+            case rapidjson::Type::kNumberType:
+                if (value->IsUint()) {
+                    wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint());
+                } else if (value->IsInt()) {
+                    wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt());
+                } else if (value->IsUint64()) {
+                    wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64());
+                } else if (value->IsInt64()) {
+                    wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64());
+                } else if (value->IsFloat() || value->IsDouble()) {
+                    auto* end = fmt::format_to(tmp_buf, "{}", value->GetDouble());
+                    wbytes = end - tmp_buf;
+                } else {
+                    return Status::InternalError<false>("It should not reach here.");
+                }
+                str_value = tmp_buf;
+                break;
+            case rapidjson::Type::kFalseType:
+                wbytes = 1;
+                str_value = (char*)"0";
+                break;
+            case rapidjson::Type::kTrueType:
+                wbytes = 1;
+                str_value = (char*)"1";
+                break;
+            default:
+                // for other type, we convert it to string to save
+                json_str = NewJsonReader::_print_json_value(*value);
+                wbytes = json_str.size();
+                str_value = json_str.c_str();
+                break;
+            }
+            Slice slice {str_value, wbytes};
+            RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice,
+                                                                       _serde_options));
         }
}
-        str_value = tmp_buf;
-        break;
-    case rapidjson::Type::kFalseType:
-        wbytes = 1;
-        str_value = (char*)"0";
-        break;
-    case rapidjson::Type::kTrueType:
-        wbytes = 1;
-        str_value = (char*)"1";
-        break;
-    case rapidjson::Type::kNullType:
-        if (!slot_desc->is_nullable()) {
-            RETURN_IF_ERROR(_append_error_msg(
-                    *value, "Json value is null, but the column `{}` is not nullable.",
-                    slot_desc->col_name(), valid));
-            return Status::OK();
+    } else if (type_desc.type == TYPE_STRUCT) {
+        if (!value->IsObject()) [[unlikely]] {
+            return Status::DataQualityError(
+                    "Json value isn't object, but the column `{}` is struct.", column_name);
         }
-        // return immediately to prevent from repeatedly insert_data
-        *valid = true;
-        return Status::OK();
-    default:
-        // for other type like array or object. we convert it to string to save
-        json_str = NewJsonReader::_print_json_value(*value);
-        wbytes = json_str.size();
-        str_value = json_str.c_str();
-        break;
-    }
+        auto sub_col_size = type_desc.children.size();
+        const auto& struct_value = value->GetObject();
+
+        auto sub_serdes = data_serde->get_nested_serdes();
+        auto struct_column_ptr = assert_cast<ColumnStruct*>(data_column_ptr);
+
+        std::map<std::string, size_t> sub_col_name_to_idx;
+        for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) {
+            sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx);
+        }
+
+        std::vector<rapidjson::Value::ConstValueIterator> sub_values(sub_col_size, nullptr);
+        for (const auto& sub : struct_value) {
+            if (!sub.name.IsString()) [[unlikely]] {
+                return Status::DataQualityError(
+                        "Json file struct column `{}` subfield name isn't a String", column_name);
+            }
+
+            auto sub_key_char = sub.name.GetString();
+            auto sub_key_length = sub.name.GetStringLength();
+
+            std::string sub_key(sub_key_char, sub_key_length);
+            std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower);
+
+            if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] {
+                continue;
+            }
+            size_t sub_column_idx = sub_col_name_to_idx[sub_key];
+            sub_values[sub_column_idx] = &sub.value;
+        }
+
+        for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) {
+            auto sub_value = sub_values[sub_col_idx];
+
+            const auto& sub_col_type = type_desc.children[sub_col_idx];
+
+            RETURN_IF_ERROR(_write_data_to_column(
+                    sub_value, sub_col_type, struct_column_ptr->get_column(sub_col_idx).get_ptr(),
+                    column_name + "." + type_desc.field_names[sub_col_idx], sub_serdes[sub_col_idx],
+                    valid));
+        }
+    } else if (type_desc.type == TYPE_MAP) {
+        if (!value->IsObject()) [[unlikely]] {
+            return Status::DataQualityError("Json value isn't object, but the column `{}` is map.",
+                                            column_name);
+        }
+ const auto& object_value = value->GetObject();
+ auto sub_serdes = data_serde->get_nested_serdes();
+ auto map_column_ptr = assert_cast<ColumnMap*>(data_column_ptr);
-    // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR',
-    // we need use a function to support these types to insert data in columns.
-    DCHECK(slot_desc->type().type == TYPE_VARCHAR || slot_desc->type().type == TYPE_STRING)
-            << slot_desc->type().type << ", query id: " << print_id(_state->query_id());
-    assert_cast<ColumnString*>(column_ptr)->insert_data(str_value, wbytes);
+        for (const auto& member_value : object_value) {
+            RETURN_IF_ERROR(_write_data_to_column(
+                    &member_value.name, type_desc.children[0],
+                    map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(),
+                    column_name + ".key", sub_serdes[0], valid));
+
+            RETURN_IF_ERROR(_write_data_to_column(
+                    &member_value.value, type_desc.children[1],
+                    map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(),
+                    column_name + ".value", sub_serdes[1], valid));
+        }
+
+        auto& offsets = map_column_ptr->get_offsets();
+        offsets.emplace_back(offsets.back() + object_value.MemberCount());
+    } else if (type_desc.type == TYPE_ARRAY) {
+        if (!value->IsArray()) [[unlikely]] {
+            return Status::DataQualityError("Json value isn't array, but the column `{}` is array.",
+                                            column_name);
+        }
+        const auto& array_value = value->GetArray();
+        auto sub_serdes = data_serde->get_nested_serdes();
+        auto array_column_ptr = assert_cast<ColumnArray*>(data_column_ptr);
+
+        for (const auto& sub_value : array_value) {
+            RETURN_IF_ERROR(_write_data_to_column(&sub_value, type_desc.children[0],
+                                                  array_column_ptr->get_data().get_ptr(),
+                                                  column_name + ".element", sub_serdes[0], valid));
+        }
+        auto& offsets = array_column_ptr->get_offsets();
+        offsets.emplace_back(offsets.back() + array_value.Size());
+    } else {
+        return Status::InternalError("Not support load to complex column.");
+    }
*valid = true;
return Status::OK();
@@ -914,20 +1052,21 @@ Status NewJsonReader::_write_columns_by_jsonpath(rapidjson::Value& objectValue,
                 // if json_values' size > 1, it means we just match an array, not a wrapped one, so no need to unwrap.
json_values = &((*json_values)[0]);
}
-            RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i], column_ptr, valid));
+            RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i]->type(), column_ptr,
+                                                  slot_descs[i]->col_name(), _serdes[i], valid));
if (!(*valid)) {
return Status::OK();
}
has_valid_value = true;
} else {
// not found, filling with default value
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
if (!(*valid)) {
return Status::OK();
}
}
}
- if (!has_valid_value) {
+ if (!has_valid_value && _is_load) {
         // there is no valid value in json line but has filled with default value before
// so remove this line in block
for (int i = 0; i < block.columns(); ++i) {
@@ -1074,7 +1213,7 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc
// step2: get json value by json doc
Status st = _get_json_value(&size, eof, &error, is_empty_row);
- if (st.is<DATA_QUALITY_ERROR>()) {
+ if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
return Status::OK();
}
RETURN_IF_ERROR(st);
@@ -1349,25 +1488,39 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
for (auto field : *value) {
std::string_view key = field.unescaped_key();
StringRef name_ref(key.data(), key.size());
+ std::string key_string;
+ if (_is_hive_table) {
+ key_string = name_ref.to_string();
+            std::transform(key_string.begin(), key_string.end(), key_string.begin(), ::tolower);
+ name_ref = StringRef(key_string);
+ }
const size_t column_index = _column_index(name_ref, key_index++);
if (UNLIKELY(ssize_t(column_index) < 0)) {
// This key is not exist in slot desc, just ignore
continue;
}
if (_seen_columns[column_index]) {
-            continue;
+            if (_is_hive_table) {
+                // Since the value can only be traversed once, we insert the original value first,
+                // then pop it back, and then insert the new value.
+                block.get_by_position(column_index).column->assume_mutable()->pop_back(1);
+            } else {
+                continue;
+            }
}
simdjson::ondemand::value val = field.value();
         auto* column_ptr = block.get_by_position(column_index).column->assume_mutable().get();
-        RETURN_IF_ERROR(
-                _simdjson_write_data_to_column(val, slot_descs[column_index], column_ptr, valid));
+        RETURN_IF_ERROR(_simdjson_write_data_to_column(
+                val, slot_descs[column_index]->type(), column_ptr,
+                slot_descs[column_index]->col_name(), _serdes[column_index], valid));
if (!(*valid)) {
return Status::OK();
}
_seen_columns[column_index] = true;
has_valid_value = true;
}
- if (!has_valid_value) {
+
+ if (!has_valid_value && _is_load) {
string col_names;
for (auto* slot_desc : slot_descs) {
col_names.append(slot_desc->col_name() + ", ");
@@ -1400,7 +1553,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
         auto* column_ptr = block.get_by_position(i).column->assume_mutable().get();
if (column_ptr->size() < cur_row_count + 1) {
DCHECK(column_ptr->size() == cur_row_count);
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
if (!(*valid)) {
return Status::OK();
}
@@ -1409,12 +1562,6 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
DCHECK(column_ptr->size() == cur_row_count + 1);
}
-#ifndef NDEBUG
- // Check all columns rows matched
- for (size_t i = 0; i < block.columns(); ++i) {
- DCHECK_EQ(block.get_by_position(i).column->size(), cur_row_count + 1);
- }
-#endif
// There is at least one valid value here
DCHECK(nullcount < block.columns());
*valid = true;
@@ -1422,54 +1569,180 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
}
 Status NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value,
-                                                     SlotDescriptor* slot_desc, IColumn* column,
-                                                     bool* valid) {
-    // write
+                                                     const TypeDescriptor& type_desc,
+                                                     vectorized::IColumn* column_ptr,
+                                                     const std::string& column_name,
+                                                     DataTypeSerDeSPtr serde, bool* valid) {
ColumnNullable* nullable_column = nullptr;
-    IColumn* column_ptr = nullptr;
-    if (slot_desc->is_nullable()) {
-        nullable_column = assert_cast<ColumnNullable*>(column);
-        column_ptr = &nullable_column->get_nested_column();
-    }
-    // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR',
-    // we need use a function to support these types to insert data in columns.
-    auto* column_string = assert_cast<ColumnString*>(column_ptr);
-    switch (value.type()) {
-    case simdjson::ondemand::json_type::null: {
-        if (column->is_nullable()) {
-            // insert_default already push 1 to null_map
-            nullable_column->insert_default();
+    vectorized::IColumn* data_column_ptr = column_ptr;
+    DataTypeSerDeSPtr data_serde = serde;
+
+    if (column_ptr->is_nullable()) {
+        nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
+
+        data_column_ptr = nullable_column->get_nested_column().get_ptr();
+        data_serde = serde->get_nested_serdes()[0];
+
+        // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType.
+        if (value.type() != simdjson::ondemand::json_type::null) {
+            nullable_column->get_null_map_data().push_back(0);
         } else {
+            nullable_column->insert_default();
+            *valid = true;
+            return Status::OK();
+        }
+    } else if (value.type() == simdjson::ondemand::json_type::null) [[unlikely]] {
+        if (_is_load) {
             RETURN_IF_ERROR(_append_error_msg(
                     nullptr, "Json value is null, but the column `{}` is not nullable.",
-                    slot_desc->col_name(), valid));
+                    column_name, valid));
             return Status::OK();
-        }
-        break;
-    }
-    case simdjson::ondemand::json_type::boolean: {
-        nullable_column->get_null_map_data().push_back(0);
-        if (value.get_bool()) {
-            column_string->insert_data("1", 1);
         } else {
-            column_string->insert_data("0", 1);
+            return Status::DataQualityError(
+                    "Json value is null, but the column `{}` is not nullable.", column_name);
         }
-        break;
-    }
-    default: {
+
+    if (_is_load || !type_desc.is_complex_type()) {
         if (value.type() == simdjson::ondemand::json_type::string) {
-            auto* unescape_buffer =
-                    reinterpret_cast<uint8_t*>(_simdjson_ondemand_unscape_padding_buffer.data());
-            std::string_view unescaped_value =
-                    _ondemand_json_parser->unescape(value.get_raw_json_string(), unescape_buffer);
-            nullable_column->get_null_map_data().push_back(0);
-            column_string->insert_data(unescaped_value.data(), unescaped_value.length());
-            break;
+            std::string_view value_string = value.get_string();
+            Slice slice {value_string.data(), value_string.size()};
+            RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice,
+                                                                       _serde_options));
+
+        } else {
+            // We could `switch` on the value type and handle numbers specially here.
+            // Note that the value may be an integer while the column is a FloatColumn.
+            std::string_view json_str = simdjson::to_json_string(value);
+            Slice slice {json_str.data(), json_str.size()};
+            RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice,
+                                                                       _serde_options));
+        }
+    } else if (type_desc.type == TYPE_STRUCT) {
+        if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] {
+            return Status::DataQualityError(
+                    "Json value isn't object, but the column `{}` is struct.", column_name);
+        }
+
+        auto sub_col_size = type_desc.children.size();
+        simdjson::ondemand::object struct_value = value.get_object();
+        auto sub_serdes = data_serde->get_nested_serdes();
+        auto struct_column_ptr = assert_cast<ColumnStruct*>(data_column_ptr);
+
+        std::map<std::string, size_t> sub_col_name_to_idx;
+        for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) {
+            sub_col_name_to_idx.emplace(type_desc.field_names[sub_col_idx], sub_col_idx);
+        }
+        vector<bool> has_value(sub_col_size, false);
+        for (simdjson::ondemand::field sub : struct_value) {
+            std::string_view sub_key_view = sub.unescaped_key();
+            std::string sub_key(sub_key_view.data(), sub_key_view.length());
+            std::transform(sub_key.begin(), sub_key.end(), sub_key.begin(), ::tolower);
+
+            if (sub_col_name_to_idx.find(sub_key) == sub_col_name_to_idx.end()) [[unlikely]] {
+                continue;
+            }
+            size_t sub_column_idx = sub_col_name_to_idx[sub_key];
+            auto sub_column_ptr = struct_column_ptr->get_column(sub_column_idx).get_ptr();
+
+            if (has_value[sub_column_idx]) [[unlikely]] {
+                // Since struct_value can only be traversed once, we can only insert
+                // the original value first, then delete it, and then reinsert the new value.
+                sub_column_ptr->pop_back(1);
+            }
+            has_value[sub_column_idx] = true;
+
+            const auto& sub_col_type = type_desc.children[sub_column_idx];
+            RETURN_IF_ERROR(_simdjson_write_data_to_column(
+                    sub.value(), sub_col_type, sub_column_ptr, column_name + "." + sub_key,
+                    sub_serdes[sub_column_idx], valid));
}
-        auto value_str = simdjson::to_json_string(value).value();
-        nullable_column->get_null_map_data().push_back(0);
-        column_string->insert_data(value_str.data(), value_str.length());
-    }
+
+        // Fill missing subcolumns.
+        for (size_t sub_col_idx = 0; sub_col_idx < sub_col_size; sub_col_idx++) {
+            if (has_value[sub_col_idx] == true) {
+                continue;
+            }
+
+            auto sub_column_ptr = struct_column_ptr->get_column(sub_col_idx).get_ptr();
+            if (sub_column_ptr->is_nullable()) {
+                sub_column_ptr->insert_default();
+                continue;
+            } else [[unlikely]] {
+                return Status::DataQualityError(
+                        "Json file structColumn miss field {} and this column isn't nullable.",
+                        column_name + "." + type_desc.field_names[sub_col_idx]);
+            }
+        }
+    } else if (type_desc.type == TYPE_MAP) {
+        if (value.type() != simdjson::ondemand::json_type::object) [[unlikely]] {
+            return Status::DataQualityError("Json value isn't object, but the column `{}` is map.",
+                                            column_name);
+        }
+        simdjson::ondemand::object object_value = value.get_object();
+
+        auto sub_serdes = data_serde->get_nested_serdes();
+        auto map_column_ptr = assert_cast<ColumnMap*>(data_column_ptr);
+
+        size_t field_count = 0;
+        for (simdjson::ondemand::field member_value : object_value) {
+            auto f = [](std::string_view key_view, const TypeDescriptor& type_desc,
+                        vectorized::IColumn* column_ptr, DataTypeSerDeSPtr serde,
+                        vectorized::DataTypeSerDe::FormatOptions serde_options, bool* valid) {
+                auto data_column_ptr = column_ptr;
+                auto data_serde = serde;
+                if (column_ptr->is_nullable()) {
+                    auto nullable_column = static_cast<ColumnNullable*>(column_ptr);
+
+                    nullable_column->get_null_map_data().push_back(0);
+                    data_column_ptr = nullable_column->get_nested_column().get_ptr();
+                    data_serde = serde->get_nested_serdes()[0];
+                }
+                Slice slice(key_view.data(), key_view.length());
+
+                RETURN_IF_ERROR(data_serde->deserialize_one_cell_from_json(*data_column_ptr, slice,
+                                                                           serde_options));
+                return Status::OK();
+            };
+
+            RETURN_IF_ERROR(f(member_value.unescaped_key(), type_desc.children[0],
+                              map_column_ptr->get_keys_ptr()->assume_mutable()->get_ptr(),
+                              sub_serdes[0], _serde_options, valid));
+
+            simdjson::ondemand::value field_value = member_value.value();
+            RETURN_IF_ERROR(_simdjson_write_data_to_column(
+                    field_value, type_desc.children[1],
+                    map_column_ptr->get_values_ptr()->assume_mutable()->get_ptr(),
+                    column_name + ".value", sub_serdes[1], valid));
+            field_count++;
+        }
+
+        auto& offsets = map_column_ptr->get_offsets();
+        offsets.emplace_back(offsets.back() + field_count);
+
+    } else if (type_desc.type == TYPE_ARRAY) {
+        if (value.type() != simdjson::ondemand::json_type::array) [[unlikely]] {
+            return Status::DataQualityError("Json value isn't array, but the column `{}` is array.",
+                                            column_name);
+        }
+
+        simdjson::ondemand::array array_value = value.get_array();
+
+        auto sub_serdes = data_serde->get_nested_serdes();
+        auto array_column_ptr = assert_cast<ColumnArray*>(data_column_ptr);
+
+        int field_count = 0;
+        for (simdjson::ondemand::value sub_value : array_value) {
+            RETURN_IF_ERROR(_simdjson_write_data_to_column(
+                    sub_value, type_desc.children[0], array_column_ptr->get_data().get_ptr(),
+                    column_name + ".element", sub_serdes[0], valid));
+            field_count++;
+        }
+        auto& offsets = array_column_ptr->get_offsets();
+        offsets.emplace_back(offsets.back() + field_count);
+
+    } else {
+        return Status::InternalError("Not support load to complex column.");
}
*valid = true;
return Status::OK();
@@ -1677,13 +1950,14 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
has_valid_value = true;
} else if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
// not match in jsondata, filling with default value
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
if (!(*valid)) {
return Status::OK();
}
} else {
-            RETURN_IF_ERROR(
-                    _simdjson_write_data_to_column(json_value, slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_simdjson_write_data_to_column(json_value, slot_desc->type(),
+                                                           column_ptr, slot_desc->col_name(),
+                                                           _serdes[i], valid));
if (!(*valid)) {
return Status::OK();
}
@@ -1741,25 +2015,30 @@ Status NewJsonReader::_get_column_default_value(
return Status::OK();
}
-Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, IColumn* column_ptr,
-                                           bool* valid) {
-    if (slot_desc->is_nullable()) {
-        auto* nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
-        column_ptr = &nullable_column->get_nested_column();
-        auto col_value = _col_default_value_map.find(slot_desc->col_name());
-        if (col_value == _col_default_value_map.end()) {
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde,
+                                           IColumn* column_ptr, bool* valid) {
+    auto col_value = _col_default_value_map.find(slot_desc->col_name());
+    if (col_value == _col_default_value_map.end()) {
+        if (slot_desc->is_nullable()) {
+            auto* nullable_column = static_cast<ColumnNullable*>(column_ptr);
             nullable_column->insert_default();
         } else {
-            const std::string& v_str = col_value->second;
-            nullable_column->get_null_map_data().push_back(0);
-            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), v_str.size());
+            if (_is_load) {
+                RETURN_IF_ERROR(_append_error_msg(
+                        nullptr, "The column `{}` is not nullable, but it's not found in jsondata.",
+                        slot_desc->col_name(), valid));
+            } else {
+                return Status::DataQualityError(
+                        "The column `{}` is not nullable, but it's not found in jsondata.",
+                        slot_desc->col_name());
+            }
         }
     } else {
-        RETURN_IF_ERROR(_append_error_msg(
-                nullptr, "The column `{}` is not nullable, but it's not found in jsondata.",
-                slot_desc->col_name(), valid));
+        const std::string& v_str = col_value->second;
+        Slice column_default_value {v_str};
+        RETURN_IF_ERROR(serde->deserialize_one_cell_from_json(*column_ptr, column_default_value,
+                                                              _serde_options));
     }
-
*valid = true;
return Status::OK();
}
diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h
index 0df3747b8c2..6828b6b2abf 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -88,7 +88,8 @@ public:
~NewJsonReader() override = default;
     Status init_reader(const std::unordered_map<std::string, vectorized::VExprContextSPtr>&
-                               col_default_value_ctx);
+                               col_default_value_ctx,
+                       bool is_load);
Status get_next_block(Block* block, size_t* read_rows, bool* eof) override;
     Status get_columns(std::unordered_map<std::string, TypeDescriptor>* name_to_type,
std::unordered_set<std::string>* missing_cols) override;
@@ -129,7 +130,8 @@ private:
const std::vector<SlotDescriptor*>& slot_descs,
bool* valid);
     Status _write_data_to_column(rapidjson::Value::ConstValueIterator value,
-                                 SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr,
+                                 const TypeDescriptor& type_desc, vectorized::IColumn* column_ptr,
+                                 const std::string& column_name, DataTypeSerDeSPtr serde,
                                  bool* valid);
Status _write_columns_by_jsonpath(rapidjson::Value& objectValue,
@@ -178,8 +180,10 @@ private:
                                          const std::vector<SlotDescriptor*>& slot_descs, bool* valid);
     Status _simdjson_write_data_to_column(simdjson::ondemand::value& value,
-                                          SlotDescriptor* slot_desc,
-                                          vectorized::IColumn* column_ptr, bool* valid);
+                                          const TypeDescriptor& type_desc,
+                                          vectorized::IColumn* column_ptr,
+                                          const std::string& column_name, DataTypeSerDeSPtr serde,
+                                          bool* valid);
     Status _simdjson_write_columns_by_jsonpath(simdjson::ondemand::object* value,
                                                const std::vector<SlotDescriptor*>& slot_descs,
@@ -197,8 +201,8 @@ private:
             const std::unordered_map<std::string, vectorized::VExprContextSPtr>& col_default_value_ctx);
-    Status _fill_missing_column(SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr,
-                                bool* valid);
+    Status _fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde,
+                                vectorized::IColumn* column_ptr, bool* valid);
RuntimeState* _state = nullptr;
RuntimeProfile* _profile = nullptr;
@@ -283,6 +287,22 @@ private:
std::unique_ptr<simdjson::ondemand::parser> _ondemand_json_parser;
// column to default value string map
std::unordered_map<std::string, std::string> _col_default_value_map;
+
+    bool _is_load = true;
+    // Whether this reader serves a (stream) load. During a load, data is only inserted into
+    // ColumnString, and when an illegal value is encountered, `_append_error_msg` should be
+    // called instead of directly returning `Status::DataQualityError`.
+
+    bool _is_hive_table = false;
+    // In Hive: create table xxx ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
+    // Hive does not allow creating columns whose names differ only in case, including field
+    // names inside structs, and automatically converts uppercase names in the create SQL to
+    // lowercase. However, when Hive loads data into a table, the column names in the data may
+    // be uppercase, and there may be multiple columns with the same name but different
+    // capitalization. Following Hive's behavior, we convert all column names in the data to
+    // lowercase and use the last one as the insertion value.
+
+    DataTypeSerDeSPtrs _serdes;
+    vectorized::DataTypeSerDe::FormatOptions _serde_options;
};
} // namespace vectorized
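
Note: a sketch of the case-folding rule described in the header comments above,
using one line of the json_load_data_table test data added below (the expected
values follow our reading of "lowercase everything, last duplicate wins" and are
not verified query output):

    -- JSON line: {"id":2,"col1":10,"col1":20,"col2":{"col2b":"string2","col2a":0,"Col2A":20},"col3":{"2":"string2"}}
    -- `col1` appears twice; the last value (20) is used.
    -- Inside `col2`, `col2a` and `Col2A` both fold to `col2a`; the last value (20) wins.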
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp
index 997eef02090..ba8048f73a9 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -931,8 +931,8 @@ Status VFileScanner::_get_next_reader() {
             _cur_reader = NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range,
                                                        _file_slot_descs, &_scanner_eof, _io_ctx.get());
-            init_status =
-                    ((NewJsonReader*)(_cur_reader.get()))->init_reader(_col_default_value_ctx);
+            init_status = ((NewJsonReader*)(_cur_reader.get()))
+                                  ->init_reader(_col_default_value_ctx, _is_load);
break;
}
case TFileFormatType::FORMAT_AVRO: {
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql
new file mode 100644
index 00000000000..adf0f7d56b2
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run69.hql
@@ -0,0 +1,35 @@
+use `default`;
+
+
+CREATE TABLE json_nested_complex_table (
+ user_ID STRING,
+ user_PROFILE STRUCT<
+ name: STRING,
+ AGE: INT,
+ preferences: MAP<
+ STRING,
+ STRUCT<
+ preference_ID: INT,
+ preference_VALUES: ARRAY<STRING>
+ >
+ >
+ >,
+ activity_LOG ARRAY<
+ STRUCT<
+ activity_DATE: STRING,
+ activities: MAP<
+ STRING,
+ STRUCT<
+ `DETAILS`: STRING,
+ metrics: MAP<STRING, float>
+ >
+ >
+ >
+ >
+) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+
+LOCATION
+ '/user/doris/preinstalled_data/json/json_nested_complex_table';
+
+
+msck repair table json_nested_complex_table;
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql
new file mode 100644
index 00000000000..73df8cba557
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run70.hql
@@ -0,0 +1,73 @@
+use `default`;
+
+
+CREATE TABLE json_all_complex_types (
+ `id` int,
+ `boolean_col` boolean,
+ `tinyint_col` tinyint,
+ `smallint_col` smallint,
+ `int_col` int,
+ `bigint_col` bigint,
+ `float_col` float,
+ `double_col` double,
+ `decimal_col1` decimal(9,0),
+ `decimal_col2` decimal(8,4),
+ `decimal_col3` decimal(18,6),
+ `decimal_col4` decimal(38,12),
+ `string_col` string,
+ `binary_col` binary,
+ `date_col` date,
+ `timestamp_col1` timestamp,
+ `timestamp_col2` timestamp,
+ `timestamp_col3` timestamp,
+ `char_col1` char(50),
+ `char_col2` char(100),
+ `char_col3` char(255),
+ `varchar_col1` varchar(50),
+ `varchar_col2` varchar(100),
+ `varchar_col3` varchar(255),
+ `t_map_string` map<string,string>,
+ `t_map_varchar` map<varchar(65535),varchar(65535)>,
+ `t_map_char` map<char(10),char(10)>,
+ `t_map_int` map<int,int>,
+ `t_map_bigint` map<bigint,bigint>,
+ `t_map_float` map<float,float>,
+ `t_map_double` map<double,double>,
+ `t_map_boolean` map<boolean,boolean>,
+ `t_map_decimal_precision_2` map<decimal(2,1),decimal(2,1)>,
+ `t_map_decimal_precision_4` map<decimal(4,2),decimal(4,2)>,
+ `t_map_decimal_precision_8` map<decimal(8,4),decimal(8,4)>,
+ `t_map_decimal_precision_17` map<decimal(17,8),decimal(17,8)>,
+ `t_map_decimal_precision_18` map<decimal(18,8),decimal(18,8)>,
+ `t_map_decimal_precision_38` map<decimal(38,16),decimal(38,16)>,
+ `t_array_string` array<string>,
+ `t_array_int` array<int>,
+ `t_array_bigint` array<bigint>,
+ `t_array_float` array<float>,
+ `t_array_double` array<double>,
+ `t_array_boolean` array<boolean>,
+ `t_array_varchar` array<varchar(65535)>,
+ `t_array_char` array<char(10)>,
+ `t_array_decimal_precision_2` array<decimal(2,1)>,
+ `t_array_decimal_precision_4` array<decimal(4,2)>,
+ `t_array_decimal_precision_8` array<decimal(8,4)>,
+ `t_array_decimal_precision_17` array<decimal(17,8)>,
+ `t_array_decimal_precision_18` array<decimal(18,8)>,
+ `t_array_decimal_precision_38` array<decimal(38,16)>,
+ `t_struct_bigint` struct<s_bigint:bigint>,
+ `t_complex` map<string,array<struct<s_int:int>>>,
+ `t_struct_nested` struct<struct_field:array<string>>,
+ `t_struct_null` struct<struct_field_null:string,struct_field_null2:string>,
+  `t_struct_non_nulls_after_nulls` struct<struct_non_nulls_after_nulls1:int,struct_non_nulls_after_nulls2:string>,
+  `t_nested_struct_non_nulls_after_nulls` struct<struct_field1:int,struct_field2:string,strict_field3:struct<nested_struct_field1:int,nested_struct_field2:string>>,
+ `t_map_null_value` map<string,string>,
+ `t_array_string_starting_with_nulls` array<string>,
+ `t_array_string_with_nulls_in_between` array<string>,
+ `t_array_string_ending_with_nulls` array<string>,
+ `t_array_string_all_nulls` array<string>
+ ) PARTITIONED BY (`dt` string)
+ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+LOCATION
+ '/user/doris/preinstalled_data/json/json_all_complex_types';
+
+msck repair table json_all_complex_types;
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql
new file mode 100644
index 00000000000..ec99e72d2f5
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run71.hql
@@ -0,0 +1,13 @@
+use `default`;
+
+
+CREATE TABLE json_load_data_table (
+ `id` int,
+ `col1` int,
+ `col2` struct< col2a:int, col2b:string>,
+ `col3` map<int,string>
+) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+LOCATION
+ '/user/doris/preinstalled_data/json/json_load_data_table';
+
+msck repair table json_load_data_table;
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0
new file mode 100644
index 00000000000..5fe37cbc6f0
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt1/000000_0
@@ -0,0 +1,3 @@
+{"id":1,"boolean_col":true,"tinyint_col":127,"smallint_col":32767,"int_col":2147483647,"bigint_col":9223372036854775807,"float_col":123.45,"double_col":123456.789,"decimal_col1":123456789,"decimal_col2":1234.5678,"decimal_col3":123456.789012,"decimal_col4":123456789.012345678901,"string_col":"string_value","binary_col":"binary_value","date_col":"2024-03-20","timestamp_col1":"2024-03-20
12:00:00","timestamp_col2":"2024-03-20
12:00:00.123456789","timestamp_col3":"2024-03-20 12:00:00.123456 [...]
+{"id":2,"boolean_col":false,"tinyint_col":58,"smallint_col":12345,"int_col":2147483000,"bigint_col":null,"float_col":789.56,"double_col":654321.123,"decimal_col1":987654321,"decimal_col2":5678.1234,"decimal_col3":987654.321098,"decimal_col4":987654321.098765432109,"string_col":"changed_string","binary_col":"new_binary_value","date_col":"2025-05-25","timestamp_col1":"2025-05-25
15:30:00","timestamp_col2":"2025-05-25
15:30:00.654321987","timestamp_col3":"2025-05-25 15:30:00.654321987","cha [...]
+{"id":3,"boolean_col":false,"tinyint_col":-128,"smallint_col":-32768,"int_col":-2147483648,"bigint_col":-9223372036854775808,"float_col":-3.4028235E38,"double_col":-1.7976931348623157E308,"decimal_col1":-999999999,"decimal_col2":-9999.9999,"decimal_col3":-999999999.999999,"decimal_col4":null,"string_col":"min_string_value","binary_col":"xxxx","date_col":"2001-01-01","timestamp_col1":"2001-01-01
00:00:00","timestamp_col2":"2001-01-01 00:00:00","timestamp_col3":"2001-01-01
00:00:00","char_ [...]
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0
new file mode 100644
index 00000000000..0a823bee693
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt2/000000_0
@@ -0,0 +1 @@
+{"id":4,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":123.45,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map
[...]
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0
new file mode 100644
index 00000000000..a5e46399fdd
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_all_complex_types/dt=dt3/000000_0
@@ -0,0 +1,2 @@
+{"id":5,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_c
[...]
+{"id":6,"boolean_col":null,"tinyint_col":null,"smallint_col":null,"int_col":null,"bigint_col":null,"float_col":null,"double_col":null,"decimal_col1":null,"decimal_col2":null,"decimal_col3":null,"decimal_col4":null,"string_col":null,"binary_col":null,"date_col":null,"timestamp_col1":null,"timestamp_col2":null,"timestamp_col3":null,"char_col1":null,"char_col2":null,"char_col3":null,"varchar_col1":null,"varchar_col2":null,"varchar_col3":null,"t_map_string":null,"t_map_varchar":null,"t_map_c
[...]
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1
new file mode 100644
index 00000000000..70d1265f98d
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_load_data_table/1
@@ -0,0 +1,13 @@
+{"id":1,"col1":10,"col2":{"col2a":10,"col2b":"string1"},"col3":{"1":"string10"}}
+{"id":2,"col1":10,"col1":20,"col2":{"col2b":"string2","col2a":0,"Col2A":20},"col3":{"2":"string2"}}
+{"id":3,"col1":10,"col1":20,"COL1":30,"COL2":{"col2a":30,"col2b":"string3"}}
+{"id":4,"COL1":40,"col2":{"col2a":10,"col2b":"string4","new_col":"new_val","col2a":40},"col3":{"4":"string4"}}
+{"id":5}
+{"id":6,"col1":60,"col2":{"COL2a":60,"col2b":600},"col3":{"6":600}}
+{"id":7,"col1":70,"col3":{"7":"string7"},"col2":{"col2b":"string7","col2a":70}}
+
+
+
+
+{}
+{"a":5}
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1
new file mode 100644
index 00000000000..11342c441bc
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/1
@@ -0,0 +1,2 @@
+{"user_id":"user1","user_profile":{"name":"Alice","age":28,"preferences":{"sports":{"preference_id":101,"preference_values":["soccer","tennis"]},"music":{"preference_id":102,"preference_values":["rock","classical"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"workout":{"details":"Morning
run","metrics":{"duration":30.5,"calories":200.0}},"reading":{"details":"Read
book on
Hive","metrics":{"pages":50.0,"time":2.0}}}},{"activity_date":"2024-08-02","activities":{"travel":
[...]
+{"user_id":"user2","user_profile":{"name":"Bob","age":32,"preferences":{"books":{"preference_id":201,"preference_values":["fiction","non-fiction"]},"travel":{"preference_id":202,"preference_values":["beaches","mountains"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"hiking":{"details":"Mountain
trail","metrics":{"distance":10.0,"elevation":500.0}},"photography":{"details":"Wildlife
photoshoot","metrics":{"photos_taken":100.0,"time":4.0}}}},{"activity_date":"2024-08-02"
[...]
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2
new file mode 100644
index 00000000000..e1b0befc7bc
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/2
@@ -0,0 +1 @@
+{"user_id":"user3","user_profile":{"name":"Carol","age":24,"preferences":{"food":{"preference_id":301,"preference_values":["vegan","desserts"]},"movies":{"preference_id":302,"preference_values":["action","comedy"]}}},"activity_log":[{"activity_date":"2024-08-01","activities":{"cooking":{"details":"Made
vegan
meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"details":"Watched
action
movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_date":"2024-08-02","activities":{
[...]
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2
new file mode 100644
index 00000000000..08f1586f3aa
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/json_nested_complex_table/modify_2
@@ -0,0 +1,2 @@
+{"user_ID":"user4","user_PROFILE":{"name":"Carol","age":24,"preferences":{"food":{"preference_ID":301,"preference_VALUES":["vegan","desserts"]},"movies":{"preference_ID":302,"preference_VALUES":["action","comedy"]}}},"activity_LOG":[{"activity_DATE":"2024-08-01","activities":{"cooking":{"DETAILS":"Made
vegan
meal","metrics":{"time_spent":1.5,"calories":500.0}},"movie":{"DETAILS":"Watched
action
movie","metrics":{"duration":2.0,"rating":8.5}}}},{"activity_DATE":"2024-08-02","activities":{
[...]
+
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
index 97032467cec..0f839d238b2 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
@@ -94,6 +94,9 @@ public class HiveMetaStoreClientHelper {
private static final Pattern digitPattern = Pattern.compile("(\\d+)");
+    public static final String HIVE_JSON_SERDE = "org.apache.hive.hcatalog.data.JsonSerDe";
+    public static final String LEGACY_HIVE_JSON_SERDE = "org.apache.hadoop.hive.serde2.JsonSerDe";
+
+
public enum HiveFileFormat {
TEXT_FILE(0, "text"),
PARQUET(1, "parquet"),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index e710bdb935d..3a2a4d3eb5c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -364,14 +364,21 @@ public class HiveScanNode extends FileQueryScanNode {
@Override
public TFileFormatType getFileFormatType() throws UserException {
TFileFormatType type = null;
-        String inputFormatName = hmsTable.getRemoteTable().getSd().getInputFormat();
+        Table table = hmsTable.getRemoteTable();
+        String inputFormatName = table.getSd().getInputFormat();
         String hiveFormat = HiveMetaStoreClientHelper.HiveFileFormat.getFormat(inputFormatName);
         if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.PARQUET.getDesc())) {
             type = TFileFormatType.FORMAT_PARQUET;
         } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.ORC.getDesc())) {
             type = TFileFormatType.FORMAT_ORC;
         } else if (hiveFormat.equals(HiveMetaStoreClientHelper.HiveFileFormat.TEXT_FILE.getDesc())) {
-            type = TFileFormatType.FORMAT_CSV_PLAIN;
+            String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
+            if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE)
+                    || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) {
+                type = TFileFormatType.FORMAT_JSON;
+            } else {
+                type = TFileFormatType.FORMAT_CSV_PLAIN;
+            }
}
return type;
}
@@ -383,11 +390,12 @@ public class HiveScanNode extends FileQueryScanNode {
@Override
protected TFileAttributes getFileAttributes() throws UserException {
- TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
+ TFileAttributes fileAttributes = new TFileAttributes();
Table table = hmsTable.getRemoteTable();
// TODO: separate hive text table and OpenCsv table
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
         if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
// set properties of LazySimpleSerDe
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
@@ -401,7 +409,10 @@ public class HiveScanNode extends FileQueryScanNode {
             HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 6. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
+ fileAttributes.setTextParams(textParams);
+ fileAttributes.setHeaderType("");
         } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
+            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
             // set properties of OpenCSVSerde
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
@@ -411,17 +422,29 @@ public class HiveScanNode extends FileQueryScanNode {
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
// 4. set escape char
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
+ fileAttributes.setTextParams(textParams);
+ fileAttributes.setHeaderType("");
+ if (textParams.isSetEnclose()) {
+ fileAttributes.setTrimDoubleQuotes(true);
+ }
+ } else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe"))
{
+ TFileTextScanRangeParams textParams = new
TFileTextScanRangeParams();
+ textParams.setColumnSeparator("\t");
+ textParams.setLineDelimiter("\n");
+ fileAttributes.setTextParams(textParams);
+
+ fileAttributes.setJsonpaths("");
+ fileAttributes.setJsonRoot("");
+ fileAttributes.setNumAsString(true);
+ fileAttributes.setFuzzyParse(false);
+ fileAttributes.setReadJsonByLine(true);
+ fileAttributes.setStripOuterArray(false);
+ fileAttributes.setHeaderType("");
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
}
- TFileAttributes fileAttributes = new TFileAttributes();
- fileAttributes.setTextParams(textParams);
- fileAttributes.setHeaderType("");
- if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) {
- fileAttributes.setTrimDoubleQuotes(true);
- }
return fileAttributes;
}
diff --git a/regression-test/data/external_table_p0/hive/hive_json_basic_test.out b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out
new file mode 100644
index 00000000000..9023f5d72b1
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/hive_json_basic_test.out
@@ -0,0 +1,115 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !q1 --
+1	true	127	32767	2147483647	9223372036854775807	123.45	123456.789	123456789	1234.5678	123456.789012	123456789.012345678901	string_value	binary_value	2024-03-20	2024-03-20T12:00	2024-03-20T12:00:00.123457	2024-03-20T12:00:00.123457	char_value1	char_value2	char_value3	[...]
+2	false	58	12345	2147483000	\N	789.56	654321.123	987654321	5678.1234	987654.321098	987654321.098765432109	changed_string	new_binary_value	2025-05-25	2025-05-25T15:30	2025-05-25T15:30:00.654322	2025-05-25T15:30:00.654322	char_new_value1	char_new_value2	char_new_value3	[...]
+3	false	-128	-32768	-2147483648	-9223372036854775808	-3.4028235e+38	-1.7976931348623157E308	-999999999	-9999.9999	-999999999.999999	\N	min_string_value	xxxx	2001-01-01	2001-01-01T00:00	2001-01-01T00:00	2001-01-01T00:00	char_min_value1	char_min_value2	char_min_value3	[...]
+4	\N	\N	\N	\N	\N	123.45	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	{1:10}	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	[1.2345, 2.3456]	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	[null, "value1", "value2"]	\N	\N	\N	dt2
+5	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	dt3
+6	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	dt3
+
+-- !q2 --
+3	false	-128	-32768	-2147483648	-9223372036854775808	-3.4028235e+38	-1.7976931348623157E308	-999999999	-9999.9999	-999999999.999999	\N	min_string_value	xxxx	2001-01-01	2001-01-01T00:00	2001-01-01T00:00	2001-01-01T00:00	char_min_value1	char_min_value2	char_min_value3	[...]
+
+-- !q3 --
+1	true	127	32767	2147483647	9223372036854775807	123.45	123456.789	123456789	1234.5678	123456.789012	123456789.012345678901	string_value	binary_value	2024-03-20	2024-03-20T12:00	2024-03-20T12:00:00.123457	2024-03-20T12:00:00.123457	char_value1	char_value2	char_value3	[...]
+
+-- !q4 --
+123.45
+789.56
+-3.4028235e+38
+123.45
+
+-- !q5 --
+2	false	58	12345	2147483000	\N	789.56	654321.123	987654321	5678.1234	987654.321098	987654321.098765432109	changed_string	new_binary_value	2025-05-25	2025-05-25T15:30	2025-05-25T15:30:00.654322	2025-05-25T15:30:00.654322	char_new_value1	char_new_value2	char_new_value3	[...]
+
+-- !q6 --
+user1	{"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}}	[{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metric [...]
+user2	{"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}}	[{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details" [...]
+user3	{"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}}	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...]
+user4	{"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}}	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...]
+
+-- !q7 --
+user1	[{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}]
+user2	[{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}]
+user3	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+user4	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+
+-- !q8 --
+\N \N \N \N
+\N \N \N \N
+1 10 {"col2a":10, "col2b":"string1"} {1:"string10"}
+2 20 {"col2a":20, "col2b":"string2"} {2:"string2"}
+3 30 {"col2a":30, "col2b":"string3"} \N
+4 40 {"col2a":40, "col2b":"string4"} {4:"string4"}
+5 \N \N \N
+6 60 {"col2a":60, "col2b":"600"} {6:"600"}
+7 70 {"col2a":70, "col2b":"string7"} {7:"string7"}
+
+-- !q9 --
+\N \N
+\N \N
+\N 5
+10 1
+20 2
+30 3
+40 4
+60 6
+70 7
+
+-- !q1 --
+1	true	127	32767	2147483647	9223372036854775807	123.45	123456.789	123456789	1234.5678	123456.789012	123456789.012345678901	string_value	binary_value	2024-03-20	2024-03-20T12:00	2024-03-20T12:00:00.123457	2024-03-20T12:00:00.123457	char_value1	char_value2	char_value3	[...]
+2	false	58	12345	2147483000	\N	789.56	654321.123	987654321	5678.1234	987654.321098	987654321.098765432109	changed_string	new_binary_value	2025-05-25	2025-05-25T15:30	2025-05-25T15:30:00.654322	2025-05-25T15:30:00.654322	char_new_value1	char_new_value2	char_new_value3	[...]
+3	false	-128	-32768	-2147483648	-9223372036854775808	-3.4028235e+38	-1.7976931348623157E308	-999999999	-9999.9999	-999999999.999999	\N	min_string_value	xxxx	2001-01-01	2001-01-01T00:00	2001-01-01T00:00	2001-01-01T00:00	char_min_value1	char_min_value2	char_min_value3	[...]
+4	\N	\N	\N	\N	\N	123.45	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	{1:10}	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	[1.2345, 2.3456]	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	[null, "value1", "value2"]	\N	\N	\N	dt2
+5	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	dt3
+6	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	\N	dt3
+
+-- !q2 --
+3	false	-128	-32768	-2147483648	-9223372036854775808	-3.4028235e+38	-1.7976931348623157E308	-999999999	-9999.9999	-999999999.999999	\N	min_string_value	xxxx	2001-01-01	2001-01-01T00:00	2001-01-01T00:00	2001-01-01T00:00	char_min_value1	char_min_value2	char_min_value3	[...]
+
+-- !q3 --
+1	true	127	32767	2147483647	9223372036854775807	123.45	123456.789	123456789	1234.5678	123456.789012	123456789.012345678901	string_value	binary_value	2024-03-20	2024-03-20T12:00	2024-03-20T12:00:00.123457	2024-03-20T12:00:00.123457	char_value1	char_value2	char_value3	[...]
+
+-- !q4 --
+123.45
+789.56
+-3.4028235e+38
+123.45
+
+-- !q5 --
+2	false	58	12345	2147483000	\N	789.56	654321.123	987654321	5678.1234	987654.321098	987654321.098765432109	changed_string	new_binary_value	2025-05-25	2025-05-25T15:30	2025-05-25T15:30:00.654322	2025-05-25T15:30:00.654322	char_new_value1	char_new_value2	char_new_value3	[...]
+
+-- !q6 --
+user1	{"name":"Alice", "age":28, "preferences":{"sports":{"preference_id":101, "preference_values":["soccer", "tennis"]}, "music":{"preference_id":102, "preference_values":["rock", "classical"]}}}	[{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metric [...]
+user2	{"name":"Bob", "age":32, "preferences":{"books":{"preference_id":201, "preference_values":["fiction", "non-fiction"]}, "travel":{"preference_id":202, "preference_values":["beaches", "mountains"]}}}	[{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details" [...]
+user3	{"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}}	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...]
+user4	{"name":"Carol", "age":24, "preferences":{"food":{"preference_id":301, "preference_values":["vegan", "desserts"]}, "movies":{"preference_id":302, "preference_values":["action", "comedy"]}}}	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength train [...]
+
+-- !q7 --
+user1	[{"activity_date":"2024-08-01", "activities":{"workout":{"details":"Morning run", "metrics":{"duration":30.5, "calories":200}}, "reading":{"details":"Read book on Hive", "metrics":{"pages":50, "time":2}}}}, {"activity_date":"2024-08-02", "activities":{"travel":{"details":"Flight to NY", "metrics":{"distance":500, "time":3}}, "meeting":{"details":"Project meeting", "metrics":{"duration":1.5, "participants":5}}}}]
+user2	[{"activity_date":"2024-08-01", "activities":{"hiking":{"details":"Mountain trail", "metrics":{"distance":10, "elevation":500}}, "photography":{"details":"Wildlife photoshoot", "metrics":{"photos_taken":100, "time":4}}}}, {"activity_date":"2024-08-02", "activities":{"workshop":{"details":"Photography workshop", "metrics":{"duration":3, "participants":15}}, "shopping":{"details":"Bought camera gear", "metrics":{"items":5, "cost":1500}}}}]
+user3	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+user4	[{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]
+
+-- !q8 --
+\N \N \N \N
+\N \N \N \N
+1 10 {"col2a":10, "col2b":"string1"} {1:"string10"}
+2 20 {"col2a":20, "col2b":"string2"} {2:"string2"}
+3 30 {"col2a":30, "col2b":"string3"} \N
+4 40 {"col2a":40, "col2b":"string4"} {4:"string4"}
+5 \N \N \N
+6 60 {"col2a":60, "col2b":"600"} {6:"600"}
+7 70 {"col2a":70, "col2b":"string7"} {7:"string7"}
+
+-- !q9 --
+\N \N
+\N \N
+\N 5
+10 1
+20 2
+30 3
+40 4
+60 6
+70 7
+
diff --git a/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy
new file mode 100644
index 00000000000..9d05e1a4c74
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/hive_json_basic_test.groovy
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("hive_json_basic_test",
"p0,external,hive,external_docker,external_docker_hive") {
+
+
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2", "hive3"]) {
+ try {
+ String externalEnvIp =
context.config.otherConfigs.get("externalEnvIp")
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String catalog_name = "${hivePrefix}_hive_json_basic_test"
+ String broker_name = "hdfs"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+ 'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`default`"""
+
+ String tb1 = """json_all_complex_types"""
+ String tb2 = """json_nested_complex_table"""
+ String tb3 = """json_load_data_table"""
+
+ def tables = sql """ show tables """
+ logger.info("tables = ${tables}")
+
+ qt_q1 """ select * from ${tb1} order by id """
+ qt_q2 """ select * from ${tb1} where tinyint_col < 0 order by id
"""
+ qt_q3 """ select * from ${tb1} where bigint_col > 0 order by id """
+ qt_q4 """ select float_col from ${tb1} where float_col is not null
order by id """
+ qt_q5 """ select * from ${tb1} where id = 2 order by id """
+
+
+
+ qt_q6 """ select * from ${tb2} order by user_id"""
+ qt_q7 """ select user_id,activity_log from ${tb2} order by
user_id"""
+
+
+ order_qt_q8 """ select * from ${tb3} order by id """
+
+ order_qt_q9 """ select col1,id from ${tb3} order by id """
+
+
+
+
+ sql """drop catalog if exists ${catalog_name}"""
+ } finally {
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]