This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 119a5dafb36 branch-4.0: [Feature](serd)Support read column from json
arrow. #58429 (#58757)
119a5dafb36 is described below
commit 119a5dafb36b09d3f1cc7f63afa321bd5bdd69a4
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Dec 9 17:27:27 2025 +0800
branch-4.0: [Feature](serd)Support read column from json arrow. #58429
(#58757)
Cherry-picked from #58429
Co-authored-by: Chen768959 <[email protected]>
---
.../vec/data_types/serde/data_type_jsonb_serde.cpp | 70 ++++++++++++++++++++++
.../vec/data_types/serde/data_type_jsonb_serde.h | 2 +
.../serde/data_type_jsonb_serde_test.cpp | 49 +++++++++++++++
.../test_remote_doris_all_types_select.out | 8 +--
.../test_remote_doris_all_types_show.out | 3 +-
.../test_remote_doris_all_types_select.groovy | 9 +--
.../test_remote_doris_all_types_show.groovy | 1 +
7 files changed, 133 insertions(+), 9 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
index 53afaf24cfc..9e5ce5ae68a 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
@@ -120,6 +120,76 @@ Status DataTypeJsonbSerDe::write_column_to_arrow(const
IColumn& column, const Nu
return Status::OK();
}
+// Appends rows [first_row, last_row) of a variable-width Arrow binary/string array to `column`.
+// Every non-null value is handed to JsonBinaryValue::from_json_string and the resulting binary
+// JSON is inserted; null rows become the column's default value. The first parse error aborts
+// the read and is returned to the caller.
+//
+// Offsets are loaded with memcpy because the offsets buffer may wrap memory that is not aligned
+// for offset_type. The array's logical offset() is added because value_offsets() exposes the
+// whole buffer, not the sliced view that IsNull() operates on.
+template <typename ArrowBinaryArrayT>
+static Status append_jsonb_from_var_width_arrow(IColumn& column, const ArrowBinaryArrayT& array,
+                                                int64_t first_row, int64_t last_row) {
+    using offset_type = typename ArrowBinaryArrayT::offset_type;
+
+    // Keep the buffers alive for the loop, but only dereference them for non-null rows.
+    const std::shared_ptr<arrow::Buffer> value_buffer = array.value_data();
+    const std::shared_ptr<arrow::Buffer> offset_buffer = array.value_offsets();
+
+    JsonBinaryValue parsed;
+    for (int64_t row = first_row; row < last_row; ++row) {
+        if (array.IsNull(row)) {
+            column.insert_default();
+            continue;
+        }
+
+        // Logical row `row` lives at physical index offset() + row of the offsets buffer.
+        const uint8_t* offset_pos =
+                offset_buffer->data() + (array.offset() + row) * sizeof(offset_type);
+        offset_type value_begin = 0;
+        offset_type value_end = 0;
+        memcpy(&value_begin, offset_pos, sizeof(offset_type));
+        memcpy(&value_end, offset_pos + sizeof(offset_type), sizeof(offset_type));
+
+        const auto* json_text = reinterpret_cast<const char*>(value_buffer->data() + value_begin);
+        RETURN_IF_ERROR(parsed.from_json_string(json_text, value_end - value_begin));
+        column.insert_data(parsed.value(), parsed.size());
+    }
+    return Status::OK();
+}
+
+// Reads rows [start, end) of `arrow_array` into `column`.
+//
+// Supported Arrow types: STRING/BINARY, LARGE_STRING/LARGE_BINARY and FIXED_SIZE_BINARY. Each
+// non-null value is parsed with JsonBinaryValue::from_json_string; null rows are inserted as
+// the column default. Any other Arrow type yields Status::InvalidArgument, and a parse failure
+// is returned as-is (rows appended before the failure stay in `column`).
+//
+// `ctz` is part of the common serde signature and is not used here.
+Status DataTypeJsonbSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array,
+                                                  int64_t start, int64_t end,
+                                                  const cctz::time_zone& ctz) const {
+    const auto type_id = arrow_array->type_id();
+
+    if (type_id == arrow::Type::STRING || type_id == arrow::Type::BINARY) {
+        // arrow::StringArray derives from arrow::BinaryArray, so one cast serves both.
+        const auto* binary_array = dynamic_cast<const arrow::BinaryArray*>(arrow_array);
+        return append_jsonb_from_var_width_arrow(column, *binary_array, start, end);
+    }
+
+    if (type_id == arrow::Type::LARGE_STRING || type_id == arrow::Type::LARGE_BINARY) {
+        const auto* large_array = dynamic_cast<const arrow::LargeBinaryArray*>(arrow_array);
+        return append_jsonb_from_var_width_arrow(column, *large_array, start, end);
+    }
+
+    if (type_id == arrow::Type::FIXED_SIZE_BINARY) {
+        const auto* fixed_array = dynamic_cast<const arrow::FixedSizeBinaryArray*>(arrow_array);
+        const auto width = static_cast<uint32_t>(fixed_array->byte_width());
+
+        JsonBinaryValue parsed;
+        // Iterate over absolute row indexes so the validity check and the value lookup always
+        // refer to the same row, including when start != 0.
+        for (int64_t row = start; row < end; ++row) {
+            if (fixed_array->IsNull(row)) {
+                column.insert_default();
+                continue;
+            }
+            const auto* json_text = reinterpret_cast<const char*>(fixed_array->GetValue(row));
+            RETURN_IF_ERROR(parsed.from_json_string(json_text, width));
+            column.insert_data(parsed.value(), parsed.size());
+        }
+        return Status::OK();
+    }
+
+    return Status::InvalidArgument("Unsupported arrow type for json column: {}", type_id);
+}
+
Status DataTypeJsonbSerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.h
b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
index 8df44f5a454..95c60ae583b 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.h
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
@@ -47,6 +47,8 @@ public:
Status write_column_to_arrow(const IColumn& column, const NullMap*
null_map,
arrow::ArrayBuilder* array_builder, int64_t
start, int64_t end,
const cctz::time_zone& ctz) const override;
+ // Appends rows [start, end) of `arrow_array` to `column`. The definition in
+ // data_type_jsonb_serde.cpp accepts STRING/BINARY, LARGE_STRING/LARGE_BINARY and
+ // FIXED_SIZE_BINARY arrays, passes each non-null value to
+ // JsonBinaryValue::from_json_string, inserts a default value for null rows and
+ // returns InvalidArgument for any other Arrow type. `ctz` is not used by it.
+ Status read_column_from_arrow(IColumn& column, const arrow::Array*
arrow_array, int64_t start,
+ int64_t end, const cctz::time_zone& ctz)
const override;
Status serialize_one_cell_to_json(const IColumn& column, int64_t row_num,
BufferWritable& bw,
FormatOptions& options) const override;
diff --git a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
index 1bb52c8d851..8238d7fdf03 100644
--- a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
@@ -17,6 +17,8 @@
#include "vec/data_types/serde/data_type_jsonb_serde.h"
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
@@ -218,4 +220,51 @@ TEST_F(DataTypeJsonbSerDeTest, serdes) {
test_func(*serde_jsonb, column_jsonb);
}
+// Feeds read_column_from_arrow a StringArray whose value and offset buffers both start at an
+// odd address. Run with UBSan enabled to catch misaligned loads of the int32 offsets.
+TEST_F(DataTypeJsonbSerDeTest, ArrowMemNotAligned) {
+    // 1. Prepare the data: five small JSON documents and their cumulative offsets.
+    const std::vector<std::string> strings = {"{\"k1\":\"v1\"}", "{\"k2\":\"v2\"}",
+                                              "{\"k3\":\"v3\"}", "{\"k4\":\"v4\"}",
+                                              "{\"k5\":\"v5\"}"};
+
+    int32_t total_length = 0;
+    std::vector<int32_t> offsets = {0};
+    for (const auto& str : strings) {
+        total_length += static_cast<int32_t>(str.length());
+        offsets.push_back(total_length);
+    }
+
+    // 2. Create unaligned memory: over-allocate, then start one byte into each allocation.
+    std::vector<uint8_t> value_storage(total_length + 10);
+    std::vector<uint8_t> offset_storage((strings.size() + 1) * sizeof(int32_t) + 10);
+
+    uint8_t* unaligned_value_data = value_storage.data() + 1;
+    uint8_t* unaligned_offset_data = offset_storage.data() + 1;
+
+    // 3. Copy the values and offsets into the unaligned memory.
+    int32_t current_pos = 0;
+    for (const auto& str : strings) {
+        memcpy(unaligned_value_data + current_pos, str.data(), str.length());
+        current_pos += static_cast<int32_t>(str.length());
+    }
+
+    for (size_t i = 0; i < offsets.size(); ++i) {
+        memcpy(unaligned_offset_data + i * sizeof(int32_t), &offsets[i], sizeof(int32_t));
+    }
+
+    // 4. Create an Arrow array on top of the unaligned memory.
+    auto value_buffer = arrow::Buffer::Wrap(unaligned_value_data, total_length);
+    auto offset_buffer =
+            arrow::Buffer::Wrap(unaligned_offset_data, offsets.size() * sizeof(int32_t));
+    auto arr = std::make_shared<arrow::StringArray>(strings.size(), offset_buffer, value_buffer);
+
+    // Sanity check: the offsets pointer really is one byte past a 4-byte boundary.
+    const auto address = reinterpret_cast<uintptr_t>(arr->raw_value_offsets());
+    EXPECT_EQ(address % sizeof(int32_t), static_cast<uintptr_t>(1));
+
+    // 5. Read every row so that each unaligned offset is actually loaded, and check that one
+    //    column row was appended per Arrow row.
+    cctz::time_zone tz;
+    const size_t rows_before = column_jsonb->size();
+    auto st = serde_jsonb->read_column_from_arrow(*column_jsonb, arr.get(), 0,
+                                                  static_cast<int64_t>(strings.size()), tz);
+    EXPECT_TRUE(st.ok());
+    EXPECT_EQ(column_jsonb->size(), rows_before + strings.size());
+}
+
} // namespace doris::vectorized
\ No newline at end of file
diff --git
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
index 069600b28c1..bc8a9600539 100644
---
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
+++
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
@@ -1,9 +1,9 @@
-- This file is automatically generated. You should know what you did if you
want to edit this
-- !sql --
-2025-05-18T01:00 true -128 -32768 -2147483648
-9223372036854775808 -1234567890123456790 -123.456 -123456.789
-123457 -123456789012346 -1234567890123456789012345678 1970-01-01
A Hello Hello, Doris! ["apple", "banana", "orange"]
{"Emily":101, "age":25} {"f1":11, "f2":3.14, "f3":"Emily"}
-2025-05-18T02:00 \N \N \N \N \N \N \N
\N \N \N \N \N \N \N \N \N \N
\N \N
-2025-05-18T03:00 false 127 32767 2147483647
9223372036854775807 1234567890123456789 123.456 123456.789 123457
123456789012346 1234567890123456789012345678 9999-12-31
9999-12-31T23:59:59 [] {} {"f1":11,
"f2":3.14, "f3":"Emily"}
-2025-05-18T04:00 true 0 0 0 0 0 0.0
0.0 0 0 0 2023-10-01 2023-10-01T12:34:56 A
Hello Hello, Doris! ["apple", "banana", "orange"] {"Emily":101, "age":25}
{"f1":11, "f2":3.14, "f3":"Emily"}
+2025-05-18T01:00 true -128 -32768 -2147483648
-9223372036854775808 -1234567890123456790 -123.456 -123456.789
-123457 -123456789012346 -1234567890123456789012345678 1970-01-01
A Hello Hello, Doris! ["apple", "banana", "orange"]
{"Emily":101, "age":25} {"f1":11, "f2":3.14, "f3":"Emily"}
{"k1":"v31","k2":300,"k3":[123,456],"k4":[],"k5":{"i1":"iv1"}}
+2025-05-18T02:00 \N \N \N \N \N \N \N
\N \N \N \N \N \N \N \N \N \N
\N \N \N
+2025-05-18T03:00 false 127 32767 2147483647
9223372036854775807 1234567890123456789 123.456 123456.789 123457
123456789012346 1234567890123456789012345678 9999-12-31
9999-12-31T23:59:59 [] {} {"f1":11,
"f2":3.14, "f3":"Emily"} {}
+2025-05-18T04:00 true 0 0 0 0 0 0.0
0.0 0 0 0 2023-10-01 2023-10-01T12:34:56 A
Hello Hello, Doris! ["apple", "banana", "orange"] {"Emily":101, "age":25}
{"f1":11, "f2":3.14, "f3":"Emily"} []
-- !sql --
2025-05-18T01:00 [1] [-128] [-32768] [-2147483648]
[-9223372036854775808] [-1234567890123456790] [-123.456] [-123456.789]
[-123457] [-123456789012346] [-1234567890123456789012345678]
["0000-01-01"] [""] ["A"] ["Hello"] ["Hello, Doris!"]
diff --git
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
index 5133dffcb91..051047ef631 100644
---
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
+++
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
@@ -1,6 +1,6 @@
-- This file is automatically generated. You should know what you did if you
want to edit this
-- !sql --
-test_remote_doris_all_types_t1 CREATE TABLE `test_remote_doris_all_types_t1`
(\n `id` datetimev2(3) NOT NULL,\n `c_boolean` boolean NULL DEFAULT "true",\n
`c_tinyint` tinyint NULL DEFAULT "1",\n `c_smallint` smallint NULL DEFAULT
"1",\n `c_int` int NULL DEFAULT "1",\n `c_bigint` bigint NULL DEFAULT "1",\n
`c_largeint` largeint NULL DEFAULT "1",\n `c_float` float NULL DEFAULT "1",\n
`c_double` double NULL DEFAULT "1",\n `c_decimal9` decimalv3(9,0) NULL DEFAULT
"1",\n `c_decimal [...]
+test_remote_doris_all_types_t1 CREATE TABLE `test_remote_doris_all_types_t1`
(\n `id` datetimev2(3) NOT NULL,\n `c_boolean` boolean NULL DEFAULT "true",\n
`c_tinyint` tinyint NULL DEFAULT "1",\n `c_smallint` smallint NULL DEFAULT
"1",\n `c_int` int NULL DEFAULT "1",\n `c_bigint` bigint NULL DEFAULT "1",\n
`c_largeint` largeint NULL DEFAULT "1",\n `c_float` float NULL DEFAULT "1",\n
`c_double` double NULL DEFAULT "1",\n `c_decimal9` decimalv3(9,0) NULL DEFAULT
"1",\n `c_decimal [...]
-- !sql --
id datetime(3) No true \N
@@ -23,6 +23,7 @@ c_string text Yes false d NONE
c_array_s array<text> Yes false \N NONE
c_map map<text,int> Yes false \N NONE
c_struct struct<f1:int,f2:float,f3:text> Yes false \N NONE
+c_json json Yes false \N NONE
-- !sql --
test_remote_doris_all_types_t2 CREATE TABLE `test_remote_doris_all_types_t2`
(\n `id` datetimev2(3) NOT NULL,\n `a_boolean` array<boolean> NULL,\n
`a_tinyint` array<tinyint> NULL,\n `a_smallint` array<smallint> NULL,\n
`a_int` array<int> NULL,\n `a_bigint` array<bigint> NULL,\n `a_largeint`
array<largeint> NULL,\n `a_float` array<float> NULL,\n `a_double`
array<double> NULL,\n `a_decimal9` array<decimalv3(9,0)> NULL,\n
`a_decimal18` array<decimalv3(18,0)> NULL,\n `a_decimal32 [...]
diff --git
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
index ef5bfcae8f6..31530d7bfc1 100644
---
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
+++
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
@@ -58,6 +58,7 @@ suite("test_remote_doris_all_types_select",
"p0,external,doris,external_docker,e
`c_array_s` array<text> NULL,
`c_map` MAP<STRING, INT> NULL,
`c_struct` STRUCT<f1:INT,f2:FLOAT,f3:STRING> NULL,
+ `c_json` JSON NULL,
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
@@ -67,16 +68,16 @@ suite("test_remote_doris_all_types_select",
"p0,external,doris,external_docker,e
"""
sql """
- INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 01:00:00.000', true, -128, -32768, -2147483648,
-9223372036854775808, -1234567890123456790, -123.456, -123456.789, -123457,
-123456789012346, -1234567890123456789012345678, '1970-01-01', '0000-01-01
00:00:00', 'A', 'Hello', 'Hello, Doris!', '["apple", "banana", "orange"]',
{"Emily":101,"age":25} , {11, 3.14, "Emily"})
+ INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 01:00:00.000', true, -128, -32768, -2147483648,
-9223372036854775808, -1234567890123456790, -123.456, -123456.789, -123457,
-123456789012346, -1234567890123456789012345678, '1970-01-01', '0000-01-01
00:00:00', 'A', 'Hello', 'Hello, Doris!', '["apple", "banana", "orange"]',
{"Emily":101,"age":25} , {11, 3.14, "Emily"}, '{"k1":"v31", "k2": 300, "k3":
[123, 456], "k4": [], " [...]
"""
sql """
- INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 02:00:00.000', NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+ INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 02:00:00.000', NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
"""
sql """
- INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 03:00:00.000', false, 127, 32767, 2147483647,
9223372036854775807, 1234567890123456789, 123.456, 123456.789, 123457,
123456789012346, 1234567890123456789012345678, '9999-12-31', '9999-12-31
23:59:59', '', '', '', [], {}, {11, 3.14, "Emily"})
+ INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 03:00:00.000', false, 127, 32767, 2147483647,
9223372036854775807, 1234567890123456789, 123.456, 123456.789, 123457,
123456789012346, 1234567890123456789012345678, '9999-12-31', '9999-12-31
23:59:59', '', '', '', [], {}, {11, 3.14, "Emily"}, '{}')
"""
sql """
- INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 04:00:00.000', true, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
'2023-10-01', '2023-10-01 12:34:56', 'A', 'Hello', 'Hello, Doris!', '["apple",
"banana", "orange"]', {"Emily":101,"age":25} , {11, 3.14, "Emily"});
+ INSERT INTO
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t`
values('2025-05-18 04:00:00.000', true, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
'2023-10-01', '2023-10-01 12:34:56', 'A', 'Hello', 'Hello, Doris!', '["apple",
"banana", "orange"]', {"Emily":101,"age":25} , {11, 3.14, "Emily"}, '[]');
"""
sql """
diff --git
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
index 5438617e09c..c80c2785ea4 100644
---
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
+++
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
@@ -58,6 +58,7 @@ suite("test_remote_doris_all_types_show",
"p0,external,doris,external_docker,ext
`c_array_s` array<text> NULL,
`c_map` MAP<STRING, INT> NULL,
`c_struct` STRUCT<f1:INT,f2:FLOAT,f3:STRING> NULL,
+ `c_json` JSON NULL,
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]