This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 119a5dafb36 branch-4.0: [Feature](serd)Support read column from json 
arrow. #58429 (#58757)
119a5dafb36 is described below

commit 119a5dafb36b09d3f1cc7f63afa321bd5bdd69a4
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Dec 9 17:27:27 2025 +0800

    branch-4.0: [Feature](serd)Support read column from json arrow. #58429 
(#58757)
    
    Cherry-picked from #58429
    
    Co-authored-by: Chen768959 <[email protected]>
---
 .../vec/data_types/serde/data_type_jsonb_serde.cpp | 70 ++++++++++++++++++++++
 .../vec/data_types/serde/data_type_jsonb_serde.h   |  2 +
 .../serde/data_type_jsonb_serde_test.cpp           | 49 +++++++++++++++
 .../test_remote_doris_all_types_select.out         |  8 +--
 .../test_remote_doris_all_types_show.out           |  3 +-
 .../test_remote_doris_all_types_select.groovy      |  9 +--
 .../test_remote_doris_all_types_show.groovy        |  1 +
 7 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp 
b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
index 53afaf24cfc..9e5ce5ae68a 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
@@ -120,6 +120,76 @@ Status DataTypeJsonbSerDe::write_column_to_arrow(const 
IColumn& column, const Nu
     return Status::OK();
 }
 
+// Appends rows [start, end) of `arrow_array` to `column`.
+//
+// Every non-null row is treated as JSON text, parsed with
+// JsonBinaryValue::from_json_string() and the resulting bytes are inserted into
+// `column`; null rows are inserted as default values.
+//
+// Handled Arrow types: STRING, BINARY, FIXED_SIZE_BINARY, LARGE_STRING and
+// LARGE_BINARY. Any other type yields Status::InvalidArgument. A JSON parse failure
+// is returned immediately; rows appended before the failure stay in `column`.
+//
+// `start` / `end` are row indices relative to `arrow_array` (the same indexing that
+// IsNull() uses). `ctz` is not used by this implementation.
+Status DataTypeJsonbSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array,
+                                                  int64_t start, int64_t end,
+                                                  const cctz::time_zone& ctz) const {
+    if (arrow_array->type_id() == arrow::Type::STRING ||
+        arrow_array->type_id() == arrow::Type::BINARY) {
+        const auto* concrete_array = dynamic_cast<const arrow::BinaryArray*>(arrow_array);
+        std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
+
+        // value_offsets() exposes the raw offsets buffer and does not account for the
+        // array's slice offset, whereas IsNull() below does. Skip to this array's first
+        // offset entry so both are indexed by the same logical row.
+        const size_t offset_size = sizeof(int32_t);
+        const uint8_t* offsets_data =
+                concrete_array->value_offsets()->data() + concrete_array->offset() * offset_size;
+
+        JsonBinaryValue value;
+        for (auto offset_i = start; offset_i < end; ++offset_i) {
+            if (!concrete_array->IsNull(offset_i)) {
+                // The offsets buffer may not be 4-byte aligned, so copy each entry out
+                // with memcpy instead of dereferencing it as int32_t.
+                int32_t start_offset = 0;
+                int32_t end_offset = 0;
+                memcpy(&start_offset, offsets_data + offset_i * offset_size, offset_size);
+                memcpy(&end_offset, offsets_data + (offset_i + 1) * offset_size, offset_size);
+
+                int32_t length = end_offset - start_offset;
+                const auto* raw_data = buffer->data() + start_offset;
+
+                RETURN_IF_ERROR(
+                        value.from_json_string(reinterpret_cast<const char*>(raw_data), length));
+                column.insert_data(value.value(), value.size());
+            } else {
+                column.insert_default();
+            }
+        }
+    } else if (arrow_array->type_id() == arrow::Type::FIXED_SIZE_BINARY) {
+        const auto* concrete_array = dynamic_cast<const arrow::FixedSizeBinaryArray*>(arrow_array);
+        uint32_t width = concrete_array->byte_width();
+
+        JsonBinaryValue value;
+        // Iterate over array row indices directly: the validity check and the value
+        // lookup must both use the same row, not an index relative to `start`. A signed
+        // index also keeps an empty or inverted range from wrapping to a huge bound.
+        for (auto offset_i = start; offset_i < end; ++offset_i) {
+            if (!concrete_array->IsNull(offset_i)) {
+                const auto* raw_data = concrete_array->GetValue(offset_i);
+
+                RETURN_IF_ERROR(
+                        value.from_json_string(reinterpret_cast<const char*>(raw_data), width));
+                column.insert_data(value.value(), value.size());
+            } else {
+                column.insert_default();
+            }
+        }
+    } else if (arrow_array->type_id() == arrow::Type::LARGE_STRING ||
+               arrow_array->type_id() == arrow::Type::LARGE_BINARY) {
+        const auto* concrete_array = dynamic_cast<const arrow::LargeBinaryArray*>(arrow_array);
+        std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
+
+        JsonBinaryValue value;
+        for (auto offset_i = start; offset_i < end; ++offset_i) {
+            if (!concrete_array->IsNull(offset_i)) {
+                // value_offset() / value_length() are queried per row on the array itself.
+                const auto* raw_data = buffer->data() + concrete_array->value_offset(offset_i);
+
+                RETURN_IF_ERROR(value.from_json_string(reinterpret_cast<const char*>(raw_data),
+                                                       concrete_array->value_length(offset_i)));
+                column.insert_data(value.value(), value.size());
+            } else {
+                column.insert_default();
+            }
+        }
+    } else {
+        return Status::InvalidArgument("Unsupported arrow type for json column: {}",
+                                       arrow_array->type_id());
+    }
+    return Status::OK();
+}
+
 Status DataTypeJsonbSerDe::write_column_to_orc(const std::string& timezone, 
const IColumn& column,
                                                const NullMap* null_map,
                                                orc::ColumnVectorBatch* 
orc_col_batch, int64_t start,
diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.h 
b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
index 8df44f5a454..95c60ae583b 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.h
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
@@ -47,6 +47,8 @@ public:
     Status write_column_to_arrow(const IColumn& column, const NullMap* 
null_map,
                                  arrow::ArrayBuilder* array_builder, int64_t 
start, int64_t end,
                                  const cctz::time_zone& ctz) const override;
+    Status read_column_from_arrow(IColumn& column, const arrow::Array* 
arrow_array, int64_t start,
+                                  int64_t end, const cctz::time_zone& ctz) 
const override;
 
     Status serialize_one_cell_to_json(const IColumn& column, int64_t row_num, 
BufferWritable& bw,
                                       FormatOptions& options) const override;
diff --git a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp 
b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
index 1bb52c8d851..8238d7fdf03 100644
--- a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
@@ -17,6 +17,8 @@
 
 #include "vec/data_types/serde/data_type_jsonb_serde.h"
 
+#include <arrow/api.h>
+#include <cctz/time_zone.h>
 #include <gtest/gtest-message.h>
 #include <gtest/gtest-test-part.h>
 #include <gtest/gtest.h>
@@ -218,4 +220,51 @@ TEST_F(DataTypeJsonbSerDeTest, serdes) {
     test_func(*serde_jsonb, column_jsonb);
 }
 
+// Checks that read_column_from_arrow() can read a StringArray whose value and offset
+// buffers start at an address that is not 4-byte aligned.
+// Run with UBSan enabled to catch misalignment errors.
+TEST_F(DataTypeJsonbSerDeTest, ArrowMemNotAligned) {
+    // 1. Prepare the data.
+    const std::vector<std::string> strings = {"{\"k1\":\"v1\"}", "{\"k2\":\"v2\"}", "{\"k3\":\"v3\"}",
+                                              "{\"k4\":\"v4\"}", "{\"k5\":\"v5\"}"};
+
+    int32_t total_length = 0;
+    std::vector<int32_t> offsets = {0};
+    for (const auto& str : strings) {
+        total_length += static_cast<int32_t>(str.length());
+        offsets.push_back(total_length);
+    }
+
+    // 2. Create unaligned memory: over-allocate, then shift the start by one byte.
+    std::vector<uint8_t> value_storage(total_length + 10);
+    std::vector<uint8_t> offset_storage((strings.size() + 1) * sizeof(int32_t) + 10);
+
+    uint8_t* unaligned_value_data = value_storage.data() + 1;
+    uint8_t* unaligned_offset_data = offset_storage.data() + 1;
+
+    // 3. Copy data to unaligned memory. offsets[i] is the start position of strings[i].
+    for (size_t i = 0; i < strings.size(); ++i) {
+        memcpy(unaligned_value_data + offsets[i], strings[i].data(), strings[i].length());
+    }
+    memcpy(unaligned_offset_data, offsets.data(), offsets.size() * sizeof(int32_t));
+
+    // 4. Create Arrow array with unaligned memory
+    auto value_buffer = arrow::Buffer::Wrap(unaligned_value_data, total_length);
+    auto offset_buffer =
+            arrow::Buffer::Wrap(unaligned_offset_data, offsets.size() * sizeof(int32_t));
+    auto arr = std::make_shared<arrow::StringArray>(strings.size(), offset_buffer, value_buffer);
+
+    // Sanity check: the offsets really are misaligned, otherwise the test proves nothing.
+    const auto address = reinterpret_cast<uintptr_t>(arr->raw_value_offsets());
+    EXPECT_EQ(address % 4, 1U);
+
+    // 5. Test read_column_from_arrow on every row, so that every unaligned offset entry
+    // is read, and verify that one value per row was appended.
+    const size_t rows_before = column_jsonb->size();
+    cctz::time_zone tz;
+    auto st = serde_jsonb->read_column_from_arrow(*column_jsonb, arr.get(), 0,
+                                                  static_cast<int64_t>(strings.size()), tz);
+    EXPECT_TRUE(st.ok());
+    EXPECT_EQ(column_jsonb->size(), rows_before + strings.size());
+}
+
 } // namespace doris::vectorized
\ No newline at end of file
diff --git 
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
 
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
index 069600b28c1..bc8a9600539 100644
--- 
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
+++ 
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_select.out
@@ -1,9 +1,9 @@
 -- This file is automatically generated. You should know what you did if you 
want to edit this
 -- !sql --
-2025-05-18T01:00       true    -128    -32768  -2147483648     
-9223372036854775808    -1234567890123456790    -123.456        -123456.789     
-123457 -123456789012346        -1234567890123456789012345678   1970-01-01      
        A       Hello   Hello, Doris!   ["apple", "banana", "orange"]   
{"Emily":101, "age":25} {"f1":11, "f2":3.14, "f3":"Emily"}
-2025-05-18T02:00       \N      \N      \N      \N      \N      \N      \N      
\N      \N      \N      \N      \N      \N      \N      \N      \N      \N      
\N      \N
-2025-05-18T03:00       false   127     32767   2147483647      
9223372036854775807     1234567890123456789     123.456 123456.789      123457  
123456789012346 1234567890123456789012345678    9999-12-31      
9999-12-31T23:59:59                             []      {}      {"f1":11, 
"f2":3.14, "f3":"Emily"}
-2025-05-18T04:00       true    0       0       0       0       0       0.0     
0.0     0       0       0       2023-10-01      2023-10-01T12:34:56     A       
Hello   Hello, Doris!   ["apple", "banana", "orange"]   {"Emily":101, "age":25} 
{"f1":11, "f2":3.14, "f3":"Emily"}
+2025-05-18T01:00       true    -128    -32768  -2147483648     
-9223372036854775808    -1234567890123456790    -123.456        -123456.789     
-123457 -123456789012346        -1234567890123456789012345678   1970-01-01      
        A       Hello   Hello, Doris!   ["apple", "banana", "orange"]   
{"Emily":101, "age":25} {"f1":11, "f2":3.14, "f3":"Emily"}      
{"k1":"v31","k2":300,"k3":[123,456],"k4":[],"k5":{"i1":"iv1"}}
+2025-05-18T02:00       \N      \N      \N      \N      \N      \N      \N      
\N      \N      \N      \N      \N      \N      \N      \N      \N      \N      
\N      \N      \N
+2025-05-18T03:00       false   127     32767   2147483647      
9223372036854775807     1234567890123456789     123.456 123456.789      123457  
123456789012346 1234567890123456789012345678    9999-12-31      
9999-12-31T23:59:59                             []      {}      {"f1":11, 
"f2":3.14, "f3":"Emily"}      {}
+2025-05-18T04:00       true    0       0       0       0       0       0.0     
0.0     0       0       0       2023-10-01      2023-10-01T12:34:56     A       
Hello   Hello, Doris!   ["apple", "banana", "orange"]   {"Emily":101, "age":25} 
{"f1":11, "f2":3.14, "f3":"Emily"}      []
 
 -- !sql --
 2025-05-18T01:00       [1]     [-128]  [-32768]        [-2147483648]   
[-9223372036854775808]  [-1234567890123456790]  [-123.456]      [-123456.789]   
[-123457]       [-123456789012346]      [-1234567890123456789012345678] 
["0000-01-01"]  [""]    ["A"]   ["Hello"]       ["Hello, Doris!"]
diff --git 
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
 
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
index 5133dffcb91..051047ef631 100644
--- 
a/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
+++ 
b/regression-test/data/external_table_p0/remote_doris/test_remote_doris_all_types_show.out
@@ -1,6 +1,6 @@
 -- This file is automatically generated. You should know what you did if you 
want to edit this
 -- !sql --
-test_remote_doris_all_types_t1 CREATE TABLE `test_remote_doris_all_types_t1` 
(\n  `id` datetimev2(3) NOT NULL,\n  `c_boolean` boolean NULL DEFAULT "true",\n 
 `c_tinyint` tinyint NULL DEFAULT "1",\n  `c_smallint` smallint NULL DEFAULT 
"1",\n  `c_int` int NULL DEFAULT "1",\n  `c_bigint` bigint NULL DEFAULT "1",\n  
`c_largeint` largeint NULL DEFAULT "1",\n  `c_float` float NULL DEFAULT "1",\n  
`c_double` double NULL DEFAULT "1",\n  `c_decimal9` decimalv3(9,0) NULL DEFAULT 
"1",\n  `c_decimal [...]
+test_remote_doris_all_types_t1 CREATE TABLE `test_remote_doris_all_types_t1` 
(\n  `id` datetimev2(3) NOT NULL,\n  `c_boolean` boolean NULL DEFAULT "true",\n 
 `c_tinyint` tinyint NULL DEFAULT "1",\n  `c_smallint` smallint NULL DEFAULT 
"1",\n  `c_int` int NULL DEFAULT "1",\n  `c_bigint` bigint NULL DEFAULT "1",\n  
`c_largeint` largeint NULL DEFAULT "1",\n  `c_float` float NULL DEFAULT "1",\n  
`c_double` double NULL DEFAULT "1",\n  `c_decimal9` decimalv3(9,0) NULL DEFAULT 
"1",\n  `c_decimal [...]
 
 -- !sql --
 id     datetime(3)     No      true    \N      
@@ -23,6 +23,7 @@ c_string      text    Yes     false   d       NONE
 c_array_s      array<text>     Yes     false   \N      NONE
 c_map  map<text,int>   Yes     false   \N      NONE
 c_struct       struct<f1:int,f2:float,f3:text> Yes     false   \N      NONE
+c_json json    Yes     false   \N      NONE
 
 -- !sql --
 test_remote_doris_all_types_t2 CREATE TABLE `test_remote_doris_all_types_t2` 
(\n  `id` datetimev2(3) NOT NULL,\n  `a_boolean` array<boolean> NULL,\n  
`a_tinyint` array<tinyint> NULL,\n  `a_smallint` array<smallint> NULL,\n  
`a_int` array<int> NULL,\n  `a_bigint` array<bigint> NULL,\n  `a_largeint` 
array<largeint> NULL,\n  `a_float` array<float> NULL,\n  `a_double` 
array<double> NULL,\n  `a_decimal9` array<decimalv3(9,0)> NULL,\n  
`a_decimal18` array<decimalv3(18,0)> NULL,\n  `a_decimal32 [...]
diff --git 
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
 
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
index ef5bfcae8f6..31530d7bfc1 100644
--- 
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
+++ 
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_select.groovy
@@ -58,6 +58,7 @@ suite("test_remote_doris_all_types_select", 
"p0,external,doris,external_docker,e
           `c_array_s` array<text> NULL,
           `c_map` MAP<STRING, INT> NULL,
           `c_struct` STRUCT<f1:INT,f2:FLOAT,f3:STRING>  NULL,
+          `c_json` JSON  NULL,
         ) ENGINE=OLAP
         DUPLICATE KEY(`id`)
         DISTRIBUTED BY HASH(`id`) BUCKETS 1
@@ -67,16 +68,16 @@ suite("test_remote_doris_all_types_select", 
"p0,external,doris,external_docker,e
     """
 
     sql """
-        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 01:00:00.000', true, -128, -32768, -2147483648, 
-9223372036854775808, -1234567890123456790, -123.456, -123456.789, -123457, 
-123456789012346, -1234567890123456789012345678, '1970-01-01', '0000-01-01 
00:00:00', 'A', 'Hello', 'Hello, Doris!', '["apple", "banana", "orange"]', 
{"Emily":101,"age":25} , {11, 3.14, "Emily"})
+        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 01:00:00.000', true, -128, -32768, -2147483648, 
-9223372036854775808, -1234567890123456790, -123.456, -123456.789, -123457, 
-123456789012346, -1234567890123456789012345678, '1970-01-01', '0000-01-01 
00:00:00', 'A', 'Hello', 'Hello, Doris!', '["apple", "banana", "orange"]', 
{"Emily":101,"age":25} , {11, 3.14, "Emily"}, '{"k1":"v31", "k2": 300, "k3": 
[123, 456], "k4": [], " [...]
     """
     sql """
-        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 02:00:00.000', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 02:00:00.000', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
     """
     sql """
-        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 03:00:00.000', false, 127, 32767, 2147483647, 
9223372036854775807, 1234567890123456789, 123.456, 123456.789, 123457, 
123456789012346, 1234567890123456789012345678, '9999-12-31', '9999-12-31 
23:59:59', '', '', '', [], {}, {11, 3.14, "Emily"})
+        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 03:00:00.000', false, 127, 32767, 2147483647, 
9223372036854775807, 1234567890123456789, 123.456, 123456.789, 123457, 
123456789012346, 1234567890123456789012345678, '9999-12-31', '9999-12-31 
23:59:59', '', '', '', [], {}, {11, 3.14, "Emily"}, '{}')
     """
     sql """
-        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 04:00:00.000', true, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
'2023-10-01', '2023-10-01 12:34:56', 'A', 'Hello', 'Hello, Doris!', '["apple", 
"banana", "orange"]', {"Emily":101,"age":25} , {11, 3.14, "Emily"});
+        INSERT INTO 
`test_remote_doris_all_types_select_db`.`test_remote_doris_all_types_select_t` 
values('2025-05-18 04:00:00.000', true, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
'2023-10-01', '2023-10-01 12:34:56', 'A', 'Hello', 'Hello, Doris!', '["apple", 
"banana", "orange"]', {"Emily":101,"age":25} , {11, 3.14, "Emily"}, '[]');
     """
 
     sql """
diff --git 
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
 
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
index 5438617e09c..c80c2785ea4 100644
--- 
a/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
+++ 
b/regression-test/suites/external_table_p0/remote_doris/test_remote_doris_all_types_show.groovy
@@ -58,6 +58,7 @@ suite("test_remote_doris_all_types_show", 
"p0,external,doris,external_docker,ext
           `c_array_s` array<text> NULL,
           `c_map` MAP<STRING, INT> NULL,
           `c_struct` STRUCT<f1:INT,f2:FLOAT,f3:STRING>  NULL,
+          `c_json` JSON  NULL,
         ) ENGINE=OLAP
         DUPLICATE KEY(`id`)
         DISTRIBUTED BY HASH(`id`) BUCKETS 1


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to