This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ca858fd4a87 [fix](be) Correct Arrow UTF8/String size limit (#63137)
ca858fd4a87 is described below

commit ca858fd4a8752d25cfef15a09302022f7cfa2a92
Author: Jerry Hu <[email protected]>
AuthorDate: Mon May 11 21:40:16 2026 +0800

    [fix](be) Correct Arrow UTF8/String size limit (#63137)
    
    Issue Number: None
    
    Related PR: None
    
    Problem Summary: Fix the Arrow UTF8 max size constant so it matches the
    documented 2G limit instead of a much smaller value.
    
    None
    
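    - Test: Regression test
        - arrow_flight_sql_p0/test_select is extended with JSONB cases
          (small values, NULL, and a ~2.1 MB value); the BE change itself is
          a header constant fix only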
    - Behavior changed: Yes (Arrow UTF8 size limit now matches the intended
    2G threshold)
    - Does this need documentation: No
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/format/arrow/arrow_row_batch.h              |  2 +-
 .../data/arrow_flight_sql_p0/test_select.out       |  8 ++++-
 .../suites/arrow_flight_sql_p0/test_select.groovy  | 36 +++++++++++++++++++++-
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/be/src/format/arrow/arrow_row_batch.h b/be/src/format/arrow/arrow_row_batch.h
index e0a37f6bf42..e7b77ed707b 100644
--- a/be/src/format/arrow/arrow_row_batch.h
+++ b/be/src/format/arrow/arrow_row_batch.h
@@ -39,7 +39,7 @@ class Schema;
 
 namespace doris {
 
-constexpr size_t MAX_ARROW_UTF8 = (1ULL << 21); // 2G
+constexpr size_t MAX_ARROW_UTF8 = (1ULL << 31); // 2G
 
 class RowDescriptor;
 
diff --git a/regression-test/data/arrow_flight_sql_p0/test_select.out b/regression-test/data/arrow_flight_sql_p0/test_select.out
index f2f4b86bbf5..62888cd3dfc 100644
--- a/regression-test/data/arrow_flight_sql_p0/test_select.out
+++ b/regression-test/data/arrow_flight_sql_p0/test_select.out
@@ -5,4 +5,10 @@
 -- !arrow_flight_sql_datetime --
 333    plsql333        2024-07-21 12:00:00.123456      2024-07-21 12:00:00.0
 222    plsql222        2024-07-20 12:00:00.123456      2024-07-20 12:00:00.0
-111    plsql111        2024-07-19 12:00:00.123456      2024-07-19 12:00:00.0
\ No newline at end of file
+111    plsql111        2024-07-19 12:00:00.123456      2024-07-19 12:00:00.0
+
+-- !arrow_flight_sql_jsonb --
+1      {"k1":1,"k2":"v2"}
+2      [1,2,{"nested":true}]
+3      \N
+
diff --git a/regression-test/suites/arrow_flight_sql_p0/test_select.groovy b/regression-test/suites/arrow_flight_sql_p0/test_select.groovy
index 950fb4af7e9..85f119fc2c3 100644
--- a/regression-test/suites/arrow_flight_sql_p0/test_select.groovy
+++ b/regression-test/suites/arrow_flight_sql_p0/test_select.groovy
@@ -26,7 +26,7 @@ suite("test_select", "arrow_flight_sql") {
     sql """INSERT INTO ${tableName} VALUES(222, "plsql222")"""
     sql """INSERT INTO ${tableName} VALUES(333, "plsql333")"""
     sql """INSERT INTO ${tableName} VALUES(111, "plsql333")"""
-    
+
     qt_arrow_flight_sql "select sum(id) as a, count(1) as b from ${tableName}"
 
     tableName = "test_select_datetime"
@@ -40,4 +40,38 @@ suite("test_select", "arrow_flight_sql") {
     sql """INSERT INTO ${tableName} VALUES(333, "plsql333","2024-07-21 12:00:00.123456","2024-07-21 12:00:00")"""
 
     qt_arrow_flight_sql_datetime "select * from ${tableName} order by id desc"
+
+    tableName = "test_select_jsonb"
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql """
+        create table ${tableName} (id int, payload jsonb) DUPLICATE key(`id`) distributed by hash (`id`) buckets 4
+        properties ("replication_num"="1");
+        """
+    sql """
+        INSERT INTO ${tableName} VALUES
+            (1, '{"k1": 1, "k2": "v2"}'),
+            (2, '[1, 2, {"nested": true}]'),
+            (3, NULL)
+        """
+
+    qt_arrow_flight_sql_jsonb "select id, payload from ${tableName} order by id"
+
+    def largeJsonValueSize = 2100000
+    sql """
+        INSERT INTO ${tableName}
+        SELECT 4, CAST(CONCAT('{"large":"', REPEAT('x', ${largeJsonValueSize}), '"}') AS JSONB)
+        """
+
+    // This row exceeds MAX_ARROW_UTF8 and exercises JSONB -> LargeString serialization.
+    def largeJsonbResult = arrow_flight_sql """
+        select payload, length(cast(payload as string)) from ${tableName} where id = 4
+        """
+    assertEquals(1, largeJsonbResult.size())
+    assertEquals(2, largeJsonbResult[0].size())
+    def expectedLargeJsonbSize = largeJsonValueSize + '{"large":""}'.length()
+    def largeJsonb = largeJsonbResult[0][0].toString()
+    assertEquals(expectedLargeJsonbSize, largeJsonb.length())
+    assertEquals(expectedLargeJsonbSize, (largeJsonbResult[0][1] as Number).intValue())
+    assertTrue(largeJsonb.startsWith('{"large":"'))
+    assertTrue(largeJsonb.endsWith('"}'))
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to