This is an automated email from the ASF dual-hosted git repository.

Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ac877e5cc4f [fix](iceberg) Write binary columns with proper Arrow types (#64949)
ac877e5cc4f is described below

commit ac877e5cc4f27de76753794a4948d55bd425bf0a
Author: Gabriel <[email protected]>
AuthorDate: Tue Jun 30 12:01:06 2026 +0800

    [fix](iceberg) Write binary columns with proper Arrow types (#64949)
    
    Doris used the Iceberg schema to build the Arrow schema for the Parquet
    writer, but mapped Iceberg BINARY/FIXED/UUID to Arrow utf8. This made
    Doris write binary Iceberg columns with string semantics, which could
    break Spark/Iceberg vectorized reads with the error
    "Unsupported type: binary".
    
    Map Iceberg BINARY to Arrow binary, FIXED(n) to fixed-size binary(n),
    and UUID to fixed-size binary(16). Add FixedType length access for
    schema conversion.
    
    Also allow STRING and VARBINARY columns to write into Arrow
    binary/fixed-size binary builders so existing Doris-side mappings can
    still produce the correct Iceberg Parquet schema. Fixed-size binary
    writes validate byte width before appending.
    
    Add ArrowSchemaUtil coverage for STRING/BINARY/FIXED/UUID mapping.
---
 .../data_type_serde/data_type_string_serde.cpp     | 118 +++++++++++++++++++
 .../data_type_serde/data_type_varbinary_serde.cpp  |  35 ++++++
 be/src/format/arrow/arrow_block_convertor.cpp      | 131 ++++++++++++++++++++-
 be/src/format/table/iceberg/arrow_schema_util.cpp  |  19 ++-
 be/src/format/table/iceberg/arrow_schema_util.h    |   1 +
 be/src/format/table/iceberg/types.h                |   2 +
 .../data_type_serde/data_type_serde_arrow_test.cpp | 108 +++++++++++++++++
 .../data_type_serde_varbinary_test.cpp             |  18 +++
 .../table/iceberg/arrow_schema_util_test.cpp       |  28 +++++
 9 files changed, 453 insertions(+), 7 deletions(-)

diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp b/be/src/core/data_type_serde/data_type_string_serde.cpp
index ff340de9dcf..dc7667fefca 100644
--- a/be/src/core/data_type_serde/data_type_string_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_string_serde.cpp
@@ -17,6 +17,9 @@
 
 #include "core/data_type_serde/data_type_string_serde.h"
 
+#include <array>
+#include <cstring>
+
 #include "core/column/column_string.h"
 #include "core/data_type/define_primitive_type.h"
 #include "util/jsonb_document_cast.h"
@@ -25,6 +28,88 @@
 
 namespace doris {
 
+namespace {
+
+int hex_value(char c) {
+    if (c >= '0' && c <= '9') {
+        return c - '0';
+    }
+    if (c >= 'a' && c <= 'f') {
+        return c - 'a' + 10;
+    }
+    if (c >= 'A' && c <= 'F') {
+        return c - 'A' + 10;
+    }
+    return -1;
+}
+
+Status parse_uuid_to_bytes(StringRef uuid, std::array<uint8_t, 16>* bytes) {
+    if (uuid.size != 32 && uuid.size != 36) {
+        return Status::InvalidArgument("Invalid UUID string length: {}", 
uuid.size);
+    }
+
+    int hex_count = 0;
+    int high_nibble = -1;
+    int byte_index = 0;
+    for (size_t i = 0; i < uuid.size; ++i) {
+        char c = uuid.data[i];
+        if (uuid.size == 36 && (i == 8 || i == 13 || i == 18 || i == 23)) {
+            if (c != '-') {
+                return Status::InvalidArgument("Invalid UUID string format");
+            }
+            continue;
+        }
+        if (c == '-') {
+            return Status::InvalidArgument("Invalid UUID string format");
+        }
+
+        int value = hex_value(c);
+        if (value < 0) {
+            return Status::InvalidArgument("Invalid UUID string format");
+        }
+        if (hex_count % 2 == 0) {
+            high_nibble = value;
+        } else {
+            (*bytes)[byte_index++] = static_cast<uint8_t>((high_nibble << 4) | 
value);
+        }
+        ++hex_count;
+    }
+
+    if (hex_count != 32 || byte_index != 16) {
+        return Status::InvalidArgument("Invalid UUID string format");
+    }
+    return Status::OK();
+}
+
+Status append_fixed_size_binary(arrow::FixedSizeBinaryBuilder& builder, const 
IColumn& column,
+                                StringRef string_ref, int byte_width, bool 
pad_char_value,
+                                bool convert_uuid_string) {
+    if (convert_uuid_string && byte_width == 16 &&
+        (string_ref.size == 32 || string_ref.size == 36)) {
+        std::array<uint8_t, 16> bytes;
+        RETURN_IF_ERROR(parse_uuid_to_bytes(string_ref, &bytes));
+        return checkArrowStatus(builder.Append(bytes.data()), column, builder);
+    }
+
+    if (string_ref.size == byte_width) {
+        return checkArrowStatus(builder.Append(reinterpret_cast<const 
uint8_t*>(string_ref.data)),
+                                column, builder);
+    }
+
+    if (pad_char_value && string_ref.size < byte_width) {
+        std::string padded_value(byte_width, '\0');
+        std::memcpy(padded_value.data(), string_ref.data, string_ref.size);
+        return checkArrowStatus(
+                builder.Append(reinterpret_cast<const 
uint8_t*>(padded_value.data())), column,
+                builder);
+    }
+
+    return Status::InvalidArgument("Fixed size binary column expects {} bytes, 
got {}", byte_width,
+                                   string_ref.size);
+}
+
+} // namespace
+
 template <typename ColumnType>
 Status DataTypeStringSerDeBase<ColumnType>::serialize_column_to_json(const 
IColumn& column,
                                                                      int64_t 
start_idx,
@@ -243,9 +328,42 @@ Status DataTypeStringSerDeBase<ColumnType>::write_column_to_arrow(
     if (array_builder->type()->id() == arrow::Type::LARGE_STRING) {
         auto& builder = 
assert_cast<arrow::LargeStringBuilder&>(*array_builder);
         return write_column_to_arrow_impl(column, null_map, builder, start, 
end);
+    } else if (array_builder->type()->id() == arrow::Type::LARGE_BINARY) {
+        auto& builder = 
assert_cast<arrow::LargeBinaryBuilder&>(*array_builder);
+        const auto& string_column = assert_cast<const ColumnType&>(column);
+        for (size_t string_i = start; string_i < end; ++string_i) {
+            if (null_map && (*null_map)[string_i]) {
+                RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column, 
builder));
+                continue;
+            }
+            auto string_ref = string_column.get_data_at(string_i);
+            RETURN_IF_ERROR(checkArrowStatus(
+                    builder.Append(reinterpret_cast<const 
uint8_t*>(string_ref.data),
+                                   cast_set<int64_t, size_t, 
false>(string_ref.size)),
+                    column, builder));
+        }
+        return Status::OK();
     } else if (array_builder->type()->id() == arrow::Type::STRING) {
         auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
         return write_column_to_arrow_impl(column, null_map, builder, start, 
end);
+    } else if (array_builder->type()->id() == arrow::Type::BINARY) {
+        auto& builder = assert_cast<arrow::BinaryBuilder&>(*array_builder);
+        return write_column_to_arrow_impl(column, null_map, builder, start, 
end);
+    } else if (array_builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY) {
+        auto& builder = 
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+        const int byte_width =
+                static_cast<const 
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+        const auto& string_column = assert_cast<const ColumnType&>(column);
+        for (size_t string_i = start; string_i < end; ++string_i) {
+            if (null_map && (*null_map)[string_i]) {
+                RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column, 
builder));
+                continue;
+            }
+            auto string_ref = string_column.get_data_at(string_i);
+            RETURN_IF_ERROR(append_fixed_size_binary(builder, column, 
string_ref, byte_width,
+                                                     _type == TYPE_CHAR, _type 
!= TYPE_CHAR));
+        }
+        return Status::OK();
     } else {
         return Status::InvalidArgument("Unsupported arrow type for string 
column: {}",
                                        array_builder->type()->name());
diff --git a/be/src/core/data_type_serde/data_type_varbinary_serde.cpp b/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
index 2b1069d3bbb..6aca595e84d 100644
--- a/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
@@ -70,9 +70,44 @@ Status DataTypeVarbinarySerDe::write_column_to_arrow(const IColumn& column, cons
     if (array_builder->type()->id() == arrow::Type::BINARY) {
         auto& builder = assert_cast<arrow::BinaryBuilder&>(*array_builder);
         return lambda_function(builder);
+    } else if (array_builder->type()->id() == arrow::Type::LARGE_BINARY) {
+        auto& builder = 
assert_cast<arrow::LargeBinaryBuilder&>(*array_builder);
+        const auto& varbinary_column_data = assert_cast<const 
ColumnVarbinary&>(column).get_data();
+        for (size_t i = start; i < end; ++i) {
+            if (null_map && (*null_map)[i]) {
+                RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column, 
builder));
+                continue;
+            }
+            const auto& string_view = varbinary_column_data[i];
+            RETURN_IF_ERROR(checkArrowStatus(
+                    builder.Append(reinterpret_cast<const 
uint8_t*>(string_view.data()),
+                                   cast_set<int64_t, size_t, 
false>(string_view.size())),
+                    column, builder));
+        }
+        return Status::OK();
     } else if (array_builder->type()->id() == arrow::Type::STRING) {
         auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
         return lambda_function(builder);
+    } else if (array_builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY) {
+        auto& builder = 
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+        const int byte_width =
+                static_cast<const 
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+        const auto& varbinary_column_data = assert_cast<const 
ColumnVarbinary&>(column).get_data();
+        for (size_t i = start; i < end; ++i) {
+            if (null_map && (*null_map)[i]) {
+                RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column, 
builder));
+                continue;
+            }
+            const auto& string_view = varbinary_column_data[i];
+            if (string_view.size() != byte_width) {
+                return Status::InvalidArgument("Fixed size binary column 
expects {} bytes, got {}",
+                                               byte_width, string_view.size());
+            }
+            RETURN_IF_ERROR(checkArrowStatus(
+                    builder.Append(reinterpret_cast<const 
uint8_t*>(string_view.data())), column,
+                    builder));
+        }
+        return Status::OK();
     } else {
         return Status::InvalidArgument("Unsupported arrow type for varbinary 
column: {}",
                                        array_builder->type()->name());
diff --git a/be/src/format/arrow/arrow_block_convertor.cpp b/be/src/format/arrow/arrow_block_convertor.cpp
index 91593898ac5..d6a4735c73a 100644
--- a/be/src/format/arrow/arrow_block_convertor.cpp
+++ b/be/src/format/arrow/arrow_block_convertor.cpp
@@ -26,11 +26,14 @@
 #include <arrow/status.h>
 #include <arrow/type.h>
 #include <arrow/util/decimal.h>
+#include <arrow/util/key_value_metadata.h>
 #include <arrow/visit_type_inline.h>
 #include <arrow/visitor.h>
 #include <cctz/time_zone.h>
 #include <glog/logging.h>
 
+#include <array>
+#include <cstring>
 #include <ctime>
 #include <memory>
 #include <utility>
@@ -39,6 +42,8 @@
 #include "common/status.h"
 #include "core/block/column_with_type_and_name.h"
 #include "core/column/column.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
 #include "core/data_type/data_type.h"
 #include "core/data_type/data_type_array.h"
 #include "core/data_type/data_type_nullable.h"
@@ -52,6 +57,115 @@ class Array;
 
 namespace doris {
 
+namespace {
+
+constexpr const char* ICEBERG_ORIGINAL_TYPE_KEY = "originalType";
+constexpr const char* ICEBERG_UUID_TYPE_VALUE = "uuid";
+
+bool is_iceberg_uuid_field(const std::shared_ptr<arrow::Field>& field) {
+    if (field == nullptr || !field->HasMetadata()) {
+        return false;
+    }
+    const auto result = field->metadata()->Get(ICEBERG_ORIGINAL_TYPE_KEY);
+    return result.ok() && result.ValueUnsafe() == ICEBERG_UUID_TYPE_VALUE;
+}
+
+int hex_value(char c) {
+    if (c >= '0' && c <= '9') {
+        return c - '0';
+    }
+    if (c >= 'a' && c <= 'f') {
+        return c - 'a' + 10;
+    }
+    if (c >= 'A' && c <= 'F') {
+        return c - 'A' + 10;
+    }
+    return -1;
+}
+
+Status parse_uuid_to_bytes(StringRef uuid, std::array<uint8_t, 16>* bytes) {
+    if (uuid.size == 16) {
+        std::memcpy(bytes->data(), uuid.data, bytes->size());
+        return Status::OK();
+    }
+    if (uuid.size != 32 && uuid.size != 36) {
+        return Status::InvalidArgument("Invalid UUID string length: {}", 
uuid.size);
+    }
+
+    int hex_count = 0;
+    int high_nibble = -1;
+    int byte_index = 0;
+    for (size_t i = 0; i < uuid.size; ++i) {
+        char c = uuid.data[i];
+        if (uuid.size == 36 && (i == 8 || i == 13 || i == 18 || i == 23)) {
+            if (c != '-') {
+                return Status::InvalidArgument("Invalid UUID string format");
+            }
+            continue;
+        }
+        if (c == '-') {
+            return Status::InvalidArgument("Invalid UUID string format");
+        }
+
+        int value = hex_value(c);
+        if (value < 0) {
+            return Status::InvalidArgument("Invalid UUID string format");
+        }
+        if (hex_count % 2 == 0) {
+            high_nibble = value;
+        } else {
+            (*bytes)[byte_index++] = static_cast<uint8_t>((high_nibble << 4) | 
value);
+        }
+        ++hex_count;
+    }
+
+    if (hex_count != 32 || byte_index != 16) {
+        return Status::InvalidArgument("Invalid UUID string format");
+    }
+    return Status::OK();
+}
+
+Status write_iceberg_uuid_string_column_to_arrow(const IColumn& column, const 
DataTypePtr& type,
+                                                 arrow::ArrayBuilder* 
array_builder, int64_t start,
+                                                 int64_t end) {
+    if (array_builder->type()->id() != arrow::Type::FIXED_SIZE_BINARY) {
+        return Status::InvalidArgument("Iceberg UUID must be written to fixed 
size binary");
+    }
+    const int byte_width =
+            static_cast<const 
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+    if (byte_width != 16) {
+        return Status::InvalidArgument("Iceberg UUID expects 16 bytes, got 
{}", byte_width);
+    }
+
+    auto& builder = 
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+    const IColumn* data_column = &column;
+    const NullMap* null_map = nullptr;
+    if (type->is_nullable()) {
+        const auto& nullable_column = assert_cast<const 
ColumnNullable&>(column);
+        data_column = &nullable_column.get_nested_column();
+        null_map = &nullable_column.get_null_map_data();
+    }
+    if (!data_column->is_column_string()) {
+        return Status::InvalidArgument(
+                "Iceberg UUID string conversion expects string column, got {}",
+                data_column->get_name());
+    }
+
+    const auto& string_column = assert_cast<const ColumnString&>(*data_column);
+    for (size_t row = start; row < end; ++row) {
+        if (null_map != nullptr && (*null_map)[row]) {
+            RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column, 
builder));
+            continue;
+        }
+        std::array<uint8_t, 16> bytes;
+        RETURN_IF_ERROR(parse_uuid_to_bytes(string_column.get_data_at(row), 
&bytes));
+        RETURN_IF_ERROR(checkArrowStatus(builder.Append(bytes.data()), column, 
builder));
+    }
+    return Status::OK();
+}
+
+} // namespace
+
 Status 
FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBatch>* 
out) {
     int num_fields = _schema->num_fields();
     if (_block.columns() != num_fields) {
@@ -80,8 +194,11 @@ Status FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBat
         _cur_type = _block.get_by_position(idx).type;
         auto column = _cur_col->convert_to_full_column_if_const();
         auto arrow_type = _schema->field(idx)->type();
-        if (arrow_type->name() == "utf8" && column->byte_size() >= 
MAX_ARROW_UTF8) {
+        if (arrow_type->id() == arrow::Type::STRING && column->byte_size() >= 
MAX_ARROW_UTF8) {
             arrow_type = arrow::large_utf8();
+        } else if (arrow_type->id() == arrow::Type::BINARY &&
+                   column->byte_size() >= MAX_ARROW_UTF8) {
+            arrow_type = arrow::large_binary();
         }
         std::unique_ptr<arrow::ArrayBuilder> builder;
         auto arrow_st = arrow::MakeBuilder(_pool, arrow_type, &builder);
@@ -90,9 +207,15 @@ Status FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBat
         }
         _cur_builder = builder.get();
         try {
-            RETURN_IF_ERROR(_cur_type->get_serde()->write_column_to_arrow(
-                    *column, nullptr, _cur_builder, _cur_start, _cur_start + 
_cur_rows,
-                    _timezone_obj));
+            if (is_iceberg_uuid_field(_schema->field(idx)) &&
+                
is_string_type(remove_nullable(_cur_type)->get_primitive_type())) {
+                RETURN_IF_ERROR(write_iceberg_uuid_string_column_to_arrow(
+                        *column, _cur_type, _cur_builder, _cur_start, 
_cur_start + _cur_rows));
+            } else {
+                RETURN_IF_ERROR(_cur_type->get_serde()->write_column_to_arrow(
+                        *column, nullptr, _cur_builder, _cur_start, _cur_start 
+ _cur_rows,
+                        _timezone_obj));
+            }
         } catch (std::exception& e) {
             return Status::InternalError(
                     "Fail to convert block data to arrow data, type: {}, name: 
{}, error: {}",
diff --git a/be/src/format/table/iceberg/arrow_schema_util.cpp b/be/src/format/table/iceberg/arrow_schema_util.cpp
index e0bf830dfc8..e7186e6a486 100644
--- a/be/src/format/table/iceberg/arrow_schema_util.cpp
+++ b/be/src/format/table/iceberg/arrow_schema_util.cpp
@@ -25,6 +25,7 @@ namespace doris::iceberg {
 const char* ArrowSchemaUtil::PARQUET_FIELD_ID = "PARQUET:field_id";
 const char* ArrowSchemaUtil::ORIGINAL_TYPE = "originalType";
 const char* ArrowSchemaUtil::MAP_TYPE_VALUE = "mapType";
+const char* ArrowSchemaUtil::UUID_TYPE_VALUE = "uuid";
 
 Status ArrowSchemaUtil::convert(const Schema* schema, const std::string& 
timezone,
                                 std::vector<std::shared_ptr<arrow::Field>>& 
fields) {
@@ -75,13 +76,25 @@ Status ArrowSchemaUtil::convert_to(const iceberg::NestedField& field,
         break;
     }
 
-    case iceberg::TypeID::BINARY:
     case iceberg::TypeID::STRING:
-    case iceberg::TypeID::UUID:
-    case iceberg::TypeID::FIXED:
         arrow_type = arrow::utf8();
         break;
 
+    case iceberg::TypeID::BINARY:
+        arrow_type = arrow::binary();
+        break;
+
+    case iceberg::TypeID::UUID:
+        metadata[ORIGINAL_TYPE] = UUID_TYPE_VALUE;
+        arrow_type = arrow::fixed_size_binary(16);
+        break;
+
+    case iceberg::TypeID::FIXED: {
+        iceberg::FixedType* fixed_type = 
static_cast<iceberg::FixedType*>(field.field_type());
+        arrow_type = arrow::fixed_size_binary(fixed_type->get_length());
+        break;
+    }
+
     case iceberg::TypeID::DECIMAL: {
         auto* dt = dynamic_cast<DecimalType*>(field.field_type());
         arrow_type = arrow::decimal(dt->get_precision(), dt->get_scale());
diff --git a/be/src/format/table/iceberg/arrow_schema_util.h b/be/src/format/table/iceberg/arrow_schema_util.h
index 5b2e368fd14..ce13f543b74 100644
--- a/be/src/format/table/iceberg/arrow_schema_util.h
+++ b/be/src/format/table/iceberg/arrow_schema_util.h
@@ -32,6 +32,7 @@ private:
     static const char* PARQUET_FIELD_ID;
     static const char* ORIGINAL_TYPE;
     static const char* MAP_TYPE_VALUE;
+    static const char* UUID_TYPE_VALUE;
 
     static Status convert_to(const iceberg::NestedField& field,
                              std::shared_ptr<arrow::Field>* arrow_field,
diff --git a/be/src/format/table/iceberg/types.h b/be/src/format/table/iceberg/types.h
index 53c54e238fa..1594003fccd 100644
--- a/be/src/format/table/iceberg/types.h
+++ b/be/src/format/table/iceberg/types.h
@@ -289,6 +289,8 @@ public:
         return ss.str();
     }
 
+    int get_length() const { return length; }
+
 private:
     int length;
 };
diff --git a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
index 63a3819cd35..7b20c2f82d0 100644
--- a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <arrow/array/array_binary.h>
+#include <arrow/array/array_nested.h>
 #include <arrow/array/builder_base.h>
 #include <arrow/array/builder_binary.h>
 #include <arrow/array/builder_decimal.h>
@@ -25,6 +27,7 @@
 #include <arrow/type.h>
 #include <arrow/type_fwd.h>
 #include <arrow/util/decimal.h>
+#include <arrow/util/key_value_metadata.h>
 #include <arrow/visit_type_inline.h>
 #include <arrow/visitor.h>
 #include <gen_cpp/Descriptors_types.h>
@@ -35,6 +38,7 @@
 
 #include <cmath>
 #include <cstdint>
+#include <cstring>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -519,6 +523,110 @@ TEST(DataTypeSerDeArrowTest, BigStringSerDeTest) {
     CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
 }
 
+TEST(DataTypeSerDeArrowTest, IcebergUuidStringToFixedSizeBinary) {
+    auto block = std::make_shared<Block>();
+    auto strcol = ColumnString::create();
+    strcol->insert_data("550e8400-e29b-41d4-a716-446655440000", 36);
+    strcol->insert_data("00112233445566778899aabbccddeeff", 32);
+    DataTypePtr data_type(std::make_shared<DataTypeString>());
+    block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type, 
"uuid_col"));
+
+    auto metadata = arrow::KeyValueMetadata::Make({"originalType"}, {"uuid"});
+    auto schema =
+            arrow::schema({arrow::field("uuid_col", 
arrow::fixed_size_binary(16), true, metadata)});
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    cctz::time_zone default_timezone;
+    Status status = convert_to_arrow_batch(*block, schema, 
arrow::default_memory_pool(),
+                                           &record_batch, default_timezone);
+    ASSERT_TRUE(status.ok()) << status;
+    ASSERT_NE(nullptr, record_batch);
+    ASSERT_EQ(2, record_batch->num_rows());
+
+    auto uuid_array =
+            
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(record_batch->column(0));
+    ASSERT_EQ(16, uuid_array->byte_width());
+
+    const uint8_t expected0[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41, 
0xd4,
+                                 0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00, 
0x00};
+    const uint8_t expected1[] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 
0x77,
+                                 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 
0xff};
+    EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(0), expected0, 
sizeof(expected0)));
+    EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(1), expected1, 
sizeof(expected1)));
+}
+
+TEST(DataTypeSerDeArrowTest, NestedIcebergUuidStringToFixedSizeBinary) {
+    auto block = std::make_shared<Block>();
+    DataTypePtr data_type = std::make_shared<DataTypeStruct>(
+            std::vector<DataTypePtr> {std::make_shared<DataTypeString>()});
+    auto struct_column = data_type->create_column();
+
+    Struct row;
+    
row.push_back(Field::create_field<TYPE_STRING>("550e8400-e29b-41d4-a716-446655440000"));
+    struct_column->insert(Field::create_field<TYPE_STRUCT>(row));
+    block->insert(ColumnWithTypeAndName(struct_column->get_ptr(), data_type, 
"uuid_struct"));
+
+    auto metadata = arrow::KeyValueMetadata::Make({"originalType"}, {"uuid"});
+    auto schema = arrow::schema({arrow::field(
+            "uuid_struct",
+            arrow::struct_({arrow::field("id", arrow::fixed_size_binary(16), 
true, metadata)}),
+            true)});
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    cctz::time_zone default_timezone;
+    Status status = convert_to_arrow_batch(*block, schema, 
arrow::default_memory_pool(),
+                                           &record_batch, default_timezone);
+    ASSERT_TRUE(status.ok()) << status;
+
+    auto struct_array = 
std::static_pointer_cast<arrow::StructArray>(record_batch->column(0));
+    auto uuid_array = 
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(struct_array->field(0));
+    const uint8_t expected[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41, 0xd4,
+                                0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00, 
0x00};
+    EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(0), expected, 
sizeof(expected)));
+}
+
+TEST(DataTypeSerDeArrowTest, CharToFixedSizeBinaryPadsZeros) {
+    auto block = std::make_shared<Block>();
+    auto strcol = ColumnString::create();
+    strcol->insert_data("ab", 2);
+    DataTypePtr data_type(std::make_shared<DataTypeString>(4, TYPE_CHAR));
+    block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type, 
"fixed_col"));
+
+    auto schema = arrow::schema({arrow::field("fixed_col", 
arrow::fixed_size_binary(4), true)});
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    cctz::time_zone default_timezone;
+    Status status = convert_to_arrow_batch(*block, schema, 
arrow::default_memory_pool(),
+                                           &record_batch, default_timezone);
+    ASSERT_TRUE(status.ok()) << status;
+
+    auto fixed_array =
+            
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(record_batch->column(0));
+    const char expected[] = {'a', 'b', '\0', '\0'};
+    EXPECT_EQ(0, std::memcmp(fixed_array->GetValue(0), expected, 
sizeof(expected)));
+}
+
+TEST(DataTypeSerDeArrowTest, StringToLargeBinary) {
+    auto block = std::make_shared<Block>();
+    auto strcol = ColumnString::create();
+    strcol->insert_data("binary-value", 12);
+    DataTypePtr data_type(std::make_shared<DataTypeString>());
+    block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type, 
"bin_col"));
+
+    auto schema = arrow::schema({arrow::field("bin_col", 
arrow::large_binary(), true)});
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    cctz::time_zone default_timezone;
+    Status status = convert_to_arrow_batch(*block, schema, 
arrow::default_memory_pool(),
+                                           &record_batch, default_timezone);
+    ASSERT_TRUE(status.ok()) << status;
+
+    auto binary_array = 
std::static_pointer_cast<arrow::LargeBinaryArray>(record_batch->column(0));
+    ASSERT_EQ(12, binary_array->value_length(0));
+    const uint8_t* raw = binary_array->value_data()->data() + 
binary_array->value_offset(0);
+    EXPECT_EQ(0, std::memcmp(raw, "binary-value", 12));
+}
+
 TEST(DataTypeSerDeArrowTest, BlockConverterTest) {
     std::vector<PrimitiveType> cols = {
             TYPE_INT,       TYPE_INT,        TYPE_STRING, TYPE_DECIMAL128I, 
TYPE_BOOLEAN,
diff --git a/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp b/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
index c078de14bab..6016bb4a37a 100644
--- a/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
@@ -281,6 +281,24 @@ TEST_F(DataTypeVarbinarySerDeTest, ArrowBinaryAndStringWithNullsAndInvalidType)
         }
     }
 
+    // LargeBinaryBuilder path (no nulls)
+    {
+        auto builder = std::make_shared<arrow::LargeBinaryBuilder>();
+        auto st = serde.write_column_to_arrow(*col, nullptr, builder.get(), 0, 
vals.size(), tz);
+        EXPECT_TRUE(st.ok()) << st.to_string();
+        std::shared_ptr<arrow::Array> arr;
+        ASSERT_TRUE(builder->Finish(&arr).ok());
+        auto* bin = dynamic_cast<arrow::LargeBinaryArray*>(arr.get());
+        ASSERT_NE(bin, nullptr);
+        ASSERT_EQ(bin->length(), static_cast<int>(vals.size()));
+        for (int i = 0; i < bin->length(); ++i) {
+            ASSERT_FALSE(bin->IsNull(i));
+            ASSERT_EQ(bin->value_length(i), 
static_cast<int64_t>(vals[i].size()));
+            const uint8_t* raw = bin->value_data()->data() + 
bin->value_offset(i);
+            EXPECT_EQ(memcmp(raw, vals[i].data(), vals[i].size()), 0);
+        }
+    }
+
     // Unsupported builder type
     {
         arrow::Int32Builder ib;
diff --git a/be/test/format/table/iceberg/arrow_schema_util_test.cpp b/be/test/format/table/iceberg/arrow_schema_util_test.cpp
index 2d897d2cec6..3379ed27257 100644
--- a/be/test/format/table/iceberg/arrow_schema_util_test.cpp
+++ b/be/test/format/table/iceberg/arrow_schema_util_test.cpp
@@ -224,6 +224,34 @@ TEST(ArrowSchemaUtilTest, test_list_field) {
     EXPECT_EQ("32", 
arrow_list->value_field()->metadata()->Get(pfid).ValueUnsafe());
 }
 
+TEST(ArrowSchemaUtilTest, test_binary_field_types) {
+    std::vector<NestedField> nested_fields;
+    nested_fields.reserve(4);
+    nested_fields.emplace_back(false, 1, "str_col", 
std::make_unique<StringType>(), std::nullopt);
+    nested_fields.emplace_back(false, 2, "bin_col", 
std::make_unique<BinaryType>(), std::nullopt);
+    nested_fields.emplace_back(false, 3, "fixed_col", 
std::make_unique<FixedType>(4), std::nullopt);
+    nested_fields.emplace_back(false, 4, "uuid_col", 
std::make_unique<UUIDType>(), std::nullopt);
+
+    Schema schema(1, std::move(nested_fields));
+
+    std::vector<std::shared_ptr<arrow::Field>> fields;
+    Status st = ArrowSchemaUtil::convert(&schema, "utc", fields);
+    ASSERT_TRUE(st.ok()) << st;
+    ASSERT_EQ(4, fields.size());
+
+    EXPECT_EQ(arrow::Type::STRING, fields[0]->type()->id());
+    EXPECT_EQ(arrow::Type::BINARY, fields[1]->type()->id());
+
+    ASSERT_EQ(arrow::Type::FIXED_SIZE_BINARY, fields[2]->type()->id());
+    auto fixed_type = 
std::static_pointer_cast<arrow::FixedSizeBinaryType>(fields[2]->type());
+    EXPECT_EQ(4, fixed_type->byte_width());
+
+    ASSERT_EQ(arrow::Type::FIXED_SIZE_BINARY, fields[3]->type()->id());
+    auto uuid_type = 
std::static_pointer_cast<arrow::FixedSizeBinaryType>(fields[3]->type());
+    EXPECT_EQ(16, uuid_type->byte_width());
+    EXPECT_EQ("uuid", 
fields[3]->metadata()->Get("originalType").ValueUnsafe());
+}
+
 TEST(ArrowSchemaUtilTest, test_parquet_filed_id) {
     std::string test_dir = "ut_dir/test_parquet_filed_id";
     Status st;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to