This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 267abae917f branch40: [improve](varbinary) support varbinary type with
topn runtime filter (#58721) (#59479)
267abae917f is described below
commit 267abae917fd731585a11c8a9d098d9c21e3e8b4
Author: zhangstar333 <[email protected]>
AuthorDate: Wed Dec 31 11:48:02 2025 +0800
branch40: [improve](varbinary) support varbinary type with topn runtime
filter (#58721) (#59479)
Problem Summary:
pick from master (#58721)
Support the VARBINARY type in the TopN runtime filter, e.g.
`ORDER BY binary_col LIMIT n`.
Also temporarily forbid the VARBINARY type in the FE for GROUP BY keys,
join keys, and comparison predicates.
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/runtime/primitive_type.h | 3 +-
be/src/runtime/runtime_predicate.cpp | 6 ++
be/src/vec/columns/column_varbinary.cpp | 10 +-
be/src/vec/columns/column_varbinary.h | 5 +-
be/src/vec/common/string_view.h | 7 +-
be/src/vec/core/field.cpp | 4 +
be/src/vec/core/field.h | 96 +++++++++++++++--
be/src/vec/core/sort_block.h | 6 ++
be/src/vec/data_types/convert_field_to_type.cpp | 5 +
be/src/vec/data_types/data_type_varbinary.h | 2 +-
.../data_types/serde/data_type_varbinary_serde.cpp | 13 ++-
be/src/vec/exprs/vexpr.cpp | 6 ++
be/src/vec/exprs/vexpr.h | 7 ++
be/test/vec/columns/column_varbinary_test.cpp | 4 +-
be/test/vec/common/string_view_test.cpp | 104 +++++++++++++++++-
.../vec/data_types/data_type_varbinary_test.cpp | 113 +++++++++++++++++--
.../exec/format/parquet/parquet_reader_test.cpp | 24 ++---
.../doris/iceberg/IcebergSysTableColumnValue.java | 7 ++
.../org/apache/doris/analysis/OutFileClause.java | 2 +
.../nereids/rules/analysis/CheckAfterRewrite.java | 8 +-
.../apache/doris/nereids/types/VarBinaryType.java | 4 +
.../doris/nereids/util/TypeCoercionUtils.java | 6 ++
.../export/test_hive_export_varbinary.out | 29 +++++
.../data/external_table_p0/hive/test_hive_orc.out | 96 +++++++++++++++++
.../iceberg/test_iceberg_sys_table.out | 35 ++++++
.../export/test_hive_export_varbinary.groovy | 120 +++++++++++++++++++++
.../external_table_p0/hive/test_hive_orc.groovy | 55 ++++++++++
.../iceberg/test_iceberg_sys_table.groovy | 19 ++++
28 files changed, 752 insertions(+), 44 deletions(-)
diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h
index 4673160bc52..4686bdbab7d 100644
--- a/be/src/runtime/primitive_type.h
+++ b/be/src/runtime/primitive_type.h
@@ -603,7 +603,8 @@ struct PrimitiveTypeTraits<TYPE_VARBINARY> {
using ColumnItemType = doris::StringView;
using DataType = vectorized::DataTypeVarbinary;
using ColumnType = vectorized::ColumnVarbinary;
- using NearestFieldType = doris::StringView;
+ // StringView is non-owning, but StringViewField wraps it with String for
ownership
+ using NearestFieldType = vectorized::StringViewField;
static constexpr PrimitiveType NearestPrimitiveType = TYPE_VARBINARY;
static constexpr PrimitiveType AvgNearestPrimitiveType = TYPE_VARBINARY;
};
diff --git a/be/src/runtime/runtime_predicate.cpp
b/be/src/runtime/runtime_predicate.cpp
index 8889c5955b1..1269d6b5666 100644
--- a/be/src/runtime/runtime_predicate.cpp
+++ b/be/src/runtime/runtime_predicate.cpp
@@ -197,6 +197,12 @@ bool RuntimePredicate::_init(PrimitiveType type) {
_get_value_fn = get_normal_value<TYPE_IPV6>;
break;
}
+ case PrimitiveType::TYPE_VARBINARY: {
+ _get_value_fn = [](const Field& field) {
+ return field.get<StringViewField>().get_string();
+ };
+ break;
+ }
default:
return false;
}
diff --git a/be/src/vec/columns/column_varbinary.cpp
b/be/src/vec/columns/column_varbinary.cpp
index 0203e151b6b..037f96fd5c6 100644
--- a/be/src/vec/columns/column_varbinary.cpp
+++ b/be/src/vec/columns/column_varbinary.cpp
@@ -28,6 +28,7 @@
#include "vec/columns/columns_common.h"
#include "vec/common/arena.h"
#include "vec/common/assert_cast.h"
+#include "vec/core/sort_block.h"
namespace doris::vectorized {
#include "common/compile_check_begin.h"
@@ -144,7 +145,7 @@ MutableColumnPtr ColumnVarbinary::permute(const
IColumn::Permutation& perm, size
res_data[i] = val;
continue;
}
- const auto* dst = const_cast<Arena&>(_arena).insert(val.data(),
val.size());
+ const auto* dst = res->_arena.insert(val.data(), val.size());
res_data[i] = doris::StringView(dst, val.size());
}
@@ -222,5 +223,12 @@ void ColumnVarbinary::insert_many_strings_overflow(const
StringRef* strings, siz
insert_many_strings(strings, num);
}
+void ColumnVarbinary::sort_column(const ColumnSorter* sorter, EqualFlags&
flags,
+ IColumn::Permutation& perms, EqualRange&
range,
+ bool last_column) const {
+ sorter->sort_column(assert_cast<const ColumnVarbinary&>(*this), flags,
perms, range,
+ last_column);
+}
+
#include "common/compile_check_end.h"
} // namespace doris::vectorized
diff --git a/be/src/vec/columns/column_varbinary.h
b/be/src/vec/columns/column_varbinary.h
index e9e900954c6..50fe660cfd8 100644
--- a/be/src/vec/columns/column_varbinary.h
+++ b/be/src/vec/columns/column_varbinary.h
@@ -77,7 +77,7 @@ public:
char* alloc(size_t length) { return _arena.alloc(length); }
void insert(const Field& x) override {
- auto value = vectorized::get<const doris::StringView&>(x);
+ const auto& value = vectorized::get<const StringViewField&>(x);
insert_data(value.data(), value.size());
}
@@ -185,6 +185,9 @@ public:
void insert_many_strings_overflow(const StringRef* strings, size_t num,
size_t max_length) override;
+ void sort_column(const ColumnSorter* sorter, EqualFlags& flags,
IColumn::Permutation& perms,
+ EqualRange& range, bool last_column) const override;
+
private:
Container _data;
Arena _arena;
diff --git a/be/src/vec/common/string_view.h b/be/src/vec/common/string_view.h
index 5cd560aad4a..218104ff750 100644
--- a/be/src/vec/common/string_view.h
+++ b/be/src/vec/common/string_view.h
@@ -126,16 +126,15 @@ public:
std::string dump_hex() const {
static const char* kHex = "0123456789ABCDEF";
std::string out;
- out.reserve(size_ * 2 + 3);
- out.push_back('X');
- out.push_back('\'');
+ out.reserve(size_ * 2 + 2);
+ out.push_back('0');
+ out.push_back('x');
const char* ptr = data();
for (uint32_t i = 0; i < size_; ++i) {
auto c = static_cast<unsigned char>(ptr[i]);
out.push_back(kHex[c >> 4]);
out.push_back(kHex[c & 0x0F]);
}
- out.push_back('\'');
return out;
}
diff --git a/be/src/vec/core/field.cpp b/be/src/vec/core/field.cpp
index ffefb8ffd20..c0b03df7c3c 100644
--- a/be/src/vec/core/field.cpp
+++ b/be/src/vec/core/field.cpp
@@ -739,6 +739,10 @@ std::string_view Field::as_string_view() const {
const auto& s = get<String>();
return {s.data(), s.size()};
}
+ if (type == PrimitiveType::TYPE_VARBINARY) {
+ const auto& svf = get<StringViewField>();
+ return {svf.data(), svf.size()};
+ }
// MATCH_PRIMITIVE_TYPE(INVALID_TYPE);
// MATCH_PRIMITIVE_TYPE(TYPE_NULL);
MATCH_PRIMITIVE_TYPE(TYPE_BOOLEAN);
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 2c611a59a28..3ef504dc0d6 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -246,6 +246,83 @@ private:
UInt32 scale;
};
+// StringViewField wraps a StringView and provides deep copy semantics.
+// Since StringView is a non-owning view (only contains pointer and length),
+// we need to store the actual data in a String to ensure the Field owns the
data.
+// This prevents dangling pointer issues when Field objects are copied or
moved.
+class StringViewField {
+public:
+ StringViewField() = default;
+ ~StringViewField() = default;
+
+ // Construct from raw data - performs deep copy
+ StringViewField(const char* data, size_t len) : _storage(data, len) {}
+
+ // Construct from StringView - performs deep copy
+ StringViewField(const StringView& sv) : _storage(sv.data(), sv.size()) {}
+
+ // Copy constructor - deep copy
+ StringViewField(const StringViewField& x) = default;
+
+ // Move constructor
+ StringViewField(StringViewField&& x) noexcept = default;
+
+ // Copy assignment - deep copy
+ StringViewField& operator=(const StringViewField& x) = default;
+
+ // Move assignment
+ StringViewField& operator=(StringViewField&& x) noexcept = default;
+
+ // Access methods
+ const char* data() const { return _storage.data(); }
+ size_t size() const { return _storage.size(); }
+ const String& get_string() const { return _storage; }
+
+ // Convert to StringView for compatibility
+ StringView to_string_view() const { return {data(),
static_cast<uint32_t>(size())}; }
+
+ // Comparison operators - using binary comparison (memcmp) for VARBINARY
semantics
+ bool operator<(const StringViewField& r) const {
+ int cmp = memcmp(_storage.data(), r._storage.data(),
+ std::min(_storage.size(), r._storage.size()));
+ return cmp < 0 || (cmp == 0 && _storage.size() < r._storage.size());
+ }
+ bool operator<=(const StringViewField& r) const { return !(r < *this); }
+ bool operator==(const StringViewField& r) const {
+ return _storage.size() == r._storage.size() &&
+ memcmp(_storage.data(), r._storage.data(), _storage.size()) ==
0;
+ }
+ bool operator>(const StringViewField& r) const { return r < *this; }
+ bool operator>=(const StringViewField& r) const { return !(*this < r); }
+ bool operator!=(const StringViewField& r) const { return !(*this == r); }
+
+ std::strong_ordering operator<=>(const StringViewField& r) const {
+ size_t min_size = std::min(_storage.size(), r._storage.size());
+ int cmp = memcmp(_storage.data(), r._storage.data(), min_size);
+ if (cmp < 0) {
+ return std::strong_ordering::less;
+ }
+ if (cmp > 0) {
+ return std::strong_ordering::greater;
+ }
+ // Prefixes are equal, compare lengths
+ return _storage.size() <=> r._storage.size();
+ }
+
+ // Arithmetic operators (not commonly used but required by Field)
+ const StringViewField& operator+=(const StringViewField& r) {
+ _storage += r._storage;
+ return *this;
+ }
+
+ const StringViewField& operator-=(const StringViewField& r) {
+ throw Exception(Status::FatalError("Not support minus operation on
StringViewField"));
+ }
+
+private:
+ String _storage; // Use String for deep copy and ownership
+};
+
/** 32 is enough. Round number is used for alignment and for better arithmetic
inside std::vector.
* NOTE: Actually, sizeof(std::string) is 32 when using libc++, so Field is
40 bytes.
*/
@@ -390,7 +467,7 @@ public:
case PrimitiveType::TYPE_VARCHAR:
return get<String>() <=> rhs.get<String>();
case PrimitiveType::TYPE_VARBINARY:
- return get<doris::StringView>() <=> rhs.get<doris::StringView>();
+ return get<StringViewField>() <=> rhs.get<StringViewField>();
case PrimitiveType::TYPE_DECIMAL32:
return get<Decimal32>() <=> rhs.get<Decimal32>();
case PrimitiveType::TYPE_DECIMAL64:
@@ -439,7 +516,7 @@ public:
f(field.template get<String>());
return;
case PrimitiveType::TYPE_VARBINARY:
- f(field.template get<doris::StringView>());
+ f(field.template get<StringViewField>());
return;
case PrimitiveType::TYPE_JSONB:
f(field.template get<JsonbField>());
@@ -489,11 +566,11 @@ public:
std::string_view as_string_view() const;
private:
- std::aligned_union_t<
- DBMS_MIN_FIELD_SIZE - sizeof(PrimitiveType), Null, UInt64,
UInt128, Int64, Int128, IPv6,
- Float64, String, JsonbField, Array, Tuple, Map, VariantMap,
DecimalField<Decimal32>,
- DecimalField<Decimal64>, DecimalField<Decimal128V2>,
DecimalField<Decimal128V3>,
- DecimalField<Decimal256>, BitmapValue, HyperLogLog, QuantileState,
doris::StringView>
+ std::aligned_union_t<DBMS_MIN_FIELD_SIZE - sizeof(PrimitiveType), Null,
UInt64, UInt128, Int64,
+ Int128, IPv6, Float64, String, JsonbField,
StringViewField, Array, Tuple,
+ Map, VariantMap, DecimalField<Decimal32>,
DecimalField<Decimal64>,
+ DecimalField<Decimal128V2>,
DecimalField<Decimal128V3>,
+ DecimalField<Decimal256>, BitmapValue, HyperLogLog,
QuantileState>
storage;
PrimitiveType type;
@@ -647,6 +724,11 @@ struct NearestFieldTypeImpl<PackedInt128> {
using Type = Int128;
};
+template <>
+struct NearestFieldTypeImpl<doris::StringView> {
+ using Type = StringViewField;
+};
+
template <typename T>
decltype(auto) cast_to_nearest_field_type(T&& x) {
using U = NearestFieldType<std::decay_t<T>>;
diff --git a/be/src/vec/core/sort_block.h b/be/src/vec/core/sort_block.h
index bc25129b4a2..b65f5b715df 100644
--- a/be/src/vec/core/sort_block.h
+++ b/be/src/vec/core/sort_block.h
@@ -38,6 +38,7 @@
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_struct.h"
+#include "vec/columns/column_varbinary.h"
#include "vec/common/memcmp_small.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
@@ -249,6 +250,10 @@ public:
EqualRange& range, bool last_column) const {
_sort_by_default(column, flags, perms, range, last_column);
}
+ void sort_column(const ColumnVarbinary& column, EqualFlags& flags,
IColumn::Permutation& perms,
+ EqualRange& range, bool last_column) const {
+ _sort_by_default(column, flags, perms, range, last_column);
+ }
void sort_column(const ColumnString64& column, EqualFlags& flags,
IColumn::Permutation& perms,
EqualRange& range, bool last_column) const {
@@ -378,6 +383,7 @@ private:
if constexpr (!std::is_same_v<ColumnType, ColumnString> &&
!std::is_same_v<ColumnType, ColumnString64> &&
!std::is_same_v<ColumnType, ColumnArray> &&
+ !std::is_same_v<ColumnType, ColumnVarbinary> &&
!std::is_same_v<ColumnType, ColumnMap> &&
!std::is_same_v<ColumnType, ColumnStruct>) {
auto value_a = column.get_data()[a];
diff --git a/be/src/vec/data_types/convert_field_to_type.cpp
b/be/src/vec/data_types/convert_field_to_type.cpp
index bdd3a7922ba..28947232d5b 100644
--- a/be/src/vec/data_types/convert_field_to_type.cpp
+++ b/be/src/vec/data_types/convert_field_to_type.cpp
@@ -93,6 +93,11 @@ public:
writer->writeString(x);
writer->writeEndString();
}
+ void operator()(const StringViewField& x, JsonbWriter* writer) const {
+ writer->writeStartString();
+ writer->writeString(x.data(), x.size());
+ writer->writeEndString();
+ }
void operator()(const JsonbField& x, JsonbWriter* writer) const {
const JsonbDocument* doc;
THROW_IF_ERROR(JsonbDocument::checkAndCreateDocument(x.get_value(),
x.get_size(), &doc));
diff --git a/be/src/vec/data_types/data_type_varbinary.h
b/be/src/vec/data_types/data_type_varbinary.h
index fa13d19287d..f84884d8e1b 100644
--- a/be/src/vec/data_types/data_type_varbinary.h
+++ b/be/src/vec/data_types/data_type_varbinary.h
@@ -40,7 +40,7 @@ class IColumn;
class DataTypeVarbinary : public IDataType {
public:
using ColumnType = ColumnVarbinary;
- using FieldType = doris::StringView;
+ using FieldType = StringViewField;
static constexpr PrimitiveType PType = TYPE_VARBINARY;
diff --git a/be/src/vec/data_types/serde/data_type_varbinary_serde.cpp
b/be/src/vec/data_types/serde/data_type_varbinary_serde.cpp
index 12e8a7c1924..b60ab825332 100644
--- a/be/src/vec/data_types/serde/data_type_varbinary_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_varbinary_serde.cpp
@@ -41,7 +41,7 @@ Status
DataTypeVarbinarySerDe::write_column_to_mysql_binary(const IColumn& colum
int64_t row_idx,
bool col_const,
const
FormatOptions& options) const {
auto col_index = index_check_const(row_idx, col_const);
- auto data = assert_cast<const
ColumnVarbinary&>(column).get_data()[col_index];
+ const auto& data = assert_cast<const
ColumnVarbinary&>(column).get_data()[col_index];
if (0 != result.push_string(data.data(), data.size())) {
return Status::InternalError("pack mysql buffer failed.");
@@ -62,7 +62,7 @@ Status DataTypeVarbinarySerDe::write_column_to_arrow(const
IColumn& column, cons
builder.type()->name()));
continue;
}
- auto string_view = varbinary_column_data[i];
+ const auto& string_view = varbinary_column_data[i];
RETURN_IF_ERROR(checkArrowStatus(builder.Append(string_view.data(),
string_view.size()),
column.get_name(),
builder.type()->name()));
}
@@ -118,8 +118,13 @@ Status
DataTypeVarbinarySerDe::deserialize_one_cell_from_json(IColumn& column, S
void DataTypeVarbinarySerDe::to_string(const IColumn& column, size_t row_num,
BufferWritable& bw,
const FormatOptions& options) const {
- const auto value = assert_cast<const
ColumnVarbinary&>(column).get_data_at(row_num);
- bw.write(value.data, value.size);
+ const auto& value = assert_cast<const
ColumnVarbinary&>(column).get_data()[row_num];
+ if (_nesting_level >= 2) { // in complex type, need to dump as hex string
by hand
+ const auto& hex_str = value.dump_hex();
+ bw.write(hex_str.data(), hex_str.size());
+ } else { // mysql protocol will be handle as hex binary data directly
+ bw.write(value.data(), value.size());
+ }
}
} // namespace doris::vectorized
diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp
index 58707907f68..e7b945edee7 100644
--- a/be/src/vec/exprs/vexpr.cpp
+++ b/be/src/vec/exprs/vexpr.cpp
@@ -349,6 +349,12 @@ TExprNode create_texpr_node_from(const vectorized::Field&
field, const Primitive
THROW_IF_ERROR(create_texpr_literal_node<TYPE_TIMEV2>(&storage,
&node));
break;
}
+ case TYPE_VARBINARY: {
+ const auto& svf = field.get<vectorized::StringViewField>();
+ const std::string& storage = svf.get_string();
+ THROW_IF_ERROR(create_texpr_literal_node<TYPE_VARBINARY>(&storage,
&node));
+ break;
+ }
default:
throw Exception(ErrorCode::INTERNAL_ERROR, "runtime filter meet
invalid type {}",
int(type));
diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h
index f86035ddaec..03694652bf0 100644
--- a/be/src/vec/exprs/vexpr.h
+++ b/be/src/vec/exprs/vexpr.h
@@ -595,6 +595,13 @@ Status create_texpr_literal_node(const void* data,
TExprNode* node, int precisio
(*node).__set_timev2_literal(timev2_literal);
(*node).__set_node_type(TExprNodeType::TIMEV2_LITERAL);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_TIMEV2,
precision, scale));
+ } else if constexpr (T == TYPE_VARBINARY) {
+ const auto* origin_value = reinterpret_cast<const std::string*>(data);
+ (*node).__set_node_type(TExprNodeType::VARBINARY_LITERAL);
+ TVarBinaryLiteral varbinary_literal;
+ varbinary_literal.__set_value(*origin_value);
+ (*node).__set_varbinary_literal(varbinary_literal);
+ (*node).__set_type(create_type_desc(PrimitiveType::TYPE_VARBINARY));
} else {
return Status::InvalidArgument("Invalid argument type!");
}
diff --git a/be/test/vec/columns/column_varbinary_test.cpp
b/be/test/vec/columns/column_varbinary_test.cpp
index 7799422b23d..4eeb637e64e 100644
--- a/be/test/vec/columns/column_varbinary_test.cpp
+++ b/be/test/vec/columns/column_varbinary_test.cpp
@@ -347,13 +347,13 @@ TEST_F(ColumnVarbinaryTest, FieldAccessOperatorAndGet) {
for (size_t i = 0; i < vals.size(); ++i) {
// operator[]
Field f = (*col)[i];
- auto sv = vectorized::get<const doris::StringView&>(f);
+ const auto& sv = vectorized::get<const StringViewField&>(f);
ASSERT_EQ(sv.size(), vals[i].size());
ASSERT_EQ(memcmp(sv.data(), vals[i].data(), sv.size()), 0);
// get(size_t, Field&)
Field f2;
col->get(i, f2);
- auto sv2 = vectorized::get<const doris::StringView&>(f2);
+ const auto& sv2 = vectorized::get<const StringViewField&>(f2);
ASSERT_EQ(sv2.size(), vals[i].size());
ASSERT_EQ(memcmp(sv2.data(), vals[i].data(), sv2.size()), 0);
}
diff --git a/be/test/vec/common/string_view_test.cpp
b/be/test/vec/common/string_view_test.cpp
index 4bfd8c25ea6..63bf3edb845 100644
--- a/be/test/vec/common/string_view_test.cpp
+++ b/be/test/vec/common/string_view_test.cpp
@@ -223,13 +223,13 @@ TEST_F(StringViewTest, ThreeWayComparisonOrdering) {
TEST_F(StringViewTest, DumpHex) {
// Empty
StringView empty;
- EXPECT_EQ(empty.dump_hex(), "X''");
+ EXPECT_EQ(empty.dump_hex(), "0x");
// Inline with known bytes
const unsigned char bytes_inline[] = {0x00, 0x01, 0x0A, 0x1F, 0x7F};
StringView svi(reinterpret_cast<const char*>(bytes_inline),
sizeof(bytes_inline));
EXPECT_TRUE(svi.isInline());
- EXPECT_EQ(svi.dump_hex(), "X'00010A1F7F'");
+ EXPECT_EQ(svi.dump_hex(), "0x00010A1F7F");
// Non-inline, length > 12
std::string big = make_bytes(16, 0x20); // bytes 0x20,0x21,...
@@ -237,13 +237,109 @@ TEST_F(StringViewTest, DumpHex) {
EXPECT_FALSE(svb.isInline());
// Build expected
std::ostringstream oss;
- oss << "X'";
+ oss << "0x";
for (unsigned char c : big) {
static const char* kHex = "0123456789ABCDEF";
oss << kHex[c >> 4] << kHex[c & 0x0F];
}
- oss << "'";
EXPECT_EQ(svb.dump_hex(), oss.str());
}
+// Verify inline strings with length > 4 correctly store and compare tail bytes
+TEST_F(StringViewTest, InlineTailBytesAndEquality) {
+ std::string s1 = "abcdEFGHIJ"; // len=10, inline
+ std::string s2 = "abcdEFGHIQ"; // same prefix, differ at last byte
+ StringView v1(s1);
+ StringView v2(s2);
+ ASSERT_TRUE(v1.isInline());
+ ASSERT_TRUE(v2.isInline());
+
+ // Full content preserved
+ EXPECT_EQ(static_cast<std::string>(v1), s1);
+ // operator== must detect tail difference
+ EXPECT_FALSE(v1 == v2);
+ EXPECT_NE(v1.compare(v2), 0);
+}
+
+// Cover constructors from std::string_view and unsigned char*, and high-bytes
dump
+TEST_F(StringViewTest, StringViewAndUnsignedCtorAndHighHex) {
+ // std::string_view ctor (inline boundary)
+ std::string inl = std::string(12, '\xAB');
+ std::string_view svw(inl);
+ StringView v_inl(svw);
+ EXPECT_TRUE(v_inl.isInline());
+ EXPECT_EQ(::memcmp(v_inl.data(), inl.data(), inl.size()), 0);
+
+ // unsigned char* ctor with >0x7F bytes to check sign issues in dump_hex
+ std::vector<uint8_t> bytes = {0x80, 0xFF, 0x00, 0x7F};
+ StringView v_unsigned(reinterpret_cast<unsigned char*>(bytes.data()),
+ static_cast<uint32_t>(bytes.size()));
+ EXPECT_TRUE(v_unsigned.isInline());
+ EXPECT_EQ(v_unsigned.dump_hex(), "0x80FF007F");
+}
+
+// Construct from nullptr with zero length should be a valid empty inline view
+TEST_F(StringViewTest, NullPtrZeroLenCtor) {
+ StringView v(static_cast<const char*>(nullptr), 0);
+ EXPECT_TRUE(v.empty());
+ EXPECT_TRUE(v.isInline());
+ EXPECT_EQ(v.size(), 0U);
+ // stream and string conversions should yield empty
+ std::ostringstream oss;
+ oss << v;
+ EXPECT_TRUE(oss.str().empty());
+ EXPECT_TRUE(static_cast<std::string>(v).empty());
+}
+
+// Compare where both sides share prefix but decision comes from length after
prefix (inline)
+TEST_F(StringViewTest, CompareAfterPrefixInlineLength) {
+ std::string a = "abcdEF"; // len=6
+ std::string b = "abcdEFGH"; // len=8, starts with a
+ StringView va(a), vb(b);
+ ASSERT_TRUE(va.isInline());
+ ASSERT_TRUE(vb.isInline());
+ EXPECT_LT(va.compare(vb), 0);
+ EXPECT_TRUE((va <=> vb) == std::strong_ordering::less);
+}
+
+// Same as above but with non-inline strings
+TEST_F(StringViewTest, CompareAfterPrefixNonInlineLength) {
+ std::string base = make_bytes(24, 0x41); // >=13 => non-inline
+ std::string short_s = base.substr(0, 20);
+ std::string long_s = short_s + "ZZ"; // same prefix, longer
+ StringView vs(short_s), vl(long_s);
+ ASSERT_FALSE(vs.isInline());
+ ASSERT_FALSE(vl.isInline());
+ EXPECT_LT(vs.compare(vl), 0);
+ EXPECT_TRUE((vs <=> vl) == std::strong_ordering::less);
+}
+
+// Non-inline copy semantics: copying should keep pointer identity and equality
+TEST_F(StringViewTest, NonInlineCopySemanticsAndIteration) {
+ std::string big = make_bytes(32, 0x21);
+ StringView a(big);
+ StringView b = a; // copy
+ ASSERT_FALSE(a.isInline());
+ ASSERT_FALSE(b.isInline());
+ EXPECT_EQ(a.data(), b.data());
+ EXPECT_TRUE(a == b);
+
+ // Iteration reconstructs the same bytes
+ std::string via_iter(a.begin(), a.end());
+ EXPECT_EQ(via_iter.size(), big.size());
+ EXPECT_EQ(::memcmp(via_iter.data(), big.data(), big.size()), 0);
+}
+
+// operator== should also detect non-inline tail differences (not only
compare())
+TEST_F(StringViewTest, NonInlineEqualityDetectsTailDiff) {
+ std::string s1 = make_bytes(20, 0x30);
+ std::string s2 = s1;
+ s2[10] ^= 0x1; // differ after prefix
+ StringView v1(s1), v2(s2);
+ ASSERT_FALSE(v1.isInline());
+ ASSERT_FALSE(v2.isInline());
+ EXPECT_FALSE(v1 == v2);
+ EXPECT_NE(v1.compare(v2), 0);
+}
+
} // namespace doris
diff --git a/be/test/vec/data_types/data_type_varbinary_test.cpp
b/be/test/vec/data_types/data_type_varbinary_test.cpp
index 33571fe4074..6465758c623 100644
--- a/be/test/vec/data_types/data_type_varbinary_test.cpp
+++ b/be/test/vec/data_types/data_type_varbinary_test.cpp
@@ -87,7 +87,7 @@ TEST_F(DataTypeVarbinaryTest, CreateColumnAndCheckColumn) {
TEST_F(DataTypeVarbinaryTest, GetDefaultField) {
DataTypeVarbinary dt;
Field def = dt.get_default();
- const auto& sv = get<const doris::StringView&>(def);
+ const auto& sv = get<const StringViewField&>(def);
EXPECT_EQ(sv.size(), 0U);
}
@@ -176,7 +176,7 @@ TEST_F(DataTypeVarbinaryTest, GetFieldWithDataType) {
auto fwd = dt.get_field_with_data_type(*col, 0);
EXPECT_EQ(fwd.base_scalar_type_id, PrimitiveType::TYPE_VARBINARY);
- const auto& sv = get<const doris::StringView&>(fwd.field);
+ const auto& sv = get<const StringViewField&>(fwd.field);
ASSERT_EQ(sv.size(), v.size());
ASSERT_EQ(memcmp(sv.data(), v.data(), sv.size()), 0);
}
@@ -189,18 +189,119 @@ TEST_F(DataTypeVarbinaryTest, GetFieldFromTExprNode) {
node.__isset.varbinary_literal = true;
Field f = dt.get_field(node);
- const auto& sv = get<const doris::StringView&>(f);
+ const auto& sv = get<const StringViewField&>(f);
ASSERT_EQ(sv.size(), 5U);
ASSERT_EQ(memcmp(sv.data(), "hello", 5), 0);
}
-TEST_F(DataTypeVarbinaryTest, ToProtobufLen) {
- DataTypeVarbinary dt(123);
+TEST_F(DataTypeVarbinaryTest, CheckColumnOnConstColumn) {
+ DataTypeVarbinary dt;
+ auto col = dt.create_column();
+ auto* vb = assert_cast<ColumnVarbinary*>(col.get());
+ std::string v = make_bytes(4, 0x12);
+ vb->insert_data(v.data(), v.size());
+
+ // Wrap as const column
+ auto cconst = ColumnConst::create(col->get_ptr(), /*size=*/5);
+ EXPECT_TRUE(dt.check_column(*cconst).ok());
+}
+
+TEST_F(DataTypeVarbinaryTest, SerializeDeserializeConstColumn) {
+ DataTypeVarbinary dt;
+ auto base = dt.create_column();
+ auto* vb = assert_cast<ColumnVarbinary*>(base.get());
+ std::string val = make_bytes(3, 0x7A);
+ vb->insert_data(val.data(), val.size());
+
+ // Make it const with logical row_num=5
+ ColumnPtr const_col = ColumnConst::create(base->get_ptr(), /*size=*/5);
+
+ int ver = BeExecVersionManager::get_newest_version();
+ // Expect: bool + size_t(row_num) + size_t(real_need_copy_num=1) + one
size + payload
+ size_t expected = sizeof(bool) + sizeof(size_t) + sizeof(size_t) +
sizeof(size_t) + val.size();
+ auto sz = dt.get_uncompressed_serialized_bytes(*const_col, ver);
+ EXPECT_EQ(static_cast<size_t>(sz), expected);
+
+ std::string buf;
+ buf.resize(expected);
+ char* p = buf.data();
+ char* end = dt.serialize(*const_col, p, ver);
+ ASSERT_EQ(static_cast<size_t>(end - p), expected);
+
+ MutableColumnPtr deser = dt.create_column();
+ const char* p2 = buf.data();
+ const char* end2 = dt.deserialize(p2, &deser, ver);
+ ASSERT_EQ(static_cast<size_t>(end2 - p2), expected);
+
+ // After deserialize, the output is a ColumnConst wrapping the data column.
+ ColumnPtr out = deser->get_ptr();
+ ASSERT_TRUE(is_column_const(*out));
+ const auto& cconst = assert_cast<const ColumnConst&>(*out);
+ EXPECT_EQ(cconst.size(), 5U); // logical row num retained
+ const auto& inner = assert_cast<const
ColumnVarbinary&>(*cconst.get_data_column_ptr());
+ ASSERT_EQ(inner.size(), 1U);
+ auto r = inner.get_data_at(0);
+ ASSERT_EQ(r.size, val.size());
+ ASSERT_EQ(memcmp(r.data, val.data(), r.size), 0);
+}
+
+TEST_F(DataTypeVarbinaryTest, SerDeWriteColumnToMysql) {
+ DataTypeVarbinary dt;
+ auto col = dt.create_column();
+ auto* vb = assert_cast<ColumnVarbinary*>(col.get());
+ std::string v1 = make_bytes(2, 0x10);
+ vb->insert_data(v1.data(), v1.size());
+
+ auto serde = dt.get_serde();
+ // binary protocol
+ doris::MysqlRowBinaryBuffer rb_bin;
+ auto format_options = DataTypeSerDe::FormatOptions();
+ auto st2 = serde->write_column_to_mysql_binary(*col, rb_bin,
/*row_idx=*/0, /*col_const=*/false,
+ format_options);
+ EXPECT_TRUE(st2.ok());
+ EXPECT_GT(rb_bin.length(), 0);
+}
+
+TEST_F(DataTypeVarbinaryTest, GetStorageFieldTypeThrows) {
+ DataTypeVarbinary dt;
+ EXPECT_THROW({ (void)dt.get_storage_field_type(); }, doris::Exception);
+}
+
+TEST_F(DataTypeVarbinaryTest, GetFieldFromTExprNodeWithEmbeddedNull) {
+ DataTypeVarbinary dt;
+ TExprNode node;
+ node.node_type = TExprNodeType::VARBINARY_LITERAL;
+ std::string raw = std::string("a\0b", 3);
+ node.varbinary_literal.value = raw;
+ node.__isset.varbinary_literal = true;
+
+ Field f = dt.get_field(node);
+ const auto& sv = get<const StringViewField&>(f);
+ ASSERT_EQ(sv.size(), raw.size());
+ ASSERT_EQ(memcmp(sv.data(), raw.data(), sv.size()), 0);
+}
+
+TEST_F(DataTypeVarbinaryTest, ToProtobufDefaultLen) {
+ DataTypeVarbinary dt; // default len = -1
PTypeDesc ptype;
PTypeNode pnode;
PScalarType scalar;
dt.to_protobuf(&ptype, &pnode, &scalar);
- EXPECT_EQ(scalar.len(), 123);
+ EXPECT_EQ(scalar.len(), -1);
+}
+
+TEST_F(DataTypeVarbinaryTest, GetFieldWithDataTypeNonInline) {
+ DataTypeVarbinary dt;
+ auto col = dt.create_column();
+ auto* vb = assert_cast<ColumnVarbinary*>(col.get());
+ std::string big = make_bytes(doris::StringView::kInlineSize + 6, 0x55);
+ vb->insert_data(big.data(), big.size());
+
+ auto fwd = dt.get_field_with_data_type(*col, 0);
+ EXPECT_EQ(fwd.base_scalar_type_id, PrimitiveType::TYPE_VARBINARY);
+ const auto& sv = get<const StringViewField&>(fwd.field);
+ ASSERT_EQ(sv.size(), big.size());
+ ASSERT_EQ(memcmp(sv.data(), big.data(), sv.size()), 0);
}
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
index 341526e0926..659f1211e24 100644
--- a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
+++ b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
@@ -243,9 +243,9 @@ TEST_F(ParquetReaderTest, uuid_varbinary) {
auto varbinary_column =
assert_cast<const
ColumnVarbinary*>(nullable_column->get_nested_column_ptr().get());
auto& data = varbinary_column->get_data();
- EXPECT_EQ(data[0].dump_hex(), "X'550E8400E29B41D4A716446655440000'");
- EXPECT_EQ(data[1].dump_hex(), "X'123E4567E89B12D3A456426614174000'");
- EXPECT_EQ(data[2].dump_hex(), "X'00000000000000000000000000000000'");
+ EXPECT_EQ(data[0].dump_hex(), "0x550E8400E29B41D4A716446655440000");
+ EXPECT_EQ(data[1].dump_hex(), "0x123E4567E89B12D3A456426614174000");
+ EXPECT_EQ(data[2].dump_hex(), "0x00000000000000000000000000000000");
}
TEST_F(ParquetReaderTest, varbinary_varbinary) {
@@ -316,9 +316,9 @@ TEST_F(ParquetReaderTest, varbinary_varbinary) {
auto varbinary_column =
assert_cast<const
ColumnVarbinary*>(nullable_column->get_nested_column_ptr().get());
auto& data = varbinary_column->get_data();
- EXPECT_EQ(data[0].dump_hex(), "X'0123456789ABCDEF'");
- EXPECT_EQ(data[1].dump_hex(), "X'FEDCBA9876543210'");
- EXPECT_EQ(data[2].dump_hex(), "X'00'");
+ EXPECT_EQ(data[0].dump_hex(), "0x0123456789ABCDEF");
+ EXPECT_EQ(data[1].dump_hex(), "0xFEDCBA9876543210");
+ EXPECT_EQ(data[2].dump_hex(), "0x00");
}
TEST_F(ParquetReaderTest, varbinary_string) {
@@ -391,9 +391,9 @@ TEST_F(ParquetReaderTest, varbinary_string) {
auto varbinary_column =
assert_cast<const
ColumnVarbinary*>(nullable_column->get_nested_column_ptr().get());
auto& data = varbinary_column->get_data();
- EXPECT_EQ(data[0].dump_hex(), "X'0123456789ABCDEF'");
- EXPECT_EQ(data[1].dump_hex(), "X'FEDCBA9876543210'");
- EXPECT_EQ(data[2].dump_hex(), "X'00'");
+ EXPECT_EQ(data[0].dump_hex(), "0x0123456789ABCDEF");
+ EXPECT_EQ(data[1].dump_hex(), "0xFEDCBA9876543210");
+ EXPECT_EQ(data[2].dump_hex(), "0x00");
}
TEST_F(ParquetReaderTest, varbinary_string2) {
@@ -465,9 +465,9 @@ TEST_F(ParquetReaderTest, varbinary_string2) {
auto nullable_column = assert_cast<const ColumnNullable*>(col.get());
auto string_column =
assert_cast<const
ColumnString*>(nullable_column->get_nested_column_ptr().get());
- EXPECT_EQ(StringView(string_column->get_data_at(0)).dump_hex(),
"X'0123456789ABCDEF'");
- EXPECT_EQ(StringView(string_column->get_data_at(1)).dump_hex(),
"X'FEDCBA9876543210'");
- EXPECT_EQ(StringView(string_column->get_data_at(2)).dump_hex(), "X'00'");
+ EXPECT_EQ(StringView(string_column->get_data_at(0)).dump_hex(),
"0x0123456789ABCDEF");
+ EXPECT_EQ(StringView(string_column->get_data_at(1)).dump_hex(),
"0xFEDCBA9876543210");
+ EXPECT_EQ(StringView(string_column->get_data_at(2)).dump_hex(), "0x00");
}
static ParquetReader* create_parquet_reader(TFileScanRangeParams& scan_params,
diff --git
a/fe/be-java-extensions/iceberg-metadata-scanner/src/main/java/org/apache/doris/iceberg/IcebergSysTableColumnValue.java
b/fe/be-java-extensions/iceberg-metadata-scanner/src/main/java/org/apache/doris/iceberg/IcebergSysTableColumnValue.java
index b1caff8ae8f..70814f27f75 100644
---
a/fe/be-java-extensions/iceberg-metadata-scanner/src/main/java/org/apache/doris/iceberg/IcebergSysTableColumnValue.java
+++
b/fe/be-java-extensions/iceberg-metadata-scanner/src/main/java/org/apache/doris/iceberg/IcebergSysTableColumnValue.java
@@ -142,6 +142,13 @@ public class IcebergSysTableColumnValue implements
ColumnValue {
@Override
public byte[] getBytes() {
+ // https://github.com/apache/iceberg/blob/8626ef5137024c1a69daaff97a832af6b0ae37ea/api/src/main/java/org/apache/iceberg/types/Type.java#L45C5-L45C30
+ if (fieldData instanceof ByteBuffer) {
+ ByteBuffer buffer = (ByteBuffer) fieldData;
+ byte[] bytes = new byte[buffer.remaining()];
+ buffer.get(bytes);
+ return bytes;
+ }
return (byte[]) fieldData;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
index 78bc458b76f..2d3cd457130 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
@@ -255,6 +255,7 @@ public class OutFileClause {
case HLL:
case BITMAP:
case QUANTILE_STATE:
+ case VARBINARY:
orcType = "binary";
break;
case DATEV2:
@@ -412,6 +413,7 @@ public class OutFileClause {
case HLL:
case BITMAP:
case QUANTILE_STATE:
+ case VARBINARY:
checkOrcType(schema.second, "binary", true,
resultType.getPrimitiveType().toString());
break;
case STRUCT:
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java
index cbec9deb0eb..29f93153738 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CheckAfterRewrite.java
@@ -143,7 +143,7 @@ public class CheckAfterRewrite extends
OneAnalysisRuleFactory {
if (plan instanceof LogicalAggregate) {
LogicalAggregate<?> agg = (LogicalAggregate<?>) plan;
for (Expression groupBy : agg.getGroupByExpressions()) {
- if (groupBy.getDataType().isObjectOrVariantType()) {
+ if (groupBy.getDataType().isObjectOrVariantType() ||
groupBy.getDataType().isVarBinaryType()) {
throw new AnalysisException(Type.OnlyMetricTypeErrorMsg);
}
}
@@ -181,11 +181,17 @@ public class CheckAfterRewrite extends
OneAnalysisRuleFactory {
for (Expression conjunct : join.getHashJoinConjuncts()) {
if (conjunct.anyMatch(e -> ((Expression)
e).getDataType().isVariantType())) {
throw new AnalysisException("variant type could not in
join equal conditions: " + conjunct.toSql());
+ } else if (conjunct.anyMatch(e -> ((Expression)
e).getDataType().isVarBinaryType())) {
+ throw new AnalysisException(
+ "varbinary type could not in join equal
conditions: " + conjunct.toSql());
}
}
for (Expression conjunct : join.getMarkJoinConjuncts()) {
if (conjunct.anyMatch(e -> ((Expression)
e).getDataType().isVariantType())) {
throw new AnalysisException("variant type could not in
join equal conditions: " + conjunct.toSql());
+ } else if (conjunct.anyMatch(e -> ((Expression)
e).getDataType().isVarBinaryType())) {
+ throw new AnalysisException(
+ "varbinary type could not in join equal
conditions: " + conjunct.toSql());
}
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VarBinaryType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VarBinaryType.java
index de5dfcbdfe2..a06b7c20053 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VarBinaryType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VarBinaryType.java
@@ -21,6 +21,8 @@ import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.nereids.types.coercion.PrimitiveType;
+import com.google.common.base.Preconditions;
+
import java.util.Objects;
/**
@@ -40,6 +42,8 @@ public class VarBinaryType extends PrimitiveType {
}
public VarBinaryType(int len) {
+ Preconditions.checkArgument(0 <= len && len <= MAX_VARBINARY_LENGTH,
+ "VarBinary length must be between 0 and " +
MAX_VARBINARY_LENGTH + ", but got: " + len);
this.len = len;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/TypeCoercionUtils.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/TypeCoercionUtils.java
index e02a2925674..0e1689caf72 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/TypeCoercionUtils.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/TypeCoercionUtils.java
@@ -1321,6 +1321,12 @@ public class TypeCoercionUtils {
Expression left = comparisonPredicate.left();
Expression right = comparisonPredicate.right();
+ // TODO: remove this restriction after supporting varbinary comparison in BE
+ if (left.getDataType().isVarBinaryType() ||
right.getDataType().isVarBinaryType()) {
+ throw new AnalysisException("data type varbinary "
+ + " could not used in ComparisonPredicate now " +
comparisonPredicate.toSql());
+ }
+
// same type
if (left.getDataType().equals(right.getDataType())) {
if (!supportCompare(left.getDataType(), false)) {
diff --git
a/regression-test/data/external_table_p0/export/test_hive_export_varbinary.out
b/regression-test/data/external_table_p0/export/test_hive_export_varbinary.out
new file mode 100644
index 00000000000..f50f72bd68a
--- /dev/null
+++
b/regression-test/data/external_table_p0/export/test_hive_export_varbinary.out
@@ -0,0 +1,29 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_tvf0 --
+1 0x550E8400E29B41D4A716446655440000 0x0123456789ABCDEF
+2 0x123E4567E89B12D3A456426614174000 0xFEDCBA9876543210
+3 0x00000000000000000000000000000000 0x00
+4 \N \N
+5 0xABCDEF1234567890 0xFFFF
+
+-- !select_tvf1 --
+1 0x550E8400E29B41D4A716446655440000 0x0123456789ABCDEF
+2 0x123E4567E89B12D3A456426614174000 0xFEDCBA9876543210
+3 0x00000000000000000000000000000000 0x00
+4 \N \N
+5 0xABCDEF1234567890 0xFFFF
+
+-- !select_tvf2 --
+1 0x550E8400E29B41D4A716446655440000 0x0123456789ABCDEF
+2 0x123E4567E89B12D3A456426614174000 0xFEDCBA9876543210
+3 0x00000000000000000000000000000000 0x00
+4 \N \N
+5 0xABCDEF1234567890 0xFFFF
+
+-- !select_tvf3 --
+1 0x550E8400E29B41D4A716446655440000 0x0123456789ABCDEF
+2 0x123E4567E89B12D3A456426614174000 0xFEDCBA9876543210
+3 0x00000000000000000000000000000000 0x00
+4 \N \N
+5 0xABCDEF1234567890 0xFFFF
+
diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc.out
b/regression-test/data/external_table_p0/hive/test_hive_orc.out
index 42f49602d9a..aa105acc1cd 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_orc.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_orc.out
@@ -1005,6 +1005,54 @@ tablets tinyint_col 179 182 182 187
183 181 177 183 177 187 183 202 202 186
528
4 71 986333570 88457348565826264 true 3.851428E7
5.177242499015833e+17 Football stumble result taste pleased midst. Mirror
loyal divide. Ultimately injury chip lawyer. Leadership teacher belong. \N
2022-08-26T19:19:31 140717.2626 phones smallint_col 2022-08-22
[] ["PWCCGPfT"] phones smallint_col
3 21 986131998 683875991736989008 \N 6.0774349E8
\N Weird period none. Assertion coincide college. Subscriber fridge craft.
Poisonous donation ordinary. Explode village debt. split terrify
2022-08-27T01:05:20 390407.1015 tablets tinyint_col 2015-11-04
[6.630172173849485e+17] [null, "lSQFYzUG", "vMVMwfZzpl", "QRFiYUUefBc",
"VdtTHy", "YrPtPPzynqXCCzm", "LfIgQvGimBBzlgn"] tablets tinyint_col
+-- !sql_topn_binary_col1 --
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+
+-- !sql_topn_binary_col2 --
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+
+-- !sql_topn_binary_col3 --
+0x79656C6C6F772070756E6368 yellow punch
+0x79657374657264617920736574746C656D656E7420756E636F6E7363696F75732062696F6C6F6779
yesterday settlement unconscious biology
+0x796574207265696E666F726365 yet reinforce
+0x7965742072656C6178 yet relax
+0x796F756E67206372656174697665207072657365727665 young creative preserve
+0x796F757220617070726F7072696174656C7920736574746C657220646F6D696E616E6365
your appropriately settler dominance
+0x796F75727320696E766164652070726976616379 yours invade privacy
+0x796F7572732073657373696F6E20636F756E63696C yours session council
+0x796F75746820626563617573652065787065646974696F6E youth because expedition
+0x7A65726F207765617220736561736F6E zero wear season
+
+-- !sql_topn_binary_col4 --
+0x79656C6C6F772070756E6368 yellow punch
+0x79657374657264617920736574746C656D656E7420756E636F6E7363696F75732062696F6C6F6779
yesterday settlement unconscious biology
+0x796574207265696E666F726365 yet reinforce
+0x7965742072656C6178 yet relax
+0x796F756E67206372656174697665207072657365727665 young creative preserve
+0x796F757220617070726F7072696174656C7920736574746C657220646F6D696E616E6365
your appropriately settler dominance
+0x796F75727320696E766164652070726976616379 yours invade privacy
+0x796F7572732073657373696F6E20636F756E63696C yours session council
+0x796F75746820626563617573652065787065646974696F6E youth because expedition
+0x7A65726F207765617220736561736F6E zero wear season
+
-- !select_top50 --
4 55 999742610 400899305488827731 false 6.5976813E8
7.87233046169374e+17 \N base tennis pit vertical friday
2022-08-19T07:29:58 \N tablets smallint_col 2019-02-07
[7.53124931825377e+17] ["NbSSBtwzpxNSkkwga"] tablets smallint_col
2 49 999613702 105493714032727452 \N 6.3322381E8
9.864232441024018e+17 Unveil bright recruit participate. Suspect impression
camera mathematical revelation. Fault live2 elbow debt west hydrogen current.
how literary 2022-09-03T17:20:21 481707.1065 tablets boolean_col
2020-01-12 [] ["HoMrAnn", "wteEFvIwoZsVpVQdscMb", null, "zcGFmv",
"kGEBBckbMtX", "hrEtCGFdPWZK"] tablets boolean_col
@@ -2011,3 +2059,51 @@ tablets tinyint_col 179 182 182 187
183 181 177 183 177 187 183 202 202 186
528
4 71 986333570 88457348565826264 true 3.851428E7
5.177242499015833e+17 Football stumble result taste pleased midst. Mirror
loyal divide. Ultimately injury chip lawyer. Leadership teacher belong. \N
2022-08-26T19:19:31 140717.2626 phones smallint_col 2022-08-22
[] ["PWCCGPfT"] phones smallint_col
3 21 986131998 683875991736989008 \N 6.0774349E8
\N Weird period none. Assertion coincide college. Subscriber fridge craft.
Poisonous donation ordinary. Explode village debt. split terrify
2022-08-27T01:05:20 390407.1015 tablets tinyint_col 2015-11-04
[6.630172173849485e+17] [null, "lSQFYzUG", "vMVMwfZzpl", "QRFiYUUefBc",
"VdtTHy", "YrPtPPzynqXCCzm", "LfIgQvGimBBzlgn"] tablets tinyint_col
+-- !sql_topn_binary_col1 --
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+
+-- !sql_topn_binary_col2 --
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+\N \N
+
+-- !sql_topn_binary_col3 --
+0x79656C6C6F772070756E6368 yellow punch
+0x79657374657264617920736574746C656D656E7420756E636F6E7363696F75732062696F6C6F6779
yesterday settlement unconscious biology
+0x796574207265696E666F726365 yet reinforce
+0x7965742072656C6178 yet relax
+0x796F756E67206372656174697665207072657365727665 young creative preserve
+0x796F757220617070726F7072696174656C7920736574746C657220646F6D696E616E6365
your appropriately settler dominance
+0x796F75727320696E766164652070726976616379 yours invade privacy
+0x796F7572732073657373696F6E20636F756E63696C yours session council
+0x796F75746820626563617573652065787065646974696F6E youth because expedition
+0x7A65726F207765617220736561736F6E zero wear season
+
+-- !sql_topn_binary_col4 --
+0x79656C6C6F772070756E6368 yellow punch
+0x79657374657264617920736574746C656D656E7420756E636F6E7363696F75732062696F6C6F6779
yesterday settlement unconscious biology
+0x796574207265696E666F726365 yet reinforce
+0x7965742072656C6178 yet relax
+0x796F756E67206372656174697665207072657365727665 young creative preserve
+0x796F757220617070726F7072696174656C7920736574746C657220646F6D696E616E6365
your appropriately settler dominance
+0x796F75727320696E766164652070726976616379 yours invade privacy
+0x796F7572732073657373696F6E20636F756E63696C yours session council
+0x796F75746820626563617573652065787065646974696F6E youth because expedition
+0x7A65726F207765617220736561736F6E zero wear season
+
diff --git
a/regression-test/data/external_table_p0/iceberg/test_iceberg_sys_table.out
b/regression-test/data/external_table_p0/iceberg/test_iceberg_sys_table.out
index f6b7072cc54..030a379d20b 100644
--- a/regression-test/data/external_table_p0/iceberg/test_iceberg_sys_table.out
+++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_sys_table.out
@@ -975,3 +975,38 @@ total_data_file_size_in_bytes bigint Yes true
\N NONE
-- !select_partitions_count --
9
+-- !varbinary_sys_table_desc --
+column_sizes map<int,bigint> Yes true \N NONE
+content int Yes true \N NONE
+content_offset bigint Yes true \N NONE
+content_size_in_bytes bigint Yes true \N NONE
+equality_ids array<int> Yes true \N NONE
+file_format text Yes true \N NONE
+file_path text Yes true \N NONE
+file_size_in_bytes bigint Yes true \N NONE
+first_row_id bigint Yes true \N NONE
+key_metadata varbinary(2147483647) Yes true \N NONE
+lower_bounds map<int,varbinary(2147483647)> Yes true \N NONE
+nan_value_counts map<int,bigint> Yes true \N NONE
+null_value_counts map<int,bigint> Yes true \N NONE
+readable_metrics
struct<id:struct<column_size:bigint,value_count:bigint,null_value_count:bigint,nan_value_count:bigint,lower_bound:int,upper_bound:int>,name:struct<column_size:bigint,value_count:bigint,null_value_count:bigint,nan_value_count:bigint,lower_bound:text,upper_bound:text>>
Yes true \N NONE
+record_count bigint Yes true \N NONE
+referenced_data_file text Yes true \N NONE
+sort_order_id int Yes true \N NONE
+spec_id int Yes true \N NONE
+split_offsets array<bigint> Yes true \N NONE
+upper_bounds map<int,varbinary(2147483647)> Yes true \N NONE
+value_counts map<int,bigint> Yes true \N NONE
+
+-- !varbinary_sys_table_select --
+0 PARQUET 1 {1:0x01000000, 2:0x416C696365} {1:0x01000000,
2:0x416C696365}
+0 PARQUET 1 {1:0x02000000, 2:0x426F622055706461746564}
{1:0x02000000, 2:0x426F622055706461746564}
+0 PARQUET 1 {1:0x02000000, 2:0x426F62} {1:0x02000000,
2:0x426F62}
+0 PARQUET 1 {1:0x04000000, 2:0x44617665} {1:0x04000000,
2:0x44617665}
+0 PARQUET 1 {1:0x05000000, 2:0x457665} {1:0x05000000,
2:0x457665}
+0 PARQUET 1 {1:0x06000000, 2:0x4672616E6B} {1:0x06000000,
2:0x4672616E6B}
+0 PARQUET 1 {1:0x07000000, 2:0x4772616365} {1:0x07000000,
2:0x4772616365}
+0 PARQUET 1 {1:0x08000000, 2:0x4865696469} {1:0x08000000,
2:0x4865696469}
+0 PARQUET 1 {1:0x09000000, 2:0x4976616E} {1:0x09000000,
2:0x4976616E}
+0 PARQUET 1 {1:0x0A000000, 2:0x4A756479} {1:0x0A000000,
2:0x4A756479}
+
diff --git
a/regression-test/suites/external_table_p0/export/test_hive_export_varbinary.groovy
b/regression-test/suites/external_table_p0/export/test_hive_export_varbinary.groovy
new file mode 100644
index 00000000000..dff673c6b61
--- /dev/null
+++
b/regression-test/suites/external_table_p0/export/test_hive_export_varbinary.groovy
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.codehaus.groovy.runtime.IOGroovyMethods
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+import java.nio.file.Paths
+
+suite("test_hive_export_varbinary", "external,hive,external_docker") {
+
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2"]) {
+ setHivePrefix(hivePrefix)
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String hdfs_port = context.config.otherConfigs.get(hivePrefix +
"HdfsPort")
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+
+ // It's okay to use random `hdfsUser`, but can not be empty.
+ def hdfsUserName = "doris"
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+ def outfile_path = "/user/doris/tmp_data"
+ def uri = "${defaultFS}" + "${outfile_path}/exp_"
+
+ def outfile_to_HDFS = {format,export_table_name ->
+ // select ... into outfile ...
+ def uuid = UUID.randomUUID().toString()
+ outfile_path = "/user/doris/tmp_data/${uuid}"
+ uri = "${defaultFS}" + "${outfile_path}/exp_"
+
+ def res = sql """
+ SELECT * FROM ${export_table_name} t ORDER BY id
+ INTO OUTFILE "${uri}"
+ FORMAT AS ${format}
+ PROPERTIES (
+ "fs.defaultFS"="${defaultFS}",
+ "hadoop.username" = "${hdfsUserName}"
+ );
+ """
+ logger.info("outfile success path: " + res[0][3]);
+ return res[0][3]
+ }
+
+ try {
+ String catalog_name_with_export =
"${hivePrefix}_test_varbinary_with_export"
+ sql """drop catalog if exists ${catalog_name_with_export}"""
+ sql """create catalog if not exists ${catalog_name_with_export}
properties (
+ "type"="hms",
+ 'hive.metastore.uris' =
'thrift://${externalEnvIp}:${hms_port}',
+ "enable.mapping.varbinary"="true"
+ );"""
+
+ sql """ switch ${catalog_name_with_export}"""
+ sql """ use `test_varbinary` """
+
+ // test outfile to hdfs
+ def format = "parquet"
+ def export_table_name = "test_hive_binary_parquet"
+
+ def outfile_url0 = outfile_to_HDFS(format, export_table_name)
+ order_qt_select_tvf0 """ select * from HDFS(
+ "uri" = "${outfile_url0}.${format}",
+ "hadoop.username" = "${hdfsUserName}",
+ "enable_mapping_varbinary"="true",
+ "format" = "${format}");
+ """
+
+ format = "parquet"
+ export_table_name = "test_hive_binary_orc"
+ def outfile_url1 = outfile_to_HDFS(format, export_table_name)
+ order_qt_select_tvf1 """ select * from HDFS(
+ "uri" = "${outfile_url1}.${format}",
+ "hadoop.username" = "${hdfsUserName}",
+ "enable_mapping_varbinary"="true",
+ "format" = "${format}");
+ """
+
+ format = "orc"
+ export_table_name = "test_hive_binary_parquet"
+ def outfile_url2 = outfile_to_HDFS(format, export_table_name)
+ order_qt_select_tvf2 """ select * from HDFS(
+ "uri" = "${outfile_url2}.${format}",
+ "hadoop.username" = "${hdfsUserName}",
+ "enable_mapping_varbinary"="true",
+ "format" = "${format}");
+ """
+
+ format = "orc"
+ export_table_name = "test_hive_binary_orc"
+ def outfile_url3 = outfile_to_HDFS(format, export_table_name)
+ order_qt_select_tvf3 """ select * from HDFS(
+ "uri" = "${outfile_url3}.${format}",
+ "hadoop.username" = "${hdfsUserName}",
+ "enable_mapping_varbinary"="true",
+ "format" = "${format}");
+ """
+
+ } finally {
+ }
+ }
+}
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
index 380169fa667..8e19fe581d5 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
@@ -204,6 +204,61 @@ suite("test_hive_orc",
"all_types,p0,external,hive,external_docker,external_dock
sql """use `${catalog_name}`.`default`"""
select_top50()
sql """drop catalog if exists ${catalog_name}"""
+
+ sql """drop catalog if exists test_hive_orc_mapping_varbinary"""
+ sql """create catalog if not exists
test_hive_orc_mapping_varbinary properties (
+ "type"="hms",
+ 'hive.metastore.uris' =
'thrift://${externalEnvIp}:${hms_port}',
+ 'enable.mapping.varbinary' = 'true'
+ );"""
+ sql """use `test_hive_orc_mapping_varbinary`.`default`"""
+
+ explain {
+ sql("select binary_col from orc_all_types order by
binary_col,string_col asc limit 10;")
+ contains("TOPN OPT:1")
+ }
+ explain {
+ sql("select binary_col from orc_all_types order by
binary_col asc limit 10;")
+ contains("TOPN OPT:1")
+ }
+ order_qt_sql_topn_binary_col1 """ select
binary_col,cast(binary_col as string) from orc_all_types order by binary_col
asc limit 10; """
+ order_qt_sql_topn_binary_col2 """ select
binary_col,cast(binary_col as string) from orc_all_types order by binary_col
asc ,string_col asc limit 10; """
+ order_qt_sql_topn_binary_col3 """ select
binary_col,cast(binary_col as string) from orc_all_types order by binary_col
desc limit 10; """
+ order_qt_sql_topn_binary_col4 """ select
binary_col,cast(binary_col as string) from orc_all_types order by binary_col
desc,string_col desc limit 10; """
+
+ sql """ switch internal; """
+ sql """ drop database if exists test_view_varbinary_db"""
+ sql """ create database if not exists test_view_varbinary_db"""
+ sql """use test_view_varbinary_db"""
+ test {
+ sql " create view test_view_varbinary as select binary_col
from `test_hive_orc_mapping_varbinary`.`default`.`orc_all_types`; "
+ exception " View does not support VARBINARY type: binary_col"
+ }
+
+ test {
+ sql """ CREATE MATERIALIZED VIEW test_mv_varbinary
+ BUILD DEFERRED REFRESH AUTO ON MANUAL
+ DISTRIBUTED BY RANDOM BUCKETS 2
+ PROPERTIES ('replication_num' = '1')
+ AS select binary_col from
`test_hive_orc_mapping_varbinary`.`default`.`orc_all_types`; """
+ exception " MTMV do not support varbinary type : binary_col"
+ }
+
+ test {
+ sql " select count() from
`test_hive_orc_mapping_varbinary`.`default`.`orc_all_types` group by
binary_col; "
+ exception " errCode = 2"
+ }
+
+ test {
+ sql " select * from
`test_hive_orc_mapping_varbinary`.`default`.`orc_all_types` as a join
`test_hive_orc_mapping_varbinary`.`default`.`orc_all_types` as b on
a.binary_col = b.binary_col; "
+ exception " errCode = 2,"
+ }
+
+ test {
+ sql " select * from
`test_hive_orc_mapping_varbinary`.`default`.`orc_all_types` where binary_col =
X'AB'; "
+ exception " could not used in ComparisonPredicate now"
+ }
+
} finally {
}
}
diff --git
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_sys_table.groovy
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_sys_table.groovy
index 0b2925a0d6e..0efa0e0233a 100644
---
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_sys_table.groovy
+++
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_sys_table.groovy
@@ -390,4 +390,23 @@ suite("test_iceberg_sys_table",
"p0,external,doris,external_docker,external_dock
sql """select committed_at, snapshot_id, parent_id, operation from
${catalog_name}.${db_name}.test_iceberg_systable_tbl1\$snapshots"""
}
try_sql("DROP USER ${user}")
+
+ sql """drop catalog if exists test_iceberg_varbinary_sys"""
+ sql """
+ CREATE CATALOG test_iceberg_varbinary_sys PROPERTIES (
+ 'type'='iceberg',
+ 'iceberg.catalog.type'='rest',
+ 'uri' = 'http://${externalEnvIp}:${rest_port}',
+ "s3.access_key" = "admin",
+ "s3.secret_key" = "password",
+ "s3.endpoint" = "http://${externalEnvIp}:${minio_port}",
+ "s3.region" = "us-east-1",
+ 'enable.mapping.varbinary' = 'true'
+ );"""
+
+ sql """switch test_iceberg_varbinary_sys """
+ sql """use ${db_name}"""
+
+ order_qt_varbinary_sys_table_desc """desc
test_iceberg_systable_unpartitioned\$files"""
+ order_qt_varbinary_sys_table_select """select content, file_format,
record_count, lower_bounds, upper_bounds from
test_iceberg_systable_unpartitioned\$files;"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]