This is an automated email from the ASF dual-hosted git repository.
zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 5a9cb5aa255 [optimize](function) modify to_base64, from_base64, unhex
to avoid an extra copy (#57182)
5a9cb5aa255 is described below
commit 5a9cb5aa255c6fb8f0ea39296200488188ae18c7
Author: admiring_xm <[email protected]>
AuthorDate: Wed Oct 22 16:11:43 2025 +0800
[optimize](function) modify to_base64, from_base64, unhex to avoid an extra
copy (#57182)
Benchmark names below have the form method/a/b, where a is the number of
rows in the input column and b is the length of the string in each row.
```text
Run on (24 X 2395.45 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x12)
L1 Instruction 32 KiB (x12)
L2 Unified 1024 KiB (x12)
L3 Unified 32768 KiB (x1)
Load Average: 3.32, 2.73, 2.74
--------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------
BM_ToBase64Impl_Old/1000/256 151122 ns 151123 ns 4623
BM_ToBase64Impl_Old/100/65536 801007 ns 800983 ns 785
BM_ToBase64Impl_Old/10/100000 115405 ns 115405 ns 6024
BM_ToBase64Impl_New/1000/256 126745 ns 126732 ns 5512
BM_ToBase64Impl_New/100/65536 449138 ns 449122 ns 1304
BM_ToBase64Impl_New/10/100000 54417 ns 54373 ns 12773
BM_FromBase64Impl_Old/1000/256 95666 ns 95666 ns 7315
BM_FromBase64Impl_Old/100/65536 750223 ns 750193 ns 867
BM_FromBase64Impl_Old/10/100000 113150 ns 113146 ns 6115
BM_FromBase64Impl_New/1000/256 79121 ns 79121 ns 8847
BM_FromBase64Impl_New/100/65536 522309 ns 521026 ns 1214
BM_FromBase64Impl_New/10/100000 78207 ns 78205 ns 8929
BM_UnhexImpl_Old/1000/256 13058 ns 13058 ns 53759
BM_UnhexImpl_Old/100/65536 1328 ns 1327 ns 484917
BM_UnhexImpl_Old/100/100000 1319 ns 1319 ns 529219
BM_UnhexImpl_New/1000/256 6045 ns 6023 ns 116043
BM_UnhexImpl_New/100/65536 656 ns 656 ns 1056606
BM_UnhexImpl_New/100/100000 710 ns 710 ns 984401
BM_UnhexNullImpl_Old/1000/256 6413 ns 6413 ns 109417
BM_UnhexNullImpl_Old/100/65536 686 ns 681 ns 1054879
BM_UnhexNullImpl_Old/100/100000 725 ns 725 ns 957391
BM_UnhexNullImpl_New/1000/256 6144 ns 6144 ns 114054
BM_UnhexNullImpl_New/100/65536 664 ns 664 ns 981229
BM_UnhexNullImpl_New/100/100000 710 ns 708 ns 982551
```
---
be/benchmark/benchmark_main.cpp | 1 +
be/benchmark/benchmark_string.hpp | 394 ++++++++++++++++++++++++++
be/src/vec/functions/function_string.cpp | 142 +++++-----
be/test/vec/function/function_string_test.cpp | 19 +-
4 files changed, 489 insertions(+), 67 deletions(-)
diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp
index 950c55b5883..6e7213a89ec 100644
--- a/be/benchmark/benchmark_main.cpp
+++ b/be/benchmark/benchmark_main.cpp
@@ -20,6 +20,7 @@
#include "benchmark_bit_pack.hpp"
#include "benchmark_fastunion.hpp"
#include "benchmark_hll_merge.hpp"
+#include "benchmark_string.hpp"
#include "binary_cast_benchmark.hpp"
#include "vec/columns/column_string.h"
#include "vec/core/block.h"
diff --git a/be/benchmark/benchmark_string.hpp
b/be/benchmark/benchmark_string.hpp
new file mode 100644
index 00000000000..3bd2a2d442a
--- /dev/null
+++ b/be/benchmark/benchmark_string.hpp
@@ -0,0 +1,394 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <benchmark/benchmark.h>
+
+#include <random>
+#include <vector>
+
+#include "vec/functions/function_string.cpp"
+#include "vec/functions/string_hex_util.h"
+
+namespace doris::vectorized {
+
+// old logic for to_base64
+struct OldToBase64Impl {
+ static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets) {
+ auto rows_count = offsets.size();
+ dst_offsets.resize(rows_count);
+ std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
+ std::vector<char> heap_buf;
+ for (int i = 0; i < rows_count; ++i) {
+ const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
+ size_t srclen = offsets[i] - offsets[i - 1];
+
+ if (srclen == 0) {
+ StringOP::push_empty_string(i, dst_data, dst_offsets);
+ continue;
+ }
+
+ auto cipher_len = 4 * ((srclen + 2) / 3);
+ char* dst = nullptr;
+ if (cipher_len <= stack_buf.size()) {
+ dst = stack_buf.data();
+ } else {
+ heap_buf.resize(cipher_len);
+ dst = heap_buf.data();
+ }
+
+ auto outlen = base64_encode((const unsigned char*)source, srclen,
(unsigned char*)dst);
+
+ StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ }
+ return Status::OK();
+ }
+};
+
+// old logic for from_base64
+struct OldFromBase64Impl {
+ static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets,
+ NullMap& null_map) {
+ auto rows_count = offsets.size();
+ dst_offsets.resize(rows_count);
+ std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
+ std::vector<char> heap_buf;
+ for (int i = 0; i < rows_count; ++i) {
+ if (null_map[i]) {
+ StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+ continue;
+ }
+
+ const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
+ ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
+
+ if (srclen == 0) {
+ StringOP::push_empty_string(i, dst_data, dst_offsets);
+ continue;
+ }
+
+ auto cipher_len = srclen / 4 * 3;
+ char* dst = nullptr;
+ if (cipher_len <= stack_buf.size()) {
+ dst = stack_buf.data();
+ } else {
+ heap_buf.resize(cipher_len);
+ dst = heap_buf.data();
+ }
+ auto outlen = base64_decode(source, srclen, dst);
+
+ if (outlen < 0) {
+ StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+ } else {
+ StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data,
+ dst_offsets);
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
+// old logic for unhex
+struct OldUnHexImpl {
+ static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets) {
+ auto rows_count = offsets.size();
+ dst_offsets.resize(rows_count);
+ std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
+ std::vector<char> heap_buf;
+ for (int i = 0; i < rows_count; ++i) {
+ const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
+ ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
+
+ if (srclen == 0) {
+ StringOP::push_empty_string(i, dst_data, dst_offsets);
+ continue;
+ }
+
+ auto cipher_len = srclen / 2;
+ char* dst = nullptr;
+ if (cipher_len <= stack_buf.size()) {
+ dst = stack_buf.data();
+ } else {
+ heap_buf.resize(cipher_len);
+ dst = heap_buf.data();
+ }
+
+ int outlen = string_hex::hex_decode(source, srclen, dst);
+ StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ }
+
+ return Status::OK();
+ }
+
+ static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets,
+ ColumnUInt8::Container* null_map_data) {
+ auto rows_count = offsets.size();
+ dst_offsets.resize(rows_count);
+ std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
+ std::vector<char> heap_buf;
+ for (int i = 0; i < rows_count; ++i) {
+ const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
+ ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
+
+ if (srclen == 0) {
+ StringOP::push_null_string(i, dst_data, dst_offsets,
*null_map_data);
+ continue;
+ }
+
+ auto cipher_len = srclen / 2;
+ char* dst = nullptr;
+ if (cipher_len <= stack_buf.size()) {
+ dst = stack_buf.data();
+ } else {
+ heap_buf.resize(cipher_len);
+ dst = heap_buf.data();
+ }
+
+ int outlen = string_hex::hex_decode(source, srclen, dst);
+ if (outlen == 0) {
+ StringOP::push_null_string(i, dst_data, dst_offsets,
*null_map_data);
+ continue;
+ }
+
+ StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ }
+
+ return Status::OK();
+ }
+};
+
+static void generate_test_data(ColumnString::Chars& data,
ColumnString::Offsets& offsets,
+ size_t num_rows, size_t str_len, unsigned char
max_char) {
+ const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+ std::mt19937 rng(12345);
+ std::uniform_int_distribution<unsigned char> dist(0, max_char);
+
+ offsets.resize(num_rows);
+ data.clear();
+ data.reserve(num_rows * str_len);
+
+ size_t offset = 0;
+ for (size_t i = 0; i < num_rows; ++i) {
+ for (size_t j = 0; j < str_len; ++j) {
+ data.push_back(static_cast<char>(base64_chars[dist(rng)]));
+ }
+ offset += str_len;
+ offsets[i] = cast_set<uint32_t>(offset);
+ }
+}
+
+static void BM_ToBase64Impl_Old(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ generate_test_data(data, offsets, rows, len, 63);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(OldToBase64Impl::vector(data, offsets,
dst_data, dst_offsets));
+ }
+}
+
+static void BM_ToBase64Impl_New(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ generate_test_data(data, offsets, rows, len, 63);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(ToBase64Impl::vector(data, offsets, dst_data,
dst_offsets));
+ }
+}
+
+// 10, 100000 is a big data test case for testing memory allocation on the heap
+BENCHMARK(BM_ToBase64Impl_Old)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({10, 100000})
+ ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_ToBase64Impl_New)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({10, 100000})
+ ->Unit(benchmark::kNanosecond);
+
+static void BM_FromBase64Impl_Old(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ auto null_map = ColumnUInt8::create(rows, 0);
+ generate_test_data(data, offsets, rows, len, 63);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(OldFromBase64Impl::vector(data, offsets,
dst_data, dst_offsets,
+
null_map->get_data()));
+ }
+}
+
+static void BM_FromBase64Impl_New(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ auto null_map = ColumnUInt8::create(rows, 0);
+ generate_test_data(data, offsets, rows, len, 63);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(
+ FromBase64Impl::vector(data, offsets, dst_data, dst_offsets,
null_map->get_data()));
+ }
+}
+
+// 10, 100000 is a big data test case for testing memory allocation on the heap
+BENCHMARK(BM_FromBase64Impl_Old)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({10, 100000})
+ ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_FromBase64Impl_New)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({10, 100000})
+ ->Unit(benchmark::kNanosecond);
+
+static void BM_UnhexImpl_Old(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ generate_test_data(data, offsets, rows, len, 16);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(OldUnHexImpl::vector(data, offsets, dst_data,
dst_offsets));
+ }
+}
+
+static void BM_UnhexImpl_New(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ generate_test_data(data, offsets, rows, len, 16);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(
+ UnHexImpl<UnHexImplEmpty>::vector(data, offsets, dst_data,
dst_offsets));
+ }
+}
+
+// 100, 100000 is a big data test case for testing memory allocation on the
heap
+BENCHMARK(BM_UnhexImpl_Old)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({100, 100000})
+ ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_UnhexImpl_New)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({100, 100000})
+ ->Unit(benchmark::kNanosecond);
+
+static void BM_UnhexNullImpl_Old(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ auto null_map = ColumnUInt8::create(rows, 0);
+ generate_test_data(data, offsets, rows, len, 16);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(
+ OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets,
&null_map->get_data()));
+ }
+}
+
+static void BM_UnhexNullImpl_New(benchmark::State& state) {
+ size_t rows = state.range(0);
+ size_t len = state.range(1);
+ ColumnString::Chars data;
+ ColumnString::Offsets offsets;
+ auto null_map = ColumnUInt8::create(rows, 0);
+ generate_test_data(data, offsets, rows, len, 16);
+
+ ColumnString::Chars dst_data;
+ ColumnString::Offsets dst_offsets;
+
+ for (auto _ : state) {
+ dst_data.clear();
+ dst_offsets.clear();
+ benchmark::DoNotOptimize(UnHexImpl<UnHexImplNull>::vector(
+ data, offsets, dst_data, dst_offsets, &null_map->get_data()));
+ }
+}
+
+// 100, 100000 is a big data test case for testing memory allocation on the
heap
+BENCHMARK(BM_UnhexNullImpl_Old)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({100, 100000})
+ ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_UnhexNullImpl_New)
+ ->Args({1000, 256})
+ ->Args({100, 65536})
+ ->Args({100, 100000})
+ ->Unit(benchmark::kNanosecond);
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/functions/function_string.cpp
b/be/src/vec/functions/function_string.cpp
index 7a6c71eadbe..4bdcc88428e 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -978,30 +978,32 @@ struct UnHexImpl {
ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets) {
auto rows_count = offsets.size();
dst_offsets.resize(rows_count);
- std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
- std::vector<char> heap_buf;
+
+ int64_t total_size = 0;
+ for (size_t i = 0; i < rows_count; i++) {
+ size_t len = offsets[i] - offsets[i - 1];
+ total_size += len / 2;
+ }
+ ColumnString::check_chars_length(total_size, rows_count);
+ dst_data.resize(total_size);
+ char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+ size_t offset = 0;
+
for (int i = 0; i < rows_count; ++i) {
const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
- if (srclen == 0) {
- StringOP::push_empty_string(i, dst_data, dst_offsets);
+ if (UNLIKELY(srclen == 0)) {
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
- auto cipher_len = srclen / 2;
- char* dst = nullptr;
- if (cipher_len <= stack_buf.size()) {
- dst = stack_buf.data();
- } else {
- heap_buf.resize(cipher_len);
- dst = heap_buf.data();
- }
+ int outlen = string_hex::hex_decode(source, srclen, dst_data_ptr +
offset);
- int outlen = string_hex::hex_decode(source, srclen, dst);
- StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ offset += outlen;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
}
-
+ dst_data.pop_back(total_size - offset);
return Status::OK();
}
@@ -1010,35 +1012,39 @@ struct UnHexImpl {
ColumnUInt8::Container* null_map_data) {
auto rows_count = offsets.size();
dst_offsets.resize(rows_count);
- std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
- std::vector<char> heap_buf;
+
+ int64_t total_size = 0;
+ for (size_t i = 0; i < rows_count; i++) {
+ size_t len = offsets[i] - offsets[i - 1];
+ total_size += len / 2;
+ }
+ ColumnString::check_chars_length(total_size, rows_count);
+ dst_data.resize(total_size);
+ char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+ size_t offset = 0;
+
for (int i = 0; i < rows_count; ++i) {
const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
- if (srclen == 0) {
- StringOP::push_null_string(i, dst_data, dst_offsets,
*null_map_data);
+ if (UNLIKELY(srclen == 0)) {
+ (*null_map_data)[i] = 1;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
- auto cipher_len = srclen / 2;
- char* dst = nullptr;
- if (cipher_len <= stack_buf.size()) {
- dst = stack_buf.data();
- } else {
- heap_buf.resize(cipher_len);
- dst = heap_buf.data();
- }
+ int outlen = string_hex::hex_decode(source, srclen, dst_data_ptr +
offset);
- int outlen = string_hex::hex_decode(source, srclen, dst);
if (outlen == 0) {
- StringOP::push_null_string(i, dst_data, dst_offsets,
*null_map_data);
+ (*null_map_data)[i] = 1;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
- StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ offset += outlen;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
}
-
+ dst_data.pop_back(total_size - offset);
return Status::OK();
}
};
@@ -1088,30 +1094,33 @@ struct ToBase64Impl {
ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets) {
auto rows_count = offsets.size();
dst_offsets.resize(rows_count);
- std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
- std::vector<char> heap_buf;
+
+ size_t total_size = 0;
+ for (size_t i = 0; i < rows_count; i++) {
+ size_t len = offsets[i] - offsets[i - 1];
+ total_size += 4 * ((len + 2) / 3);
+ }
+ ColumnString::check_chars_length(total_size, rows_count);
+ dst_data.resize(total_size);
+ auto* dst_data_ptr = dst_data.data();
+ size_t offset = 0;
+
for (int i = 0; i < rows_count; ++i) {
const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
size_t srclen = offsets[i] - offsets[i - 1];
- if (srclen == 0) {
- StringOP::push_empty_string(i, dst_data, dst_offsets);
+ if (UNLIKELY(srclen == 0)) {
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
- auto cipher_len = srclen / 2;
- char* dst = nullptr;
- if (cipher_len <= stack_buf.size()) {
- dst = stack_buf.data();
- } else {
- heap_buf.resize(cipher_len);
- dst = heap_buf.data();
- }
-
- auto outlen = base64_encode((const unsigned char*)source, srclen,
(unsigned char*)dst);
+ auto outlen = doris::base64_encode((const unsigned char*)source,
srclen,
+ (unsigned char*)(dst_data_ptr +
offset));
- StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data, dst_offsets);
+ offset += outlen;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
}
+ dst_data.pop_back(total_size - offset);
return Status::OK();
}
};
@@ -1126,40 +1135,43 @@ struct FromBase64Impl {
NullMap& null_map) {
auto rows_count = offsets.size();
dst_offsets.resize(rows_count);
- std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
- std::vector<char> heap_buf;
+
+ size_t total_size = 0;
+ for (size_t i = 0; i < rows_count; i++) {
+ auto len = offsets[i] - offsets[i - 1];
+ total_size += len / 4 * 3;
+ }
+ ColumnString::check_chars_length(total_size, rows_count);
+ dst_data.resize(total_size);
+ char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+ size_t offset = 0;
+
for (int i = 0; i < rows_count; ++i) {
- if (null_map[i]) {
- StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+ if (UNLIKELY(null_map[i])) {
+ null_map[i] = 1;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
const auto* source = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
- if (srclen == 0) {
- StringOP::push_empty_string(i, dst_data, dst_offsets);
+ if (UNLIKELY(srclen == 0)) {
+ dst_offsets[i] = cast_set<uint32_t>(offset);
continue;
}
- auto cipher_len = srclen / 2;
- char* dst = nullptr;
- if (cipher_len <= stack_buf.size()) {
- dst = stack_buf.data();
- } else {
- heap_buf.resize(cipher_len);
- dst = heap_buf.data();
- }
- auto outlen = base64_decode(source, srclen, dst);
+ auto outlen = base64_decode(source, srclen, dst_data_ptr + offset);
if (outlen < 0) {
- StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+ null_map[i] = 1;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
} else {
- StringOP::push_value_string(std::string_view(dst, outlen), i,
dst_data,
- dst_offsets);
+ offset += outlen;
+ dst_offsets[i] = cast_set<uint32_t>(offset);
}
}
-
+ dst_data.pop_back(total_size - offset);
return Status::OK();
}
};
diff --git a/be/test/vec/function/function_string_test.cpp
b/be/test/vec/function/function_string_test.cpp
index 84272d4475c..6ce498cf630 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -2515,6 +2515,12 @@ TEST(function_string_test, function_hex_test) {
{{std::string("23 12 --!__!_!__!")},
std::string("3233203132202D2D215F5F215F215F5F21")},
{{std::string("112+ + +")}, std::string("3131322B202B202B")},
{{std::string(" + 23 ")},
std::string("20202020202B20202020202020323320")},
+ {{std::string("😀🍕")}, std::string("F09F9880F09F8D95")},
+ {{std::string("测试")}, std::string("E6B58BE8AF95")},
+ {{std::string("こんにちは")},
std::string("E38193E38293E381ABE381A1E381AF")},
+ {{std::string("안녕하세요")},
std::string("EC9588EB8595ED9598EC84B8EC9A94")},
+ {{std::string("테스트")}, std::string("ED858CEC8AA4ED8AB8")},
+ {{std::string("🎉🍔")}, std::string("F09F8E89F09F8D94")},
};
check_function_all_arg_comb<DataTypeString, true>(func_name, input_types,
data_set);
}
@@ -2525,16 +2531,19 @@ TEST(function_string_test, function_unhex_test) {
DataSet data_set = {
{{std::string("41624364456667")}, std::string("AbCdEfg")},
{{std::string("E4BDA0E5A5BD48454C4C4F")}, std::string("δ½ ε₯½HELLO")},
+ {{std::string("F09F9880F09F8D95")}, std::string("ππ")},
+ {{std::string("E6B58BE8AF95")}, std::string("ζ΅θ―")},
{{std::string("")}, std::string("")},
{{Null()}, Null()},
{{std::string("21402324402A2028212623")}, std::string("!@#$@*
(!&#")},
{{std::string("4A534B41422851405F5F21")},
std::string("JSKAB(Q@__!")},
- // {{std::string("M4D59207465737420537472E4BDA0E5A5BD2020")},
Null()},
{{std::string("2020202020202020202020202020202020")},
std::string(" ")},
{{std::string("3233203132202D2D215F5F215F215F5F21")},
std::string("23 12 --!__!_!__!")},
{{std::string("3131322B202B202B")}, std::string("112+ + +")},
{{std::string("20202020202B20202020202020323320")}, std::string("
+ 23 ")},
- // {{std::string("!")}, Null()},
+ {{std::string("E38193E38293E381ABE381A1E381AF")},
std::string("こんにちは")},
+ {{std::string("EC9588EB8595ED9598EC84B8EC9A94")},
std::string("안녕하세요")},
+ {{std::string("ED858CEC8AA4ED8AB8")}, std::string("테스트")},
};
check_function_all_arg_comb<DataTypeString, true>(unhex_func_name,
input_types, data_set);
@@ -2542,6 +2551,8 @@ TEST(function_string_test, function_unhex_test) {
data_set = {
{{std::string("41624364456667")}, std::string("AbCdEfg")},
{{std::string("E4BDA0E5A5BD48454C4C4F")}, std::string("δ½ ε₯½HELLO")},
+ {{std::string("F09F9880F09F8D95")}, std::string("ππ")},
+ {{std::string("E6B58BE8AF95")}, std::string("ζ΅θ―")},
{{std::string("")}, Null()},
{{Null()}, Null()},
{{std::string("21402324402A2028212623")}, std::string("!@#$@*
(!&#")},
@@ -2553,6 +2564,10 @@ TEST(function_string_test, function_unhex_test) {
{{std::string("20202020202B20202020202020323320")}, std::string("
+ 23 ")},
{{std::string("41G42")}, Null()},
{{std::string("!")}, Null()},
+ {{std::string("F09F8E89F09F8D94")}, std::string("🎉🍔")},
+ {{std::string("E38193E38293E381ABE381A1E381AF")},
std::string("こんにちは")},
+ {{std::string("EC9588EB8595ED9598EC84B8EC9A94")},
std::string("안녕하세요")},
+ {{std::string("ED858CEC8AA4ED8AB8")}, std::string("테스트")},
};
check_function_all_arg_comb<DataTypeString, true>(unhex_null_func_name,
input_types, data_set);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]