This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b9274bc9e0 GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation
and len<=0 handling (#49421)
b9274bc9e0 is described below
commit b9274bc9e043338e070f342503e392b935e5fbfa
Author: Dmitry Chirkov <[email protected]>
AuthorDate: Thu Mar 12 22:56:56 2026 -0700
GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation and len<=0
handling (#49421)
### Rationale for this change
The `castVARCHAR` functions in Gandiva have memory allocation
inefficiencies and missing edge case handling. See GH-49420 for details.
### What changes are included in this PR?
**Functional fixes:**
- `bool`: Remove unused 5-byte arena allocation; return string literal
directly
- `int32`/`int64`: Add handling for `len=0` (return empty string) and
`len<0` (set error)
**Memory allocation optimizations:**
- `int32`/`int64`: Allocate fixed small buffer (11/20 bytes) directly in
arena, use optimized digit-pair conversion writing right-to-left, then
`memmove` to align. Returns `min(len, actual_size)` bytes.
- `date64`: Allocate only `min(len, 10)` bytes upfront (output is always
"YYYY-MM-DD")
- `float32`/`float64`: Allocate only `min(len, 24)` bytes upfront (max
output length)
**Code cleanup:**
- Extract common code into helper macros to reduce duplication
### Are these changes tested?
Yes. Added tests for `len=0` and `len<0` edge cases for int64, date64,
float32, float64, and bool types. All existing Gandiva tests pass. Adhoc
performance benchmarking was performed both via direct expression evaluation as
well as via query execution via Dremio.
### Are there any user-facing changes?
No. Users will see reduced memory usage and proper error messages for
invalid len parameter values.
Note: Error messages for negative `len` remain different between
precompiled ("Output buffer length can't be negative") and interpreted ("Buffer
length cannot be negative") code paths, preserving existing behavior.
* GitHub Issue: #49420
Authored-by: Dmitry Chirkov <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/gandiva/gdv_function_stubs_test.cc | 44 +++++++
cpp/src/gandiva/gdv_string_function_stubs.cc | 169 ++++++++++++++-----------
cpp/src/gandiva/precompiled/string_ops.cc | 12 +-
cpp/src/gandiva/precompiled/string_ops_test.cc | 4 +
4 files changed, 150 insertions(+), 79 deletions(-)
diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc
b/cpp/src/gandiva/gdv_function_stubs_test.cc
index cbe24b7423..1e3d1df7c3 100644
--- a/cpp/src/gandiva/gdv_function_stubs_test.cc
+++ b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -350,6 +350,14 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromInt64) {
out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 12345, 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "123");
EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 347, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 347, -1, &out_len);
+ EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be
negative"));
+ ctx.Reset();
}
TEST(TestGdvFnStubs, TestCastVARCHARFromMilliseconds) {
@@ -381,6 +389,15 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromMilliseconds) {
out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, 4, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "2008");
EXPECT_FALSE(ctx.has_error());
+
+ ts = StringToTimestamp("2021-04-23 10:20:33");
+ out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, -1, &out_len);
+ EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be
negative"));
+ ctx.Reset();
}
TEST(TestGdvFnStubs, TestCastVARCHARFromFloat) {
@@ -416,6 +433,14 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromFloat) {
out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "1.2");
EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, -1, &out_len);
+ EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be
negative"));
+ ctx.Reset();
}
TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
@@ -451,6 +476,25 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "1.2");
EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, -1, &out_len);
+ EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be
negative"));
+ ctx.Reset();
+
+ // test long repeating decimal (1/3) with large buffer
+ out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.0 / 3.0, 100,
&out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "0.3333333333333333");
+ EXPECT_FALSE(ctx.has_error());
+
+ // test exponential notation with large negative exponent (24 chars)
+ out_str =
+ gdv_fn_castVARCHAR_float64_int64(ctx_ptr, -1.2345678901234567e-100, 100,
&out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "-1.2345678901234567E-100");
+ EXPECT_FALSE(ctx.has_error());
}
TEST(TestGdvFnStubs, TestSubstringIndex) {
diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc
b/cpp/src/gandiva/gdv_string_function_stubs.cc
index e7982461b4..66c6b4ec0e 100644
--- a/cpp/src/gandiva/gdv_string_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_string_function_stubs.cc
@@ -18,8 +18,10 @@
#include "gandiva/gdv_function_stubs.h"
#include <utf8proc.h>
+#include <limits>
#include <string>
#include <string_view>
+#include <type_traits>
#include <unordered_map>
#include <vector>
@@ -81,91 +83,110 @@ const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t
ptr, int64_t holder_pt
return (*holder)(context, data, data_len, extract_index, out_length);
}
-#define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
- GANDIVA_EXPORT
\
- const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
- int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {
\
- if (len < 0) {
\
- gdv_fn_context_set_error_msg(context, "Buffer length cannot be
negative"); \
- *out_len = 0;
\
- return "";
\
- }
\
- if (len == 0) {
\
- *out_len = 0;
\
- return "";
\
- }
\
- arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;
\
- char* ret = reinterpret_cast<char*>(
\
- gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));
\
- if (ret == nullptr) {
\
- gdv_fn_context_set_error_msg(context, "Could not allocate memory");
\
- *out_len = 0;
\
- return "";
\
- }
\
- arrow::Status status = formatter(value, [&](std::string_view v) {
\
- int64_t size = static_cast<int64_t>(v.size());
\
- *out_len = static_cast<int32_t>(len < size ? len : size);
\
- memcpy(ret, v.data(), *out_len);
\
- return arrow::Status::OK();
\
- });
\
- if (!status.ok()) {
\
- std::string err = "Could not cast " + std::to_string(value) + " to
string"; \
- gdv_fn_context_set_error_msg(context, err.c_str());
\
- *out_len = 0;
\
- return "";
\
- }
\
- return ret;
\
+// The following castVARCHAR macros are optimized to allocate only the actual
+// string size instead of the maximum buffer length (which can be 65536+
bytes).
+
+// Helper: arena allocation + null check
+#define GDV_FN_CAST_VARLEN_ALLOC(SIZE)
\
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
SIZE)); \
+ if (ret == nullptr) {
\
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory");
\
+ *out_len = 0;
\
+ return "";
\
+ }
+
+// Helper: function signature + len validation
+#define GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)
\
+ GANDIVA_EXPORT
\
+ const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
+ int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {
\
+ if (len < 0) {
\
+ gdv_fn_context_set_error_msg(context, "Buffer length cannot be
negative"); \
+ *out_len = 0;
\
+ return "";
\
+ }
\
+ if (len == 0) {
\
+ *out_len = 0;
\
+ return "";
\
+ }
+
+// Macro for integer types (int32/int64). Uses
arrow::internal::detail::FormatAllDigits
+// to convert digits right-to-left into a small arena allocation (11 bytes for
int32,
+// 20 for int64).
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
+ GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)
\
+ constexpr int32_t max_len = std::numeric_limits<gdv_##IN_TYPE>::digits10 +
2; \
+ GDV_FN_CAST_VARLEN_ALLOC(max_len)
\
+ char* end = ret + max_len;
\
+ char* cursor = end;
\
+ auto uval = arrow::internal::detail::Abs(value);
\
+ arrow::internal::detail::FormatAllDigits(uval, &cursor);
\
+ if (value < 0) {
\
+ arrow::internal::detail::FormatOneChar('-', &cursor);
\
+ }
\
+ int32_t slen = static_cast<int32_t>(end - cursor);
\
+ *out_len = static_cast<int32_t>(len < slen ? len : slen);
\
+ memmove(ret, cursor, *out_len);
\
+ return ret;
\
}
-#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
- GANDIVA_EXPORT
\
- const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
- int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {
\
- if (len < 0) {
\
- gdv_fn_context_set_error_msg(context, "Buffer length cannot be
negative"); \
- *out_len = 0;
\
- return "";
\
- }
\
- if (len == 0) {
\
- *out_len = 0;
\
- return "";
\
- }
\
- gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter;
\
- char* ret = reinterpret_cast<char*>(
\
- gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));
\
- if (ret == nullptr) {
\
- gdv_fn_context_set_error_msg(context, "Could not allocate memory");
\
- *out_len = 0;
\
- return "";
\
- }
\
- arrow::Status status = formatter(value, [&](std::string_view v) {
\
- int64_t size = static_cast<int64_t>(v.size());
\
- *out_len = static_cast<int32_t>(len < size ? len : size);
\
- memcpy(ret, v.data(), *out_len);
\
- return arrow::Status::OK();
\
- });
\
- if (!status.ok()) {
\
- std::string err = "Could not cast " + std::to_string(value) + " to
string"; \
- gdv_fn_context_set_error_msg(context, err.c_str());
\
- *out_len = 0;
\
- return "";
\
- }
\
- return ret;
\
+// Helper: invoke formatter callback, copy result to ret, handle errors.
+// Used by date64 and float types that rely on
arrow::internal::StringFormatter.
+#define GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
\
+ arrow::Status status = formatter(value, [&](std::string_view v) {
\
+ int64_t size = static_cast<int64_t>(v.size());
\
+ *out_len = static_cast<int32_t>(len < size ? len : size);
\
+ memcpy(ret, v.data(), *out_len);
\
+ return arrow::Status::OK();
\
+ });
\
+ if (!status.ok()) {
\
+ std::string err = "Could not cast " + std::to_string(value) + " to
string"; \
+ gdv_fn_context_set_error_msg(context, err.c_str());
\
+ *out_len = 0;
\
+ return "";
\
+ }
\
+ return ret;
\
}
-#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE) \
- GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int32, VARLEN_TYPE, Int32Type) \
- GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int64, VARLEN_TYPE, Int64Type) \
- GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(date64, VARLEN_TYPE, Date64Type) \
- GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
+// Macro for date64 type. Output is always "YYYY-MM-DD" = 10 chars max.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64(IN_TYPE, CAST_NAME, ARROW_TYPE) \
+ GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME) \
+ constexpr int32_t max_date_str_len = 10; \
+ int32_t alloc_len = \
+ static_cast<int32_t>(len < max_date_str_len ? len : max_date_str_len); \
+ GDV_FN_CAST_VARLEN_ALLOC(alloc_len) \
+ arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter; \
+ GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+
+// Macro for float types (float32/float64). Uses Java-compatible formatting.
+// Max string: "-1.2345678901234567E-308" = 24 chars.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE) \
+ GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME) \
+ constexpr int32_t max_real_str_len = 24; \
+ int32_t alloc_len = \
+ static_cast<int32_t>(len < max_real_str_len ? len : max_real_str_len); \
+ GDV_FN_CAST_VARLEN_ALLOC(alloc_len) \
+ gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter; \
+ GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+
+// Use optimized integer macro for int32/int64, date64 macro, and real macro
for floats
+#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int32, VARLEN_TYPE, Int32Type) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int64, VARLEN_TYPE, Int64Type) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64(date64, VARLEN_TYPE, Date64Type) \
+ GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float64, VARLEN_TYPE, DoubleType)
CAST_VARLEN_TYPE_FROM_NUMERIC(VARCHAR)
CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
#undef CAST_VARLEN_TYPE_FROM_NUMERIC
-#undef GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64
#undef GDV_FN_CAST_VARLEN_TYPE_FROM_REAL
+#undef GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+#undef GDV_FN_CAST_VARLEN_ALLOC
+#undef GDV_FN_CAST_VARLEN_PREFIX
GDV_FORCE_INLINE
void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc
b/cpp/src/gandiva/precompiled/string_ops.cc
index 54243085f7..035d3c8c62 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -582,11 +582,13 @@ const char* castVARCHAR_bool_int64(gdv_int64 context,
gdv_boolean value,
*out_length = 0;
return "";
}
- const char* out =
- reinterpret_cast<const char*>(gdv_fn_context_arena_malloc(context, 5));
- out = value ? "true" : "false";
- *out_length = value ? ((len > 4) ? 4 : len) : ((len > 5) ? 5 : len);
- return out;
+ if (value) {
+ *out_length = (len > 4) ? 4 : len;
+ return "true";
+ } else {
+ *out_length = (len > 5) ? 5 : len;
+ return "false";
+ }
}
// Truncates the string to given length
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 51174c1113..d57eb43753 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -417,6 +417,10 @@ TEST(TestStringOps, TestCastBoolToVarchar) {
EXPECT_EQ(std::string(out_str, out_len), "false");
EXPECT_FALSE(ctx.has_error());
+ out_str = castVARCHAR_bool_int64(ctx_ptr, true, 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
castVARCHAR_bool_int64(ctx_ptr, true, -3, &out_len);
EXPECT_THAT(ctx.get_error(),
::testing::HasSubstr("Output buffer length can't be negative"));