(arrow) branch main updated: GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation and len<=0 handling (#49421)

kou Thu, 12 Mar 2026 22:57:32 -0700

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new b9274bc9e0 GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation 
and len<=0 handling (#49421)
b9274bc9e0 is described below

commit b9274bc9e043338e070f342503e392b935e5fbfa
Author: Dmitry Chirkov <[email protected]>
AuthorDate: Thu Mar 12 22:56:56 2026 -0700

    GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation and len<=0 
handling (#49421)
    
    ### Rationale for this change
    The `castVARCHAR` functions in Gandiva have memory allocation 
inefficiencies and missing edge case handling. See GH-49420 for details.
    
    ### What changes are included in this PR?
    **Functional fixes:**
    - `bool`: Remove unused 5-byte arena allocation; return string literal 
directly
    - `int32`/`int64`: Add handling for `len=0` (return empty string) and 
`len<0` (set error)
    
    **Memory allocation optimizations:**
    - `int32`/`int64`: Allocate fixed small buffer (11/20 bytes) directly in 
arena, use optimized digit-pair conversion writing right-to-left, then 
`memmove` to align. Returns `min(len, actual_size)` bytes.
    - `date64`: Allocate only `min(len, 10)` bytes upfront (output is always 
"YYYY-MM-DD")
    - `float32`/`float64`: Allocate only `min(len, 24)` bytes upfront (max 
output length)
    
    **Code cleanup:**
    - Extract common code into helper macros to reduce duplication
    
    ### Are these changes tested?
    Yes. Added tests for `len=0` and `len<0` edge cases for int64, date64, 
float32, float64, and bool types. All existing Gandiva tests pass. Adhoc 
performance benchmarking was performed both via direct expression evaluation as 
well as via query execution via Dremio.
    
    ### Are there any user-facing changes?
    No. Users will see reduced memory usage and proper error messages for 
invalid len parameter values.
    Note: Error messages for negative `len` remain different between 
precompiled ("Output buffer length can't be negative") and interpreted ("Buffer 
length cannot be negative") code paths, preserving existing behavior.
    * GitHub Issue: #49420
    
    Authored-by: Dmitry Chirkov <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/gandiva/gdv_function_stubs_test.cc     |  44 +++++++
 cpp/src/gandiva/gdv_string_function_stubs.cc   | 169 ++++++++++++++-----------
 cpp/src/gandiva/precompiled/string_ops.cc      |  12 +-
 cpp/src/gandiva/precompiled/string_ops_test.cc |   4 +
 4 files changed, 150 insertions(+), 79 deletions(-)

diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc 
b/cpp/src/gandiva/gdv_function_stubs_test.cc
index cbe24b7423..1e3d1df7c3 100644
--- a/cpp/src/gandiva/gdv_function_stubs_test.cc
+++ b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -350,6 +350,14 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromInt64) {
   out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 12345, 3, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "123");
   EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 347, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_int64_int64(ctx_ptr, 347, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be 
negative"));
+  ctx.Reset();
 }
 
 TEST(TestGdvFnStubs, TestCastVARCHARFromMilliseconds) {
@@ -381,6 +389,15 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromMilliseconds) {
   out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, 4, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "2008");
   EXPECT_FALSE(ctx.has_error());
+
+  ts = StringToTimestamp("2021-04-23 10:20:33");
+  out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_date64_int64(ctx_ptr, ts, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be 
negative"));
+  ctx.Reset();
 }
 
 TEST(TestGdvFnStubs, TestCastVARCHARFromFloat) {
@@ -416,6 +433,14 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromFloat) {
   out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, 3, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "1.2");
   EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float32_int64(ctx_ptr, 1.2345f, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be 
negative"));
+  ctx.Reset();
 }
 
 TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
@@ -451,6 +476,25 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
   out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, 3, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "1.2");
   EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.2345, -1, &out_len);
+  EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer length cannot be 
negative"));
+  ctx.Reset();
+
+  // test long repeating decimal (1/3) with large buffer
+  out_str = gdv_fn_castVARCHAR_float64_int64(ctx_ptr, 1.0 / 3.0, 100, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "0.3333333333333333");
+  EXPECT_FALSE(ctx.has_error());
+
+  // test exponential notation with large negative exponent (24 chars)
+  out_str =
+      gdv_fn_castVARCHAR_float64_int64(ctx_ptr, -1.2345678901234567e-100, 100, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "-1.2345678901234567E-100");
+  EXPECT_FALSE(ctx.has_error());
 }
 
 TEST(TestGdvFnStubs, TestSubstringIndex) {
diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc 
b/cpp/src/gandiva/gdv_string_function_stubs.cc
index e7982461b4..66c6b4ec0e 100644
--- a/cpp/src/gandiva/gdv_string_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_string_function_stubs.cc
@@ -18,8 +18,10 @@
 #include "gandiva/gdv_function_stubs.h"
 
 #include <utf8proc.h>
+#include <limits>
 #include <string>
 #include <string_view>
+#include <type_traits>
 #include <unordered_map>
 #include <vector>
 
@@ -81,91 +83,110 @@ const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t 
ptr, int64_t holder_pt
   return (*holder)(context, data, data_len, extract_index, out_length);
 }
 
-#define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)      
   \
-  GANDIVA_EXPORT                                                               
   \
-  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                      
   \
-      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {  
   \
-    if (len < 0) {                                                             
   \
-      gdv_fn_context_set_error_msg(context, "Buffer length cannot be 
negative");  \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    if (len == 0) {                                                            
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;             
   \
-    char* ret = reinterpret_cast<char*>(                                       
   \
-        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));      
   \
-    if (ret == nullptr) {                                                      
   \
-      gdv_fn_context_set_error_msg(context, "Could not allocate memory");      
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    arrow::Status status = formatter(value, [&](std::string_view v) {          
   \
-      int64_t size = static_cast<int64_t>(v.size());                           
   \
-      *out_len = static_cast<int32_t>(len < size ? len : size);                
   \
-      memcpy(ret, v.data(), *out_len);                                         
   \
-      return arrow::Status::OK();                                              
   \
-    });                                                                        
   \
-    if (!status.ok()) {                                                        
   \
-      std::string err = "Could not cast " + std::to_string(value) + " to 
string"; \
-      gdv_fn_context_set_error_msg(context, err.c_str());                      
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    return ret;                                                                
   \
+// The following castVARCHAR macros are optimized to allocate only the actual
+// string size instead of the maximum buffer length (which can be 65536+ 
bytes).
+
+// Helper: arena allocation + null check
+#define GDV_FN_CAST_VARLEN_ALLOC(SIZE)                                         
    \
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
SIZE)); \
+  if (ret == nullptr) {                                                        
    \
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory");        
    \
+    *out_len = 0;                                                              
    \
+    return "";                                                                 
    \
+  }
+
+// Helper: function signature + len validation
+#define GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)                          
  \
+  GANDIVA_EXPORT                                                               
  \
+  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                      
  \
+      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {  
  \
+    if (len < 0) {                                                             
  \
+      gdv_fn_context_set_error_msg(context, "Buffer length cannot be 
negative"); \
+      *out_len = 0;                                                            
  \
+      return "";                                                               
  \
+    }                                                                          
  \
+    if (len == 0) {                                                            
  \
+      *out_len = 0;                                                            
  \
+      return "";                                                               
  \
+    }
+
+// Macro for integer types (int32/int64). Uses 
arrow::internal::detail::FormatAllDigits
+// to convert digits right-to-left into a small arena allocation (11 bytes for 
int32,
+// 20 for int64).
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(IN_TYPE, CAST_NAME, ARROW_TYPE)   
 \
+  GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)                                
 \
+  constexpr int32_t max_len = std::numeric_limits<gdv_##IN_TYPE>::digits10 + 
2; \
+  GDV_FN_CAST_VARLEN_ALLOC(max_len)                                            
 \
+  char* end = ret + max_len;                                                   
 \
+  char* cursor = end;                                                          
 \
+  auto uval = arrow::internal::detail::Abs(value);                             
 \
+  arrow::internal::detail::FormatAllDigits(uval, &cursor);                     
 \
+  if (value < 0) {                                                             
 \
+    arrow::internal::detail::FormatOneChar('-', &cursor);                      
 \
+  }                                                                            
 \
+  int32_t slen = static_cast<int32_t>(end - cursor);                           
 \
+  *out_len = static_cast<int32_t>(len < slen ? len : slen);                    
 \
+  memmove(ret, cursor, *out_len);                                              
 \
+  return ret;                                                                  
 \
   }
 
-#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE)      
   \
-  GANDIVA_EXPORT                                                               
   \
-  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                      
   \
-      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {  
   \
-    if (len < 0) {                                                             
   \
-      gdv_fn_context_set_error_msg(context, "Buffer length cannot be 
negative");  \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    if (len == 0) {                                                            
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter;                  
   \
-    char* ret = reinterpret_cast<char*>(                                       
   \
-        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));      
   \
-    if (ret == nullptr) {                                                      
   \
-      gdv_fn_context_set_error_msg(context, "Could not allocate memory");      
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    arrow::Status status = formatter(value, [&](std::string_view v) {          
   \
-      int64_t size = static_cast<int64_t>(v.size());                           
   \
-      *out_len = static_cast<int32_t>(len < size ? len : size);                
   \
-      memcpy(ret, v.data(), *out_len);                                         
   \
-      return arrow::Status::OK();                                              
   \
-    });                                                                        
   \
-    if (!status.ok()) {                                                        
   \
-      std::string err = "Could not cast " + std::to_string(value) + " to 
string"; \
-      gdv_fn_context_set_error_msg(context, err.c_str());                      
   \
-      *out_len = 0;                                                            
   \
-      return "";                                                               
   \
-    }                                                                          
   \
-    return ret;                                                                
   \
+// Helper: invoke formatter callback, copy result to ret, handle errors.
+// Used by date64 and float types that rely on 
arrow::internal::StringFormatter.
+#define GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX                                    
 \
+  arrow::Status status = formatter(value, [&](std::string_view v) {            
 \
+    int64_t size = static_cast<int64_t>(v.size());                             
 \
+    *out_len = static_cast<int32_t>(len < size ? len : size);                  
 \
+    memcpy(ret, v.data(), *out_len);                                           
 \
+    return arrow::Status::OK();                                                
 \
+  });                                                                          
 \
+  if (!status.ok()) {                                                          
 \
+    std::string err = "Could not cast " + std::to_string(value) + " to 
string"; \
+    gdv_fn_context_set_error_msg(context, err.c_str());                        
 \
+    *out_len = 0;                                                              
 \
+    return "";                                                                 
 \
+  }                                                                            
 \
+  return ret;                                                                  
 \
   }
 
-#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE)                   \
-  GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int32, VARLEN_TYPE, Int32Type)   \
-  GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int64, VARLEN_TYPE, Int64Type)   \
-  GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(date64, VARLEN_TYPE, Date64Type) \
-  GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
+// Macro for date64 type. Output is always "YYYY-MM-DD" = 10 chars max.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64(IN_TYPE, CAST_NAME, ARROW_TYPE)  \
+  GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)                              \
+  constexpr int32_t max_date_str_len = 10;                                   \
+  int32_t alloc_len =                                                        \
+      static_cast<int32_t>(len < max_date_str_len ? len : max_date_str_len); \
+  GDV_FN_CAST_VARLEN_ALLOC(alloc_len)                                        \
+  arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;             \
+  GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+
+// Macro for float types (float32/float64). Uses Java-compatible formatting.
+// Max string: "-1.2345678901234567E-308" = 24 chars.
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE)    \
+  GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)                              \
+  constexpr int32_t max_real_str_len = 24;                                   \
+  int32_t alloc_len =                                                        \
+      static_cast<int32_t>(len < max_real_str_len ? len : max_real_str_len); \
+  GDV_FN_CAST_VARLEN_ALLOC(alloc_len)                                        \
+  gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter;                  \
+  GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+
+// Use optimized integer macro for int32/int64, date64 macro, and real macro 
for floats
+#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE)                     \
+  GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int32, VARLEN_TYPE, Int32Type)  \
+  GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(int64, VARLEN_TYPE, Int64Type)  \
+  GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64(date64, VARLEN_TYPE, Date64Type) \
+  GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType)   \
   GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float64, VARLEN_TYPE, DoubleType)
 
 CAST_VARLEN_TYPE_FROM_NUMERIC(VARCHAR)
 CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
 
 #undef CAST_VARLEN_TYPE_FROM_NUMERIC
-#undef GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER
+#undef GDV_FN_CAST_VARLEN_TYPE_FROM_DATE64
 #undef GDV_FN_CAST_VARLEN_TYPE_FROM_REAL
+#undef GDV_FN_CAST_VARLEN_FORMATTER_SUFFIX
+#undef GDV_FN_CAST_VARLEN_ALLOC
+#undef GDV_FN_CAST_VARLEN_PREFIX
 
 GDV_FORCE_INLINE
 void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc 
b/cpp/src/gandiva/precompiled/string_ops.cc
index 54243085f7..035d3c8c62 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -582,11 +582,13 @@ const char* castVARCHAR_bool_int64(gdv_int64 context, 
gdv_boolean value,
     *out_length = 0;
     return "";
   }
-  const char* out =
-      reinterpret_cast<const char*>(gdv_fn_context_arena_malloc(context, 5));
-  out = value ? "true" : "false";
-  *out_length = value ? ((len > 4) ? 4 : len) : ((len > 5) ? 5 : len);
-  return out;
+  if (value) {
+    *out_length = (len > 4) ? 4 : len;
+    return "true";
+  } else {
+    *out_length = (len > 5) ? 5 : len;
+    return "false";
+  }
 }
 
 // Truncates the string to given length
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc 
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 51174c1113..d57eb43753 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -417,6 +417,10 @@ TEST(TestStringOps, TestCastBoolToVarchar) {
   EXPECT_EQ(std::string(out_str, out_len), "false");
   EXPECT_FALSE(ctx.has_error());
 
+  out_str = castVARCHAR_bool_int64(ctx_ptr, true, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
   castVARCHAR_bool_int64(ctx_ptr, true, -3, &out_len);
   EXPECT_THAT(ctx.get_error(),
               ::testing::HasSubstr("Output buffer length can't be negative"));

(arrow) branch main updated: GH-49420: [C++][Gandiva] Fix castVARCHAR memory allocation and len<=0 handling (#49421)

Reply via email to