projjal commented on a change in pull request #10059: URL: https://github.com/apache/arrow/pull/10059#discussion_r622732364
########## File path: cpp/src/gandiva/precompiled/types.h ########## @@ -263,6 +263,9 @@ const char* substr_utf8_int64_int64(gdv_int64 context, const char* input, const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in_len, gdv_int64 offset64, gdv_int32* out_len); +const char* regexp_replace(const char* regexp, const char* str, Review comment: this is not required ########## File path: java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java ########## @@ -648,6 +648,66 @@ public void testRegex() throws GandivaException { eval.close(); } + @Test + public void testRegexpReplace() throws GandivaException { + + Field x = Field.nullable("x", new ArrowType.Utf8()); + Field replaceString = Field.nullable("replaceString", new ArrowType.Utf8()); + + Field retType = Field.nullable("c", new ArrowType.Utf8()); + + TreeNode cond = + TreeBuilder.makeFunction( + "regexp_replace", + Lists.newArrayList(TreeBuilder.makeField(x), TreeBuilder.makeStringLiteral("ana"), + TreeBuilder.makeField(replaceString)), + new ArrowType.Utf8()); + ExpressionTree expr = TreeBuilder.makeExpression(cond, retType); + Schema schema = new Schema(Lists.newArrayList(x, replaceString)); + Projector eval = Projector.make(schema, Lists.newArrayList(expr)); + + int numRows = 5; + byte[] validity = new byte[]{(byte) 15, 0}; + String[] valuesX = new String[]{"banana", "bananaana", "bananana", "anaana", "anaana"}; + String[] valuesReplace = new String[]{"", "", "", "", ""}; Review comment: atleast keep one non-empty replace string in the test ########## File path: cpp/src/gandiva/gdv_function_stubs.cc ########## @@ -42,6 +43,41 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +const char* gdv_fn_regexp_replace_utf8_utf8( + int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len, + const char* /*pattern*/, int32_t /*pattern_len*/, const char* replace_string, + int32_t replace_string_len, int32_t* out_length) { + std::string user_input(data, data_len); + std::string replace_input(replace_string, replace_string_len); + gandiva::ExecutionContext* context = reinterpret_cast<gandiva::ExecutionContext*>(ptr); + + gandiva::ReplaceHolder* holder = reinterpret_cast<gandiva::ReplaceHolder*>(holder_ptr); + bool was_replaced = (*holder)(context, user_input, replace_input, out_length); + + if (!was_replaced) { + return data; + } + + // This condition treats the case where the whole string is replaced by an empty string + if (*out_length == 0) { + return ""; + } + + char* result_buffer = + reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ptr, *out_length)); + + if (result_buffer == nullptr) { + std::string err_msg = "Could not allocate memory for string: " + user_input; + gdv_fn_context_set_error_msg(ptr, err_msg.data()); + *out_length = 0; + return ""; + } + + memcpy(result_buffer, user_input.c_str(), *out_length); Review comment: Use user_input.data() instead of c_str() (Latter I believe does another allocation and copy) ########## File path: cpp/src/gandiva/gdv_function_stubs.cc ########## @@ -42,6 +43,41 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +const char* gdv_fn_regexp_replace_utf8_utf8( + int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len, + const char* /*pattern*/, int32_t /*pattern_len*/, const char* replace_string, + int32_t replace_string_len, int32_t* out_length) { + std::string user_input(data, data_len); + std::string replace_input(replace_string, replace_string_len); + gandiva::ExecutionContext* context = reinterpret_cast<gandiva::ExecutionContext*>(ptr); + + gandiva::ReplaceHolder* holder = reinterpret_cast<gandiva::ReplaceHolder*>(holder_ptr); + bool was_replaced = (*holder)(context, user_input, replace_input, out_length); + + if (!was_replaced) { + return data; + } + + // This condition treats the case where the whole string is replaced by an empty string + if (*out_length == 0) { + return ""; + } + + char* result_buffer = + reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ptr, *out_length)); + + if (result_buffer == nullptr) { + std::string err_msg = "Could not allocate memory for string: " + user_input; Review comment: This involves implicit allocation. We should not have another allocation in the error path of allocation failure. Just keep set error message to "Could not allocate memory for result" ########## File path: cpp/src/gandiva/replace_holder_test.cc ########## @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/replace_holder.h" +#include "gandiva/regex_util.h" + +#include <memory> +#include <vector> + +#include <gtest/gtest.h> + +namespace gandiva { + +class TestReplaceHolder : public ::testing::Test { + public: + FunctionNode BuildReplace(std::string pattern) { Review comment: This function seems unnecessary. It is also building "replace" function node which is a different function -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org