[GitHub] [arrow] vvellanki commented on a change in pull request #11287: ARROW-14193: [C++][Gandiva] Implement INSTR function

GitBox Wed, 03 Nov 2021 18:03:44 -0700


vvellanki commented on a change in pull request #11287:
URL: https://github.com/apache/arrow/pull/11287#discussion_r741632047




##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +794,27 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+int32_t gd_fn_instr_utf8(int64_t context, const char* string, int32_t 
string_len,
+                         const char* substring, int32_t substring_len) {

Review comment:
       If the substring is a literal, you should be able to pre-process the 
substring to implement this faster. Can you open a separate ticket to handle a 
literal substring?

##########
File path: cpp/src/gandiva/gdv_function_stubs_test.cc
##########
@@ -766,4 +766,42 @@ TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
   ctx.Reset();
 }
 
+TEST(TestGdvFnStubs, TestInstr) {
+  gandiva::ExecutionContext ctx;
+
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  std::string s1 = "hello world!";
+  auto s1_len = static_cast<int32_t>(s1.size());
+  std::string s2 = "world";
+  auto s2_len = static_cast<int32_t>(s2.size());
+
+  auto result = gd_fn_instr_utf8(ctx_ptr, s1.c_str(), s1_len, s2.c_str(), 
s2_len);
+  EXPECT_EQ(result, 6);
+
+  s1 = "apple, banana, mango";
+  s1_len = static_cast<int32_t>(s1.size());
+  s2 = "mango";
+  s2_len = static_cast<int32_t>(s2.size());
+
+  result = gd_fn_instr_utf8(ctx_ptr, s1.c_str(), s1_len, s2.c_str(), s2_len);
+  EXPECT_EQ(result, 15);
+
+  s1 = "";
+  s1_len = static_cast<int32_t>(s1.size());
+  s2 = "mango";
+  s2_len = static_cast<int32_t>(s2.size());
+
+  result = gd_fn_instr_utf8(ctx_ptr, s1.c_str(), s1_len, s2.c_str(), s2_len);
+  EXPECT_EQ(result, 0);
+
+  s1 = "open the door";
+  s1_len = static_cast<int32_t>(s1.size());
+  s2 = "";
+  s2_len = static_cast<int32_t>(s2.size());
+
+  result = gd_fn_instr_utf8(ctx_ptr, s1.c_str(), s1_len, s2.c_str(), s2_len);
+  EXPECT_EQ(result, 0);

Review comment:
       My testing shows that this should return 1

##########
File path: cpp/src/gandiva/gdv_function_stubs_test.cc
##########
@@ -766,4 +766,42 @@ TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
   ctx.Reset();
 }
 
+TEST(TestGdvFnStubs, TestInstr) {
+  gandiva::ExecutionContext ctx;
+
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+
+  std::string s1 = "hello world!";
+  auto s1_len = static_cast<int32_t>(s1.size());
+  std::string s2 = "world";
+  auto s2_len = static_cast<int32_t>(s2.size());
+
+  auto result = gd_fn_instr_utf8(ctx_ptr, s1.c_str(), s1_len, s2.c_str(), 
s2_len);
+  EXPECT_EQ(result, 6);
+
+  s1 = "apple, banana, mango";
+  s1_len = static_cast<int32_t>(s1.size());
+  s2 = "mango";
+  s2_len = static_cast<int32_t>(s2.size());

Review comment:
       Both of these tests match at the end. Can you add tests where the 
substring is at index 0 (the return value should be 1) and where the substring 
is somewhere in the middle?
   
   Also, a test case where the string has most of the substring except the last 
character - for e.g. instr("hello world", "worldA") - this will catch off-by-1 
errors in the code

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +794,27 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+int32_t gd_fn_instr_utf8(int64_t context, const char* string, int32_t 
string_len,
+                         const char* substring, int32_t substring_len) {
+  if (string_len <= 0 || substring_len <= 0) {
+    return 0;
+  }
+
+  std::string str(string, string_len);

Review comment:
       We should avoid using std::string as this does a memcpy. It would also 
do a malloc, malloc requires a lock impacting performance on multi-threaded 
machines.
   
   You should write a loop like as follows:
   for(int i = 0; i < (string_len - substring_len); i++) {
     if (string[i] == substring[0]) {
       // check if there is a complete match starting from this index
     }
   }

##########
File path: cpp/src/gandiva/tests/projector_test.cc
##########
@@ -1606,4 +1606,47 @@ TEST_F(TestProjector, TestCastNullableIntYearInterval) {
   EXPECT_ARROW_ARRAY_EQUALS(out_int64, outputs.at(1));
 }
 
+TEST_F(TestProjector, TestInstr) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f2", arrow::utf8());
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto output_instr = field("out_instr", int32());
+
+
+  // Build expression
+  auto instr_expr = TreeExprBuilder::MakeExpression("instr",
+                                                    {field0, field1}, 
output_instr);
+
+
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {instr_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok());
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array0 = MakeArrowArrayUtf8(
+      {"hello world!", "apple, banana, mango", "", "open the door"},
+      {true, true, true, true});
+  auto array1 = MakeArrowArrayUtf8(
+      {"world", "mango", "mango", ""},

Review comment:
       What is the expected output if the substr is not null, but the length is 
0? Is this matching at index 1 or not matching?
   
   The Hive documentation is not clear on this. Can you test how the Hive 
function behaves? My testing shows that an empty substring (not-null, but a 
substring of "") returns 1 for the INSTR(column, '')

##########
File path: cpp/src/gandiva/function_registry_string.cc
##########
@@ -406,6 +406,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
 
       NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), 
int32()}, utf8(),
                      kResultNullIfNull, "split_part",
+                     NativeFunction::kNeedsContext | 
NativeFunction::kCanReturnErrors),
+
+      NativeFunction("instr", {}, DataTypeVector{utf8(), utf8()},  int32(),
+                     kResultNullIfNull, "gd_fn_instr_utf8",

Review comment:
       Why does this function need context? I also dont see this function 
return errors
   
   Do we need the kNeedsContext and kCanReturnErrors flags?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] vvellanki commented on a change in pull request #11287: ARROW-14193: [C++][Gandiva] Implement INSTR function

Reply via email to