[GitHub] [arrow] rkavanap commented on a change in pull request #11051: ARROW-13827: [C++][Gandiva] Implement LEVENSHTEIN Hive functions on Gandiva

GitBox Tue, 02 Nov 2021 10:56:05 -0700


rkavanap commented on a change in pull request #11051:
URL: https://github.com/apache/arrow/pull/11051#discussion_r740843680




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");

Review comment:
       shouldn't message be String length must be greater than or equal to zero

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;

Review comment:
       Isn't better to handle the special case where length of one of the 
strings (or both) is zero with if statements here (to avoid allocations for 
these special cases).
                 if (in1_len == 0) return in2_len;
                 if (in2_len == 0) return in1_len;

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {
+    gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+    return 0;
+  }
+  for (int i = 0; i <= in1_len; i++) {
+    if ((dist[i] = (int*)malloc(len_dist_2 * sizeof(int*))) == NULL) {
+      gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+      return 0;
+    }
+  }
+  for (int32_t i = 0; i <= in1_len; i++) dist[i][0] = i;
+  for (int32_t j = 1; j <= in2_len; j++) dist[0][j] = j;
+
+  for (int32_t j = 0; j < in2_len; j++) {
+    for (int32_t i = 0; i < in1_len; i++) {
+      if (in1[i] == in2[j]) {
+        dist[i + 1][j + 1] = dist[i][j];

Review comment:
       shouldn't this also be min(dist[i][j], 1 + dist[i][j + 1], 1 + dist[i + 
1][j]), when in1[i] = in2[j]?
   and when in1[i] != in2[j]  it is a min (1 + dist[i][j], 1 + dist[i][j + 1], 
1 + dist[i + 1][j]) as properly coded from 1676..1680... 
   
   Atleast In Java it is coded that way.. 
   
   
   
   

##########
File path: cpp/src/gandiva/tests/projector_test.cc
##########
@@ -818,6 +818,41 @@ TEST_F(TestProjector, TestConcat) {
   EXPECT_ARROW_ARRAY_EQUALS(exp_concat, outputs.at(0));
 }
 
+TEST_F(TestProjector, TestLevenshtein) {
+  // schema for input fields
+  auto f0 = field("f0", arrow::utf8());
+  auto f1 = field("f1", arrow::utf8());
+  auto schema = arrow::schema({f0, f1});
+
+  // output fields
+  auto field_lev = field("levenshtein", arrow::int32());
+
+  // Build expression
+  auto lev_expr = TreeExprBuilder::MakeExpression("levenshtein", {f0, f1}, 
field_lev);
+
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {lev_expr}, TestConfiguration(), 
&projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  auto array0 = MakeArrowArrayUtf8({"cat", "task", "", "a"}, {true, true, 
true, true});
+  auto array1 = MakeArrowArrayUtf8({"coat", "test", "a", ""}, {true, true, 
true, true});

Review comment:
       maybe some fields with validity bit false as well (i.e null fields)?

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {

Review comment:
       Isn't it faster to do a single malloc for the entire 2d array in one go 
as length of both strings is known in advance? e.g dist = malloc((len_dist_1 + 
1) * sizeof(int *) + len_dist_1 * len_dist_2 * sizeof(int)) and use a loop to 
adjust dist[i];
   int *p = (int *) dist + len_dist_1;
   for (i = 0; i < len_dist_1; i++) dist[i] = p + len_dist_2 * i; 
   
    that will reduce small allocations in the inner loop and also allow a 
single free.

##########
File path: cpp/src/gandiva/precompiled/string_ops_test.cc
##########
@@ -912,6 +912,35 @@ TEST(TestStringOps, TestReverse) {
   ctx.Reset();
 }
 
+TEST(TestStringOps, TestLevenshtein) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "kitten", 6, "sitting", 7), 3);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "book", 4, "back", 4), 2);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "", 0, "a", 1), 1);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "test", 4, "task", 4), 2);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "cat", 3, "coat", 4), 1);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "coat", 4, "coat", 4), 0);
+  EXPECT_FALSE(ctx.has_error());
+
+  EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "book", -5, "back", 4), 0);

Review comment:
       test case for "book" to "bo" or "boo". and "b" to "book"?

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {
+    gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+    return 0;
+  }
+  for (int i = 0; i <= in1_len; i++) {
+    if ((dist[i] = (int*)malloc(len_dist_2 * sizeof(int*))) == NULL) {

Review comment:
       if we are doing it this way, shouldn't this be a sizeof(int) and not 
sizeof(int *) as dist[i] is a int pointer but dist[i][j] is an int.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {

Review comment:
       Isn't it faster to do a single malloc for the entire 2d array in one go 
as length of both strings is known in advance? e.g dist = malloc((len_dist_1 + 
1) * sizeof(int *) + len_dist_1 * len_dist_2 * sizeof(int)) and use a loop to 
adjust dist[i];
   int *p = (int *) dist + len_dist_1;
   for (i = 0; i < len_dist_1; i++) dist[i] = p + len_dist_2 * i; 
   (warning: pointer calculation may not be correct and needs testing..this is 
just to convey the idea).
   
    that will reduce small allocations in the inner loop and also allow a 
single free.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,

Review comment:
       I feel there is a more space efficient algorithm than this as we are 
only interested in the edit distance and is not interested in reconstructing 
the edited input..Atleast the Java version uses this space efficient algorithm.
   
   Is there any reason why we have to use this algorithm and not the one used 
by Java? 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] rkavanap commented on a change in pull request #11051: ARROW-13827: [C++][Gandiva] Implement LEVENSHTEIN Hive functions on Gandiva

Reply via email to