vvellanki commented on a change in pull request #11522:
URL: https://github.com/apache/arrow/pull/11522#discussion_r740961823



##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {

Review comment:
       Why len_dist_1 + 1 and not len_dist_1?
   
   The algorithm in wikipedia allocates a matrix of size (in1_len + 1) x 
(in2_len + 1)

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {
+    gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+    return 0;
+  }
+  for (int i = 0; i <= in1_len; i++) {
+    if ((dist[i] = (int*)malloc(len_dist_2 * sizeof(int*))) == NULL) {
+      gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+      return 0;
+    }
+  }
+  for (int32_t i = 0; i <= in1_len; i++) dist[i][0] = i;
+  for (int32_t j = 1; j <= in2_len; j++) dist[0][j] = j;
+
+  for (int32_t j = 0; j < in2_len; j++) {
+    for (int32_t i = 0; i < in1_len; i++) {
+      if (in1[i] == in2[j]) {
+        dist[i + 1][j + 1] = dist[i][j];
+      } else {
+        a = dist[i][j + 1] + 1;
+        b = dist[i + 1][j] + 1;
+        c = dist[i][j] + 1;
+        aux = (a < b ? a : b);
+        dist[i + 1][j + 1] = (aux < c ? aux : c);

Review comment:
       Can you implement this algorithm "Iterative with two matrix rows" from 
https://en.wikipedia.org/wiki/Levenshtein_distance instead? It uses less memory 
and takes the same amount of time to compute the distance

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char* 
value, int32_t value_len
   return value;
 }
 
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t 
in1_len,
+                                const char* in2, int32_t in2_len) {
+  if (in1_len < 0 || in2_len < 0) {
+    gdv_fn_context_set_error_msg(context, "String length must be greater than 
0");
+    return 0;
+  }
+  int32_t a, b, c, aux;
+  int32_t len_dist_1 = in1_len + 1;
+  int32_t len_dist_2 = in2_len + 1;
+  // dist[i][j] represents the Levenstein distance between the strings
+  int** dist;
+  if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {
+    gdv_fn_context_set_error_msg(context, "Insufficient space to allocate 
buffer");
+    return 0;
+  }
+  for (int i = 0; i <= in1_len; i++) {
+    if ((dist[i] = (int*)malloc(len_dist_2 * sizeof(int*))) == NULL) {

Review comment:
       Can we allocate one large chunk of memory instead of making multiple 
calls to malloc? Each call to malloc requires a lock!!!
   
   * ptr = malloc((in1_len + 1) * (in2_len + 1) * sizeof(int)) space. Line 1663 
is allocating space for int*
   * rowSize = (int2_len + 1) * sizeof(int)
   * dist[i] = ptr + i * rowSize




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to