ViniciusSouzaRoque commented on a change in pull request #11522:
URL: https://github.com/apache/arrow/pull/11522#discussion_r743194505
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,55 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+ int32_t a, b, c, aux;
+ int32_t len_dist_1 = in1_len + 1;
+ int32_t len_dist_2 = in2_len + 1;
+ // dist[i][j] represents the Levenstein distance between the strings
+ int** dist;
+ if ((dist = (int**)malloc((len_dist_1 + 1) * sizeof(int*))) == NULL) {
+ gdv_fn_context_set_error_msg(context, "Insufficient space to allocate
buffer");
+ return 0;
+ }
+ for (int i = 0; i <= in1_len; i++) {
+ if ((dist[i] = (int*)malloc(len_dist_2 * sizeof(int*))) == NULL) {
+ gdv_fn_context_set_error_msg(context, "Insufficient space to allocate
buffer");
+ return 0;
+ }
+ }
+ for (int32_t i = 0; i <= in1_len; i++) dist[i][0] = i;
+ for (int32_t j = 1; j <= in2_len; j++) dist[0][j] = j;
+
+ for (int32_t j = 0; j < in2_len; j++) {
+ for (int32_t i = 0; i < in1_len; i++) {
+ if (in1[i] == in2[j]) {
+ dist[i + 1][j + 1] = dist[i][j];
+ } else {
+ a = dist[i][j + 1] + 1;
+ b = dist[i + 1][j] + 1;
+ c = dist[i][j] + 1;
+ aux = (a < b ? a : b);
+ dist[i + 1][j + 1] = (aux < c ? aux : c);
Review comment:
I created this solution using this algorithm, and applied a new memory
allocation, using only one large chunk of memory.
I was also able to remove the memory dump using std::swap()
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,83 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ int* ptr = new int[(in2_len + 1) * 2];
Review comment:
Okay, it makes sense! I did the exchange using a boolean variable and
two integers so I don't need to copy anything (besides the input sizes)
See if it looks good, please!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]