ViniciusSouzaRoque commented on a change in pull request #11522:
URL: https://github.com/apache/arrow/pull/11522#discussion_r744694943
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
+ if (ptr == 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // MEMORY ADRESS MALLOC
+ // v0 -> (0, ..., &ptr[in2_len])
+ // v1 -> (in2_len+1, ..., &ptr[in2_len * 2])
+ int* v0;
+ int* v1;
+ v0 = &ptr[0];
+ v1 = &ptr[B + 1];
+
+ // Initializate v0
+ for (int i = 0; i <= B; i++) {
+ v0[i] = i;
+ }
+
+ // Initialize interactive mode
+ for (int i = 0; i < A; i++) {
+ // The first element to V1 is [i + 1]
+ // For edit distance you can delete (i+1) chars from in1 to match empty
in2 position
+ v1[0] = i + 1;
+
+ for (int j = 0; j < B; j++) {
+ // Calculate costs to modify
+ int deletionCost = v0[j + 1] + 1;
+ int insertionCost = v1[j] + 1;
+ int substitutionCost = v0[j] + 1;
+
+ if (smaller) {
+ if (in2[i] == in1[j]) {
+ substitutionCost = v0[j];
+ }
+ } else {
+ if (in1[i] == in2[j]) {
+ substitutionCost = v0[j];
+ }
+ }
+
+ // Catch the minor cost
+ int min;
+ min = deletionCost;
+
+ if (min > substitutionCost) {
+ min = substitutionCost;
+ }
+ if (min > insertionCost) {
+ min = insertionCost;
+ }
+
+ // Set the minor cost to v1
+ v1[j + 1] = min;
+ }
+
+ // Swaping v0 and v1
+ std::swap(v0, v1);
+ }
+ // The results of v1 are now in v0, Levenshtein value is in v0[n]
Review comment:
yeah, this coment is right, in Levenshtein "Iterative with two matrix
rows" solicited per @rkavanap , after the last swap, the results of v1 are now
in v0.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]