vvellanki commented on a change in pull request #11522:
URL: https://github.com/apache/arrow/pull/11522#discussion_r744611922
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
+ if (ptr == 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // MEMORY ADRESS MALLOC
+ // v0 -> (0, ..., &ptr[in2_len])
+ // v1 -> (in2_len+1, ..., &ptr[in2_len * 2])
+ int* v0;
+ int* v1;
+ v0 = &ptr[0];
+ v1 = &ptr[B + 1];
+
+ // Initializate v0
+ for (int i = 0; i <= B; i++) {
+ v0[i] = i;
+ }
+
+ // Initialize interactive mode
+ for (int i = 0; i < A; i++) {
+ // The first element to V1 is [i + 1]
+ // For edit distance you can delete (i+1) chars from in1 to match empty
in2 position
+ v1[0] = i + 1;
+
+ for (int j = 0; j < B; j++) {
+ // Calculate costs to modify
+ int deletionCost = v0[j + 1] + 1;
+ int insertionCost = v1[j] + 1;
+ int substitutionCost = v0[j] + 1;
+
+ if (smaller) {
Review comment:
You can eliminate the if (smaller) condition by using:
if (arr_larger[i] == arr_smaller[j]) {
}
You will avoid len_smaller * len_larger if-conditions
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
+ if (ptr == 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // MEMORY ADRESS MALLOC
+ // v0 -> (0, ..., &ptr[in2_len])
+ // v1 -> (in2_len+1, ..., &ptr[in2_len * 2])
+ int* v0;
+ int* v1;
+ v0 = &ptr[0];
+ v1 = &ptr[B + 1];
+
+ // Initializate v0
+ for (int i = 0; i <= B; i++) {
+ v0[i] = i;
+ }
+
+ // Initialize interactive mode
+ for (int i = 0; i < A; i++) {
+ // The first element to V1 is [i + 1]
+ // For edit distance you can delete (i+1) chars from in1 to match empty
in2 position
+ v1[0] = i + 1;
+
+ for (int j = 0; j < B; j++) {
+ // Calculate costs to modify
+ int deletionCost = v0[j + 1] + 1;
+ int insertionCost = v1[j] + 1;
+ int substitutionCost = v0[j] + 1;
+
+ if (smaller) {
+ if (in2[i] == in1[j]) {
+ substitutionCost = v0[j];
+ }
+ } else {
+ if (in1[i] == in2[j]) {
+ substitutionCost = v0[j];
+ }
+ }
+
+ // Catch the minor cost
+ int min;
+ min = deletionCost;
+
+ if (min > substitutionCost) {
+ min = substitutionCost;
+ }
+ if (min > insertionCost) {
+ min = insertionCost;
+ }
+
+ // Set the minor cost to v1
+ v1[j + 1] = min;
+ }
+
+ // Swaping v0 and v1
+ std::swap(v0, v1);
+ }
+ // The results of v1 are now in v0, Levenshtein value is in v0[n]
Review comment:
Not sure what this means - results of v1 are now in v0. Is this comment
correct?
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
Review comment:
You can do better than this; define the following:
char *arr_larger; // points to the larger array
char *arr_smaller; // points to the smaller array
Similarly, len_larger and len_smaller
Do one if-condition like this and initialise arr_larger, arr_smaller,
len_larger, len_smaller
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
+ if (ptr == 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // MEMORY ADRESS MALLOC
+ // v0 -> (0, ..., &ptr[in2_len])
+ // v1 -> (in2_len+1, ..., &ptr[in2_len * 2])
+ int* v0;
+ int* v1;
+ v0 = &ptr[0];
+ v1 = &ptr[B + 1];
+
+ // Initializate v0
+ for (int i = 0; i <= B; i++) {
+ v0[i] = i;
+ }
+
+ // Initialize interactive mode
+ for (int i = 0; i < A; i++) {
Review comment:
outer loop will iterate to len_larger
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
+ if (ptr == 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // MEMORY ADRESS MALLOC
+ // v0 -> (0, ..., &ptr[in2_len])
+ // v1 -> (in2_len+1, ..., &ptr[in2_len * 2])
+ int* v0;
+ int* v1;
+ v0 = &ptr[0];
+ v1 = &ptr[B + 1];
+
+ // Initializate v0
+ for (int i = 0; i <= B; i++) {
+ v0[i] = i;
+ }
+
+ // Initialize interactive mode
+ for (int i = 0; i < A; i++) {
+ // The first element to V1 is [i + 1]
+ // For edit distance you can delete (i+1) chars from in1 to match empty
in2 position
+ v1[0] = i + 1;
+
+ for (int j = 0; j < B; j++) {
Review comment:
inner loop will iterate to len_smaller
##########
File path: cpp/src/gandiva/precompiled/string_ops_test.cc
##########
@@ -912,6 +912,35 @@ TEST(TestStringOps, TestReverse) {
ctx.Reset();
}
+TEST(TestStringOps, TestLevenshtein) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+ EXPECT_EQ(levenshtein_utf8_utf8(ctx_ptr, "kitten", 6, "sitting", 7), 3);
Review comment:
Can you add more test cases from the wikipedia page? or even from the
Hive UDF implementation?
The logic is quite complex, it is better to have good test coverage
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1642,6 +1642,104 @@ const char* convert_toUTF8(int64_t context, const char*
value, int32_t value_len
return value;
}
+// Calculate the levenshtein distance between two string values
+FORCE_INLINE
+gdv_int32 levenshtein_utf8_utf8(int64_t context, const char* in1, int32_t
in1_len,
+ const char* in2, int32_t in2_len) {
+ if (in1_len < 0 || in2_len < 0) {
+ gdv_fn_context_set_error_msg(context, "String length must be greater than
0");
+ return 0;
+ }
+
+ // Check input size 0
+ if (in1_len == 0) {
+ return in2_len;
+ }
+ if (in2_len == 0) {
+ return in1_len;
+ }
+
+ // A and B is one copy from lengths
+ int A;
+ int B;
+
+ // 'smaller' Is a variable to validate the inversion of entries without a
copy of them.
+ bool smaller = false;
+ if (in1_len < in2_len) {
+ smaller = true;
+ A = in2_len;
+ B = in1_len;
+ } else {
+ A = in1_len;
+ B = in2_len;
+ }
+
+ int* ptr = new int[(B + 1) * 2];
Review comment:
Memory allocation is for (len_smaller + 1) * 2
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]