Repository: incubator-impala Updated Branches: refs/heads/master 94fc6a3d0 -> 64ffbc4b5
IMPALA-3973: add position and occurrence to instr() Change-Id: Ie9648de458d243306fa14adc5e7f7002bf6f67fd Reviewed-on: http://gerrit.cloudera.org:8080/4094 Tested-by: Internal Jenkins Reviewed-by: Matthew Jacobs <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/64ffbc4b Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/64ffbc4b Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/64ffbc4b Branch: refs/heads/master Commit: 64ffbc4b5112b21e738a16218cb396c9b65ff70d Parents: 94fc6a3 Author: Zoltan Ivanfi <[email protected]> Authored: Tue Aug 23 13:26:05 2016 +0200 Committer: Matthew Jacobs <[email protected]> Committed: Tue Sep 13 20:28:27 2016 +0000 ---------------------------------------------------------------------- be/src/experiments/CMakeLists.txt | 2 +- be/src/experiments/string-search-sse-test.cc | 119 +++++++++++++++++++ be/src/experiments/string-search-test.cc | 123 -------------------- be/src/exprs/expr-test.cc | 65 +++++++++++ be/src/exprs/string-functions-ir.cc | 69 +++++++++-- be/src/exprs/string-functions.h | 5 + be/src/runtime/CMakeLists.txt | 1 + be/src/runtime/string-search-test.cc | 135 ++++++++++++++++++++++ be/src/runtime/string-search.h | 71 +++++++++++- common/function-registry/impala_functions.py | 3 + 10 files changed, 458 insertions(+), 135 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/be/src/experiments/CMakeLists.txt b/be/src/experiments/CMakeLists.txt index 743d2ab..e4d4895 100644 --- a/be/src/experiments/CMakeLists.txt +++ b/be/src/experiments/CMakeLists.txt @@ -38,4 +38,4 @@ target_link_libraries(tuple-splitter-test Experiments ${IMPALA_LINK_LIBS}) target_link_libraries(hash-partition-test ${IMPALA_LINK_LIBS}) target_link_libraries(compression-test ${IMPALA_LINK_LIBS}) -ADD_BE_TEST(string-search-test) +ADD_BE_TEST(string-search-sse-test) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/string-search-sse-test.cc ---------------------------------------------------------------------- diff --git a/be/src/experiments/string-search-sse-test.cc b/be/src/experiments/string-search-sse-test.cc new file mode 100644 index 0000000..e570a6b --- /dev/null +++ b/be/src/experiments/string-search-sse-test.cc @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include <string> + +#include "experiments/string-search-sse.h" + +#include "common/names.h" + +namespace impala { + +// Test string search for haystack/needle, up to len for each. +// If the length is -1, use the full string length +// haystack/needle are null terminated +void TestSearch(const char* haystack_orig, const char* needle_orig, int haystack_len = -1, + int needle_len = -1) { + string haystack_copy(haystack_orig); + string needle_copy(needle_orig); + const char* haystack = haystack_copy.c_str(); + const char* needle = needle_copy.c_str(); + + string haystack_buffer, needle_buffer; + + const char* haystack_null_terminated = haystack; + const char* needle_null_terminated = needle; + + // Make null terminated versions (for libc). + if (haystack_len != -1) { + haystack_buffer = string(haystack, haystack_len); + haystack_null_terminated = haystack_buffer.c_str(); + } else { + haystack_len = strlen(haystack); + } + + if (needle_len != -1) { + needle_buffer = string(needle, needle_len); + needle_null_terminated = needle_buffer.c_str(); + } else { + needle_len = strlen(needle); + } + + // Call libc for correct result + const char* libc_result = strstr(haystack_null_terminated, needle_null_terminated); + int libc_offset = (libc_result == NULL) ? -1 : libc_result - haystack_null_terminated; + + StringValue haystack_str_val(const_cast<char*>(haystack), haystack_len); + + // Call our strstr with null terminated needle + StringSearchSSE needle1(needle_null_terminated); + int null_offset = needle1.Search(haystack_str_val); + EXPECT_EQ(null_offset, libc_offset); + + // Call our strstr with non-null terminated needle + StringValue needle_str_val(const_cast<char*>(needle), needle_len); + StringSearchSSE needle2(&needle_str_val); + int not_null_offset = needle2.Search(haystack_str_val); + EXPECT_EQ(not_null_offset, libc_offset); + + // Ensure haystack/needle are unmodified + EXPECT_EQ(strlen(needle_null_terminated), needle_len); + EXPECT_EQ(strlen(haystack_null_terminated), haystack_len); + EXPECT_EQ(strcmp(haystack, haystack_orig), 0); + EXPECT_EQ(strcmp(needle, needle_orig), 0); +} + +TEST(StringSearchTest, Basic) { + // Basic Tests + TestSearch("abcd", "a"); + TestSearch("abcd", "ab"); + TestSearch("abcd", "c"); + TestSearch("abcd", "cd"); + TestSearch("", ""); + TestSearch("abc", ""); + TestSearch("", "a"); + + // Test single chars + TestSearch("a", "a"); + TestSearch("a", "b"); + TestSearch("abc", "b"); + + // Haystack is not full string + TestSearch("abcabcd", "abc", 5); + TestSearch("abcabcd", "abcd", 5); + TestSearch("abcabcd", "abcd", 0); + + // Haystack and needle not full len + TestSearch("abcde", "abaaaaaa", 4, 2); + TestSearch("abcde", "abaaaaaa", 4, 3); + TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3); + TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0); + TestSearch("abcdabaaaaa", "abaaaaaa", 0, 0); + + // Test last bit, this is the interesting edge case + TestSearch("abcde", "e"); + TestSearch("abcde", "de"); + TestSearch("abcdede", "cde"); + TestSearch("abcdacde", "cde"); +} +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/string-search-test.cc ---------------------------------------------------------------------- diff --git a/be/src/experiments/string-search-test.cc b/be/src/experiments/string-search-test.cc deleted file mode 100644 index cc46297..0000000 --- a/be/src/experiments/string-search-test.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <string> -#include <gtest/gtest.h> - -#include "experiments/string-search-sse.h" - -#include "common/names.h" - -namespace impala { - -// Test string search for haystack/needle, up to len for each. -// If the length is -1, use the full string length -// haystack/needle are null terminated -void TestSearch(const char* haystack_orig, const char* needle_orig, - int haystack_len = -1, int needle_len = -1) { - - string haystack_copy(haystack_orig); - string needle_copy(needle_orig); - const char* haystack = haystack_copy.c_str(); - const char* needle = needle_copy.c_str(); - - string haystack_buffer, needle_buffer; - - const char* haystack_null_terminated = haystack; - const char* needle_null_terminated = needle; - - // Make null terminated versions (for libc). - if (haystack_len != -1) { - haystack_buffer = string(haystack, haystack_len); - haystack_null_terminated = haystack_buffer.c_str(); - } else { - haystack_len = strlen(haystack); - } - - if (needle_len != -1) { - needle_buffer = string(needle, needle_len); - needle_null_terminated = needle_buffer.c_str(); - } else { - needle_len = strlen(needle); - } - - // Call libc for correct result - const char* libc_result = strstr(haystack_null_terminated, needle_null_terminated); - int libc_offset = (libc_result == NULL) ? -1 : libc_result - haystack_null_terminated; - - StringValue haystack_str_val(const_cast<char*>(haystack), haystack_len); - - // Call our strstr with null terminated needle - StringSearchSSE needle1(needle_null_terminated); - int null_offset = needle1.Search(haystack_str_val); - EXPECT_EQ(null_offset, libc_offset); - - // Call our strstr with non-null terminated needle - StringValue needle_str_val(const_cast<char*>(needle), needle_len); - StringSearchSSE needle2(&needle_str_val); - int not_null_offset = needle2.Search(haystack_str_val); - EXPECT_EQ(not_null_offset, libc_offset); - - // Ensure haystack/needle are unmodified - EXPECT_EQ(strlen(needle_null_terminated), needle_len); - EXPECT_EQ(strlen(haystack_null_terminated), haystack_len); - EXPECT_EQ(strcmp(haystack, haystack_orig), 0); - EXPECT_EQ(strcmp(needle, needle_orig), 0); -} - - -TEST(StringSearchTest, Basic) { - // Basic Tests - TestSearch("abcd", "a"); - TestSearch("abcd", "ab"); - TestSearch("abcd", "c"); - TestSearch("abcd", "cd"); - TestSearch("", ""); - TestSearch("abc", ""); - TestSearch("", "a"); - - // Test single chars - TestSearch("a", "a"); - TestSearch("a", "b"); - TestSearch("abc", "b"); - - // Haystack is not full string - TestSearch("abcabcd", "abc", 5); - TestSearch("abcabcd", "abcd", 5); - TestSearch("abcabcd", "abcd", 0); - - // Haystack and needle not full len - TestSearch("abcde", "abaaaaaa", 4, 2); - TestSearch("abcde", "abaaaaaa", 4, 3); - TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3); - TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0); - TestSearch("abcdabaaaaa", "abaaaaaa", 0, 0); - - // Test last bit, this is the interesting edge case - TestSearch("abcde", "e"); - TestSearch("abcde", "de"); - TestSearch("abcdede", "cde"); - TestSearch("abcdacde", "cde"); -} - -} - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 67bbe0a..8a19603 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -2064,9 +2064,74 @@ TEST_F(ExprTest, StringFunctions) { TestValue("instr('', '')", TYPE_INT, 0); TestValue("instr('', 'abc')", TYPE_INT, 0); TestValue("instr('abc', '')", TYPE_INT, 0); + TestValue("instr('abcdef', 'xyz')", TYPE_INT, 0); + TestValue("instr('xyz', 'abcdef')", TYPE_INT, 0); TestValue("instr('abc', 'abc')", TYPE_INT, 1); TestValue("instr('xyzabc', 'abc')", TYPE_INT, 4); TestValue("instr('xyzabcxyz', 'bcx')", TYPE_INT, 5); + + TestValue("instr('', '', -1)", TYPE_INT, 0); + TestValue("instr('', 'abc', -1)", TYPE_INT, 0); + TestValue("instr('abc', '', -1)", TYPE_INT, 0); + TestValue("instr('abc', 'abc', -1)", TYPE_INT, 1); + TestValue("instr('xyzabc', 'abc', -1)", TYPE_INT, 4); + TestValue("instr('xyzabcxyz', 'bcx', -1)", TYPE_INT, 5); + + TestValue("instr('corporate floor', 'or', 0)", TYPE_INT, 0); + TestValue("instr('corporate floor', 'or', 14)", TYPE_INT, 14); + TestValue("instr('corporate floor', 'or', 15)", TYPE_INT, 0); + TestValue("instr('corporate floor', 'or', -14)", TYPE_INT, 2); + TestValue("instr('corporate floor', 'or', -15)", TYPE_INT, 0); + + TestValue("instr('corporate floor', 'or', 0, 1)", TYPE_INT, 0); + TestValue("instr('corporate floor', 'or', 14, 1)", TYPE_INT, 14); + TestValue("instr('corporate floor', 'or', 15, 1)", TYPE_INT, 0); + TestValue("instr('corporate floor', 'or', -14, 1)", TYPE_INT, 2); + TestValue("instr('corporate floor', 'or', -15, 1)", TYPE_INT, 0); + + TestValue("instr('corporate floor', 'or', 2, 2)", TYPE_INT, 5); + TestValue("instr('corporate floor', 'or', 3, 2)", TYPE_INT, 14); + TestValue("instr('corporate floor', 'or', -3, 2)", TYPE_INT, 2); + TestValue("instr('corporate floor', 'or', -2, 2)", TYPE_INT, 5); + + TestValue("instr('corporate floor', 'or', 3, 3)", TYPE_INT, 0); + TestValue("instr('corporate floor', 'or', -3, 3)", TYPE_INT, 0); + TestValue("instr('corporate floor', '', 3, 3)", TYPE_INT, 0); + TestValue("instr('corporate floor', '', -3, 3)", TYPE_INT, 0); + + TestValue("instr('abababa', 'aba', 1, 1)", TYPE_INT, 1); + TestValue("instr('abababa', 'aba', 1, 2)", TYPE_INT, 3); + TestValue("instr('abababa', 'aba', 1, 3)", TYPE_INT, 5); + TestValue("instr('abababa', 'aba', cast(1 as bigint), cast(3 as bigint))", TYPE_INT, 5); + + TestError("instr('corporate floor', 'or', 0, 0)"); + TestError("instr('corporate floor', 'or', 0, -1)"); + TestError("instr('corporate floor', 'or', 1, 0)"); + TestError("instr('corporate floor', 'or', 1, -1)"); + + TestIsNull("instr(NULL, 'or', 2)", TYPE_INT); + TestIsNull("instr('corporate floor', NULL, 2)", TYPE_INT); + TestIsNull("instr('corporate floor', 'or', NULL)", TYPE_INT); + + TestIsNull("instr(NULL, 'or', 2, 2)", TYPE_INT); + TestIsNull("instr('corporate floor', NULL, 2, 2)", TYPE_INT); + TestIsNull("instr('corporate floor', 'or', NULL, 2)", TYPE_INT); + TestIsNull("instr('corporate floor', 'or', 2, NULL)", TYPE_INT); + + TestValue("instr('a', 'a', 1, 1)", TYPE_INT, 1); + TestValue("instr('axyz', 'a', 1, 1)", TYPE_INT, 1); + TestValue("instr('xyza', 'a', 1, 1)", TYPE_INT, 4); + TestValue("instr('a', 'a', 1, 2)", TYPE_INT, 0); + TestValue("instr('axyz', 'a', 1, 2)", TYPE_INT, 0); + TestValue("instr('xyza', 'a', 1, 2)", TYPE_INT, 0); + + TestValue("instr('a', 'a', -1, 1)", TYPE_INT, 1); + TestValue("instr('axyz', 'a', -1, 1)", TYPE_INT, 1); + TestValue("instr('xyza', 'a', -1, 1)", TYPE_INT, 4); + TestValue("instr('a', 'a', -1, 2)", TYPE_INT, 0); + TestValue("instr('axyz', 'a', -1, 2)", TYPE_INT, 0); + TestValue("instr('xyza', 'a', -1, 2)", TYPE_INT, 0); + TestIsNull("instr(NULL, 'bcx')", TYPE_INT); TestIsNull("instr('xyzabcxyz', NULL)", TYPE_INT); TestIsNull("instr(NULL, NULL)", TYPE_INT); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/string-functions-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc index 06b16af..39cf898 100644 --- a/be/src/exprs/string-functions-ir.cc +++ b/be/src/exprs/string-functions-ir.cc @@ -284,13 +284,68 @@ IntVal StringFunctions::Ascii(FunctionContext* context, const StringVal& str) { } IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str, - const StringVal& substr) { - if (str.is_null || substr.is_null) return IntVal::null(); - StringValue str_sv = StringValue::FromStringVal(str); - StringValue substr_sv = StringValue::FromStringVal(substr); - StringSearch search(&substr_sv); - // Hive returns positions starting from 1. - return IntVal(search.Search(&str_sv) + 1); + const StringVal& substr, const BigIntVal& start_position, + const BigIntVal& occurrence) { + if (str.is_null || substr.is_null || start_position.is_null || occurrence.is_null) { + return IntVal::null(); + } + if (occurrence.val <= 0) { + stringstream ss; + ss << "Invalid occurrence parameter to instr function: " << occurrence.val; + context->SetError(ss.str().c_str()); + return IntVal(0); + } + if (start_position.val == 0) return IntVal(0); + + StringValue haystack = StringValue::FromStringVal(str); + StringValue needle = StringValue::FromStringVal(substr); + StringSearch search(&needle); + if (start_position.val > 0) { + // A positive starting position indicates regular searching from the left. + int search_start_pos = start_position.val - 1; + if (search_start_pos >= haystack.len) return IntVal(0); + int match_pos = -1; + for (int match_num = 0; match_num < occurrence.val; ++match_num) { + DCHECK_LE(search_start_pos, haystack.len); + StringValue haystack_substring = haystack.Substring(search_start_pos); + int match_pos_in_substring = search.Search(&haystack_substring); + if (match_pos_in_substring < 0) return IntVal(0); + match_pos = search_start_pos + match_pos_in_substring; + search_start_pos = match_pos + 1; + } + // Return positions starting from 1 at the leftmost position. + return IntVal(match_pos + 1); + } else { + // A negative starting position indicates searching from the right. + int search_start_pos = haystack.len + start_position.val; + // The needle must fit between search_start_pos and the end of the string + if (search_start_pos + needle.len > haystack.len) { + search_start_pos = haystack.len - needle.len; + } + if (search_start_pos < 0) return IntVal(0); + int match_pos = -1; + for (int match_num = 0; match_num < occurrence.val; ++match_num) { + DCHECK_GE(search_start_pos + needle.len, 0); + DCHECK_LE(search_start_pos + needle.len, haystack.len); + StringValue haystack_substring = + haystack.Substring(0, search_start_pos + needle.len); + match_pos = search.RSearch(&haystack_substring); + if (match_pos < 0) return IntVal(0); + search_start_pos = match_pos - 1; + } + // Return positions starting from 1 at the leftmost position. + return IntVal(match_pos + 1); + } +} + +IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str, + const StringVal& substr, const BigIntVal& start_position) { + return Instr(context, str, substr, start_position, BigIntVal(1)); +} + +IntVal StringFunctions::Instr( + FunctionContext* context, const StringVal& str, const StringVal& substr) { + return Instr(context, str, substr, BigIntVal(1), BigIntVal(1)); } IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/string-functions.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h index 85fda0e..fb3a900 100644 --- a/be/src/exprs/string-functions.h +++ b/be/src/exprs/string-functions.h @@ -60,7 +60,12 @@ class StringFunctions { static StringVal Ltrim(FunctionContext*, const StringVal& str); static StringVal Rtrim(FunctionContext*, const StringVal& str); static IntVal Ascii(FunctionContext*, const StringVal& str); + static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr, + const BigIntVal& start_position, const BigIntVal& occurrence); + static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr, + const BigIntVal& start_position); static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr); + static IntVal Locate(FunctionContext*, const StringVal& substr, const StringVal& str); static IntVal LocatePos(FunctionContext*, const StringVal& substr, const StringVal& str, const BigIntVal& start_pos); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index f0f722d..6bdce48 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -78,6 +78,7 @@ ADD_BE_TEST(disk-io-mgr-test) ADD_BE_TEST(buffered-block-mgr-test) ADD_BE_TEST(parallel-executor-test) ADD_BE_TEST(raw-value-test) +ADD_BE_TEST(string-search-test) ADD_BE_TEST(string-value-test) ADD_BE_TEST(thread-resource-mgr-test) ADD_BE_TEST(mem-tracker-test) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/string-search-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/string-search-test.cc b/be/src/runtime/string-search-test.cc new file mode 100644 index 0000000..a7804cc --- /dev/null +++ b/be/src/runtime/string-search-test.cc @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include "runtime/string-search.h" + +namespace impala { + +StringValue StrValFromCString(const char* str, int len) { + if (len == -1) len = strlen(str); + return StringValue(const_cast<char*>(str), len); +} + +// Test string search for haystack/needle, up to len for each. +// If the length is -1, use the full string length. +// Searching in a substring allows checking whether the search respects the +// length stored in the StringValue or also reads parts of the string which +// should be excluded. +int TestSearch(const char* haystack, const char* needle, int haystack_len = -1, + int needle_len = -1) { + StringValue needle_str_val = StrValFromCString(needle, needle_len); + StringValue haystack_str_val = StrValFromCString(haystack, haystack_len); + StringSearch search(&needle_str_val); + return search.Search(&haystack_str_val); +} + +// Test reverse string search for haystack/needle, up to len for each. +// If the length is -1, use the full string length. +// Searching in a substring allows checking whether the search respects the +// length stored in the StringValue or also reads parts of the string which +// should be excluded. +int TestRSearch(const char* haystack, const char* needle, int haystack_len = -1, + int needle_len = -1) { + StringValue needle_str_val = StrValFromCString(needle, needle_len); + StringValue haystack_str_val = StrValFromCString(haystack, haystack_len); + StringSearch search(&needle_str_val); + return search.RSearch(&haystack_str_val); +} + +TEST(StringSearchTest, RegularSearch) { + // Basic Tests + EXPECT_EQ(0, TestSearch("abcdabcd", "a")); + EXPECT_EQ(0, TestSearch("abcdabcd", "ab")); + EXPECT_EQ(2, TestSearch("abcdabcd", "c")); + EXPECT_EQ(2, TestSearch("abcdabcd", "cd")); + EXPECT_EQ(-1, TestSearch("", "")); + EXPECT_EQ(-1, TestSearch("abc", "")); + EXPECT_EQ(-1, TestSearch("", "a")); + + // Test single chars + EXPECT_EQ(0, TestSearch("a", "a")); + EXPECT_EQ(-1, TestSearch("a", "b")); + EXPECT_EQ(1, TestSearch("abc", "b")); + + // Test searching in a substring of the haystack + EXPECT_EQ(0, TestSearch("abcabcd", "abc", 5)); + EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 5)); + EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 0)); + + // Test searching a substring of the needle in a substring of the haystack. + EXPECT_EQ(0, TestSearch("abcde", "abaaaaaa", 4, 2)); + EXPECT_EQ(-1, TestSearch("abcde", "abaaaaaa", 4, 3)); + EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3)); + EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0)); + EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa")); + + // Test with needle matching the end of the substring. + EXPECT_EQ(4, TestSearch("abcde", "e")); + EXPECT_EQ(3, TestSearch("abcde", "de")); + EXPECT_EQ(2, TestSearch("abcdede", "cde")); + EXPECT_EQ(5, TestSearch("abcdacde", "cde")); + + // Test correct skip_ handling, which depends on which char of the needle is + // the same as the last one. + EXPECT_EQ(2, TestSearch("ababcacac", "abcacac")); +} + +TEST(StringSearchTest, ReverseSearch) { + // Basic Tests + EXPECT_EQ(4, TestRSearch("abcdabcd", "a")); + EXPECT_EQ(4, TestRSearch("abcdabcd", "ab")); + EXPECT_EQ(6, TestRSearch("abcdabcd", "c")); + EXPECT_EQ(6, TestRSearch("abcdabcd", "cd")); + EXPECT_EQ(-1, TestRSearch("", "")); + EXPECT_EQ(-1, TestRSearch("abc", "")); + EXPECT_EQ(-1, TestRSearch("", "a")); + + // Test single chars + EXPECT_EQ(0, TestRSearch("a", "a")); + EXPECT_EQ(-1, TestRSearch("a", "b")); + EXPECT_EQ(1, TestRSearch("abc", "b")); + + // Test searching in a substring of the haystack + EXPECT_EQ(0, TestRSearch("abcabcd", "abc", 5)); + EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 5)); + EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 0)); + + // Test searching a substring of the needle in a substring of the haystack. + EXPECT_EQ(0, TestRSearch("abcde", "abaaaaaa", 4, 2)); + EXPECT_EQ(-1, TestRSearch("abcde", "abaaaaaa", 4, 3)); + EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 3)); + EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 0)); + EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa")); + + // Test with needle matching the end of the substring. + EXPECT_EQ(4, TestRSearch("abcde", "e")); + EXPECT_EQ(3, TestRSearch("abcde", "de")); + EXPECT_EQ(2, TestRSearch("abcdede", "cde")); + EXPECT_EQ(5, TestRSearch("abcdacde", "cde")); + + // Test correct rskip_ handling, which depends on which char of the needle is + // the same as the first one. + EXPECT_EQ(0, TestRSearch("cacacbaba", "cacacba")); +} +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/string-search.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/string-search.h b/be/src/runtime/string-search.h index e1a45e0..5bb5d02 100644 --- a/be/src/runtime/string-search.h +++ b/be/src/runtime/string-search.h @@ -15,12 +15,13 @@ // specific language governing permissions and limitations // under the License. - #ifndef IMPALA_RUNTIME_STRING_SEARCH_H #define IMPALA_RUNTIME_STRING_SEARCH_H -#include <vector> +#include <algorithm> #include <cstring> +#include <vector> + #include <boost/cstdint.hpp> #include "common/logging.h" @@ -44,7 +45,8 @@ class StringSearch { StringSearch() : pattern_(NULL), mask_(0) {} /// Initialize/Precompute a StringSearch object from the pattern - StringSearch(const StringValue* pattern) : pattern_(pattern), mask_(0), skip_(0) { + StringSearch(const StringValue* pattern) + : pattern_(pattern), mask_(0), skip_(0), rskip_(0) { // Special cases if (pattern_->len <= 1) { return; @@ -53,13 +55,25 @@ class StringSearch { // Build compressed lookup table int mlast = pattern_->len - 1; skip_ = mlast - 1; + rskip_ = mlast - 1; + // In the Python implementation, building the bloom filter happens at the + // beginning of the Search operation. We build it during construction + // instead, so that the same StringSearch instance can be reused multiple + // times without rebuilding the bloom filter every time. for (int i = 0; i < mlast; ++i) { BloomAdd(pattern_->ptr[i]); if (pattern_->ptr[i] == pattern_->ptr[mlast]) skip_ = mlast - i - 1; } BloomAdd(pattern_->ptr[mlast]); + + // The order of iteration doesn't have any effect on the bloom filter, but + // it does on skip_. For this reason we need to calculate a separate rskip_ + // for reverse search. + for (int i = mlast; i > 0; i--) { + if (pattern_->ptr[i] == pattern_->ptr[0]) rskip_ = i - 1; + } } /// Search for this pattern in str. @@ -114,6 +128,54 @@ class StringSearch { return -1; } + /// Search for this pattern in str backwards. + /// Returns the offset into str if the pattern exists + /// Returns -1 if the pattern is not found + int RSearch(const StringValue* str) const { + // Special cases + if (str == NULL || pattern_ == NULL || pattern_->len == 0) { + return -1; + } + + int mlast = pattern_->len - 1; + int w = str->len - pattern_->len; + int n = str->len; + int m = pattern_->len; + const char* s = str->ptr; + const char* p = pattern_->ptr; + + // Special case if pattern->len == 1 + if (m == 1) { + const char* result = reinterpret_cast<const char*>(memrchr(s, p[0], n)); + if (result != NULL) return result - s; + return -1; + } + + // General case. + int j; + for (int i = w; i >= 0; i--) { + if (s[i] == p[0]) { + // candidate match + for (j = mlast; j > 0; j--) + if (s[i + j] != p[j]) break; + if (j == 0) { + return i; + } + // miss: check if previous character is part of pattern + if (i > 0 && !BloomQuery(s[i - 1])) + i = i - m; + else + i = i - rskip_; + } else { + // skip: check if previous character is part of pattern + if (i > 0 && !BloomQuery(s[i - 1])) { + i = i - m; + } + } + } + return -1; + } + private: static const int BLOOM_WIDTH = 64; @@ -127,7 +189,8 @@ class StringSearch { const StringValue* pattern_; int64_t mask_; - int64_t skip_; + int skip_; + int rskip_; }; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/common/function-registry/impala_functions.py ---------------------------------------------------------------------- diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py index 10aac10..2141a35 100644 --- a/common/function-registry/impala_functions.py +++ b/common/function-registry/impala_functions.py @@ -431,6 +431,9 @@ visible_functions = [ [['rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim'], [['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'], [['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'], + [['instr'], 'INT', ['STRING', 'STRING', 'BIGINT'], 'impala::StringFunctions::Instr'], + [['instr'], 'INT', ['STRING', 'STRING', 'BIGINT', 'BIGINT'], + 'impala::StringFunctions::Instr'], [['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'], [['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'], 'impala::StringFunctions::LocatePos'],
