This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0-beta in repository https://gitbox.apache.org/repos/asf/doris.git
commit 9e44e20be570bd9780f900bf739841b9a57804ba Author: ZhangYu0123 <[email protected]> AuthorDate: Mon Jun 5 12:43:14 2023 +0800 [Optimize](function) Optimize locate function by compare across strings (#20290) Optimize the locate function by comparing across strings. About a 90% speedup when tested with sum(). --- be/src/vec/functions/function_string.cpp | 44 +++++++++++++++------- .../string_functions/test_string_function.out | 24 +++++++++--- .../string_functions/test_string_function.groovy | 16 +++++--- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 670644bba4..dbfbff8800 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -205,28 +205,46 @@ struct StringInStrImpl { res.resize(size); if (rdata.size == 0) { - for (int i = 0; i < size; ++i) { - res[i] = 1; - } + std::fill(res.begin(), res.end(), 1); return Status::OK(); } + const UInt8* begin = ldata.data(); + const UInt8* end = begin + ldata.size(); + const UInt8* pos = begin; + + /// Current index in the array of strings. + size_t i = 0; + std::fill(res.begin(), res.end(), 0); + StringRef rstr_ref(rdata.data, rdata.size); StringSearch search(&rstr_ref); - for (int i = 0; i < size; ++i) { - const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1]; + while (pos < end) { + // search return matched substring start offset + pos = (UInt8*)search.search((char*)pos, end - pos); + if (pos >= end) { + break; + } - StringRef lstr_ref(l_raw_str, l_str_size); + /// Determine which index it refers to. + /// begin + value_offsets[i] is the start offset of string at i+1 + while (begin + loffsets[i] < pos) { + ++i; + } - // Hive returns positions starting from 1. 
- int loc = search.search(&lstr_ref); - if (loc > 0) { - size_t len = std::min(lstr_ref.size, (size_t)loc); - loc = simd::VStringFunctions::get_char_len(lstr_ref.data, len); + /// We check that the entry does not pass through the boundaries of strings. + if (pos + rdata.size <= begin + loffsets[i]) { + int loc = pos - begin - loffsets[i - 1]; + int l_str_size = loffsets[i] - loffsets[i - 1]; + size_t len = std::min(l_str_size, loc); + loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len); + res[i] = loc + 1; } - res[i] = loc + 1; + + // move to next string offset + pos = begin + loffsets[i]; + ++i; } return Status::OK(); diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out index 587319531e..b51fb32d61 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out @@ -122,18 +122,24 @@ A -- !sql -- AB --- !sql -- +-- !sql_instr -- 2 --- !sql -- +-- !sql_instr -- 0 --- !sql -- +-- !sql_instr -- \N --- !sql -- +-- !sql_instr -- \N +-- !sql_instr -- +1 + +-- !sql_instr -- +5 + -- !sql -- abc123 @@ -152,12 +158,18 @@ doris -- !sql -- 3 --- !sql -- +-- !sql_locate -- 4 --- !sql -- +-- !sql_locate -- 0 +-- !sql_locate -- +1 + +-- !sql_locate -- +5 + -- !sql -- xyxhi diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy index ae33e448e7..6a06992322 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy @@ -69,10 +69,12 @@ suite("test_string_function") { qt_sql "select unhex('41');" qt_sql "select 
unhex('4142');" - qt_sql "select instr(\"abc\", \"b\");" - qt_sql "select instr(\"abc\", \"d\");" - qt_sql "select instr(\"abc\", null);" - qt_sql "select instr(null, \"a\");" + qt_sql_instr "select instr(\"abc\", \"b\");" + qt_sql_instr "select instr(\"abc\", \"d\");" + qt_sql_instr "select instr(\"abc\", null);" + qt_sql_instr "select instr(null, \"a\");" + qt_sql_instr "SELECT instr('foobar', '');" + qt_sql_instr "SELECT instr('上海天津北京杭州', '北京');" qt_sql "SELECT lcase(\"AbC123\");" qt_sql "SELECT lower(\"AbC123\");" @@ -84,8 +86,10 @@ suite("test_string_function") { qt_sql "select length(\"abc\");" - qt_sql "SELECT LOCATE('bar', 'foobarbar');" - qt_sql "SELECT LOCATE('xbar', 'foobar');" + qt_sql_locate "SELECT LOCATE('bar', 'foobarbar');" + qt_sql_locate "SELECT LOCATE('xbar', 'foobar');" + qt_sql_locate "SELECT LOCATE('', 'foobar');" + qt_sql_locate "SELECT LOCATE('北京', '上海天津北京杭州');" qt_sql "SELECT lpad(\"hi\", 5, \"xy\");" qt_sql "SELECT lpad(\"hi\", 1, \"xy\");" --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
