This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 13d4f79d975 [opt](inverted index) the "unicode" tokenizer can be
configured to disable stop words #33982 (#34376)
13d4f79d975 is described below
commit 13d4f79d9750bc270f497c0aa8fa44913e7d74de
Author: zzzxl <[email protected]>
AuthorDate: Fri May 3 12:04:26 2024 +0800
[opt](inverted index) the "unicode" tokenizer can be configured to disable
stop words #33982 (#34376)
---
be/src/clucene | 2 +-
be/src/olap/inverted_index_parser.cpp | 9 ++++
be/src/olap/inverted_index_parser.h | 5 ++
.../char_filter/char_replace_char_filter.h | 4 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 30 ++++++++---
.../olap/rowset/segment_v2/inverted_index_reader.h | 11 ++++
.../rowset/segment_v2/inverted_index_writer.cpp | 26 +++++++---
be/src/vec/functions/function_tokenize.cpp | 4 ++
.../apache/doris/analysis/InvertedIndexUtil.java | 11 +++-
.../data/inverted_index_p0/test_stopwords.out | 23 +++++++++
.../data/inverted_index_p0/test_tokenize.out | 6 +++
.../inverted_index_p0/test_compound_inlist.groovy | 40 +++++++--------
.../suites/inverted_index_p0/test_stopwords.groovy | 60 ++++++++++++++++++++++
.../suites/inverted_index_p0/test_tokenize.groovy | 3 ++
14 files changed, 199 insertions(+), 35 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 847f4609bf2..f10bc3f512f 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 847f4609bf2d2359a256cb8b56430a58043f257c
+Subproject commit f10bc3f512f66cfd4fc633cbd0dbeecf852e94eb
diff --git a/be/src/olap/inverted_index_parser.cpp
b/be/src/olap/inverted_index_parser.cpp
index 3f8d4f9c1be..d6128007be1 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -126,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
}
}
+// Returns the value stored under INVERTED_INDEX_PARSER_STOPWORDS_KEY in
+// `properties`, or an empty string when the key is absent.
+std::string get_parser_stopwords_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    // A single find() serves both the presence check and the value access,
+    // instead of a find() followed by a second lookup through at().
+    const auto it = properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
+    if (it == properties.end()) {
+        return "";
+    }
+    return it->second;
+}
+
} // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 8d79f7bbbd9..d78d2e29487 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -77,6 +77,8 @@ const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE =
"256";
const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
+// Index property key controlling stop-word handling. The analyzer setup code
+// in this patch passes nullptr to set_stopwords() when the value is "none"
+// and installs the standard95 stop-word set for any other value.
+const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+
std::string inverted_index_parser_type_to_string(InvertedIndexParserType
parser_type);
InvertedIndexParserType get_inverted_index_parser_type_from_string(const
std::string& parser_str);
@@ -111,4 +113,7 @@ std::string get_parser_lowercase_from_properties(
}
}
+// Returns the "stopwords" entry of `properties`, or an empty string when the
+// key is not present.
+std::string get_parser_stopwords_from_properties(
+ const std::map<std::string, std::string>& properties);
+
} // namespace doris
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
index 2867890b3e0..d9e5080d2d5 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
@@ -28,12 +28,14 @@ class CharReplaceCharFilter : public
lucene::analysis::CharFilter {
public:
CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern,
const std::string& replacement);
- virtual ~CharReplaceCharFilter() = default;
+ ~CharReplaceCharFilter() override = default;
void init(const void* _value, int32_t _length, bool copyData) override;
int32_t read(const void** start, int32_t min, int32_t max) override;
int32_t readCopy(void* start, int32_t off, int32_t len) override;
+ // Overrides size() to report _buf.size().
+ size_t size() override { return _buf.size(); }
+
private:
void fill();
void process_pattern(std::string& buf);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 3da6f7b9a76..6b17f2ceeab 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -290,12 +290,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
inverted_index_ctx->char_filter_map =
get_parser_char_filter_map_from_properties(_index_meta.properties());
auto analyzer = create_analyzer(inverted_index_ctx.get());
- auto lowercase =
get_parser_lowercase_from_properties(_index_meta.properties());
- if (lowercase == "true") {
- analyzer->set_lowercase(true);
- } else if (lowercase == "false") {
- analyzer->set_lowercase(false);
- }
+ setup_analyzer_lowercase(analyzer, _index_meta.properties());
+ setup_analyzer_use_stopwords(analyzer, _index_meta.properties());
auto reader = create_reader(inverted_index_ctx.get(), search_str);
inverted_index_ctx->analyzer = analyzer.get();
get_analyse_result(analyse_result, reader.get(), analyzer.get(),
column_name,
@@ -597,6 +593,28 @@ InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}
+// Applies the lower-case property found in `properties` to `analyzer`.
+// set_lowercase() is only called when the property equals
+// INVERTED_INDEX_PARSER_TRUE or INVERTED_INDEX_PARSER_FALSE; for any other
+// value the analyzer is left as it is.
+void FullTextIndexReader::setup_analyzer_lowercase(
+        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+        const std::map<string, string>& properties) {
+    const auto lowercase_opt = get_parser_lowercase_from_properties(properties);
+    const bool enable = (lowercase_opt == INVERTED_INDEX_PARSER_TRUE);
+    if (enable || lowercase_opt == INVERTED_INDEX_PARSER_FALSE) {
+        analyzer->set_lowercase(enable);
+    }
+}
+
+// Configures the stop-word set of `analyzer` from `properties`: a "stopwords"
+// value of "none" passes nullptr to set_stopwords(); every other value
+// (including an absent key, which reads as "") installs
+// lucene::analysis::standard95::stop_words.
+void FullTextIndexReader::setup_analyzer_use_stopwords(
+        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+        const std::map<string, string>& properties) {
+    const bool disable_stopwords = get_parser_stopwords_from_properties(properties) == "none";
+    if (!disable_stopwords) {
+        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+        return;
+    }
+    analyzer->set_stopwords(nullptr);
+}
+
Status StringTypeInvertedIndexReader::new_iterator(
OlapReaderStatistics* stats, RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 117cd0fedd5..9c8d9dabf79 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -145,6 +145,12 @@ public:
InvertedIndexReaderType type() override;
+ // Applies the lower-case property from `properties` to `analyzer`. Static:
+ // function_tokenize.cpp calls it as FullTextIndexReader::setup_analyzer_lowercase.
+ static void
setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+ const std::map<string, string>&
properties);
+
+ // Selects the stop-word set of `analyzer` from the "stopwords" property in
+ // `properties` ("none" -> nullptr, anything else -> standard95 stop words).
+ static void
setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>&
analyzer,
+ const std::map<string, string>&
properties);
+
private:
Status normal_index_search(OlapReaderStatistics* stats,
InvertedIndexQueryType query_type,
const IndexSearcherPtr& index_searcher,
@@ -274,6 +280,11 @@ public:
InvertedIndexReaderType type() override;
Status get_bkd_reader(std::shared_ptr<lucene::util::bkd::bkd_reader>*
reader);
+ // NOTE(review): these two declarations repeat the FullTextIndexReader helpers
+ // in what appears to be a different reader class; this patch shows no
+ // definitions for them in this class — confirm they are defined and used,
+ // or drop them.
+ static void
setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+ const std::map<string, string>&
properties);
+ static void
setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>&
analyzer,
+ const std::map<string, string>&
properties);
+
private:
const TypeInfo* _type_info {};
const KeyCoder* _value_key_coder {};
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 10800a9942a..9fec0a7ecf3 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -210,12 +210,8 @@ public:
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
_analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
- auto lowercase =
get_parser_lowercase_from_properties<true>(_index_meta->properties());
- if (lowercase == "true") {
- _analyzer->set_lowercase(true);
- } else if (lowercase == "false") {
- _analyzer->set_lowercase(false);
- }
+ setup_analyzer_lowercase(_analyzer);
+ setup_analyzer_use_stopwords(_analyzer);
} catch (CLuceneError& e) {
return
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
@@ -248,6 +244,24 @@ public:
return Status::OK();
}
+    // Applies the lower-case property read from _index_meta to `analyzer`.
+    // set_lowercase() is only called when the property equals
+    // INVERTED_INDEX_PARSER_TRUE or INVERTED_INDEX_PARSER_FALSE.
+    void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
+        const auto lowercase_opt =
+                get_parser_lowercase_from_properties<true>(_index_meta->properties());
+        const bool enable = (lowercase_opt == INVERTED_INDEX_PARSER_TRUE);
+        if (enable || lowercase_opt == INVERTED_INDEX_PARSER_FALSE) {
+            analyzer->set_lowercase(enable);
+        }
+    }
+
+    // Configures the stop-word set of `analyzer` from the "stopwords" property
+    // of _index_meta: "none" passes nullptr to set_stopwords(); any other value
+    // installs lucene::analysis::standard95::stop_words.
+    void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
+        const bool disable_stopwords =
+                get_parser_stopwords_from_properties(_index_meta->properties()) == "none";
+        if (!disable_stopwords) {
+            analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+            return;
+        }
+        analyzer->set_stopwords(nullptr);
+    }
+
Status add_document() {
try {
_index_writer->addDocument(_doc.get());
diff --git a/be/src/vec/functions/function_tokenize.cpp
b/be/src/vec/functions/function_tokenize.cpp
index 1e7a5d3c9bb..5f362bb1323 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -26,6 +26,7 @@
#include "CLucene/StdHeader.h"
#include "CLucene/config/repl_wchar.h"
#include "olap/inverted_index_parser.h"
+#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
@@ -149,6 +150,9 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
return
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
"inverted index create analyzer failed: {}", e.what());
}
+
doris::segment_v2::FullTextIndexReader::setup_analyzer_lowercase(analyzer,
properties);
+
doris::segment_v2::FullTextIndexReader::setup_analyzer_use_stopwords(analyzer,
+
properties);
inverted_index_ctx.analyzer = analyzer.get();
_do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column,
dest_offsets,
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 05464a148f5..ea06db40c1d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -52,6 +52,8 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
+ public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+
public static String getInvertedIndexParser(Map<String, String>
properties) {
String parser = properties == null ? null :
properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
@@ -136,7 +138,8 @@ public class InvertedIndexUtil {
INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
- INVERTED_INDEX_PARSER_LOWERCASE_KEY
+ INVERTED_INDEX_PARSER_LOWERCASE_KEY,
+ INVERTED_INDEX_PARSER_STOPWORDS_KEY
));
for (String key : properties.keySet()) {
@@ -152,6 +155,7 @@ public class InvertedIndexUtil {
String charFilterPattern =
properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
String ignoreAbove =
properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+ String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
if (parser != null &&
!parser.matches("none|english|unicode|chinese|standard")) {
throw new AnalysisException("Invalid inverted index 'parser'
value: " + parser
@@ -194,5 +198,10 @@ public class InvertedIndexUtil {
throw new AnalysisException(
"Invalid inverted index 'lower_case' value: " + lowerCase
+ ", lower_case must be true or false");
}
+
+ if (stopWords != null && !stopWords.matches("none")) {
+ throw new AnalysisException("Invalid inverted index 'stopWords'
value: " + stopWords
+ + ", stopWords must be none");
+ }
}
}
diff --git a/regression-test/data/inverted_index_p0/test_stopwords.out
b/regression-test/data/inverted_index_p0/test_stopwords.out
new file mode 100644
index 00000000000..ba4940bcc5b
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_stopwords.out
@@ -0,0 +1,23 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1 华夏智胜新税股票A 华夏智胜新税股票A
+2 Life is like a box of chocolates, you never know what you are going to
get. Life is like a box of chocolates, you never know what you are going to
get.
+
+-- !sql --
+2 Life is like a box of chocolates, you never know what you are going to
get. Life is like a box of chocolates, you never know what you are going to
get.
+
+-- !sql --
+2 Life is like a box of chocolates, you never know what you are going to
get. Life is like a box of chocolates, you never know what you are going to
get.
+
+-- !sql --
+2 Life is like a box of chocolates, you never know what you are going to
get. Life is like a box of chocolates, you never know what you are going to
get.
+
+-- !sql --
+2 Life is like a box of chocolates, you never know what you are going to
get. Life is like a box of chocolates, you never know what you are going to
get.
+
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out
b/regression-test/data/inverted_index_p0/test_tokenize.out
index 350e218f575..a3984ca9105 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -22,3 +22,9 @@
-- !tokenize_sql --
["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
+-- !tokenize_sql --
+["华", "夏", "智", "胜", "新", "税", "股", "票"]
+
+-- !tokenize_sql --
+["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]
+
diff --git
a/regression-test/suites/inverted_index_p0/test_compound_inlist.groovy
b/regression-test/suites/inverted_index_p0/test_compound_inlist.groovy
index f3883819f49..cf18a9d6e68 100644
--- a/regression-test/suites/inverted_index_p0/test_compound_inlist.groovy
+++ b/regression-test/suites/inverted_index_p0/test_compound_inlist.groovy
@@ -104,35 +104,35 @@ suite("test_compound_inlist", "p0"){
sql "sync"
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status not in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status not in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status not in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) and status not in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) or status in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) or status in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) or status in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) or status in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName1} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) or status in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName2} where (((request
match_phrase 'images' and clientip match_phrase '3') or (request match_phrase
'english' and clientip match_phrase '4')) or status in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) or status in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) or status in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName1} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName2} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName1} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName2} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 200)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 200)); """
- qt_sql """ select count() from ${indexTbName1} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName2} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status not in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status not
in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status not
in (1, 2, 304)); """
- qt_sql """ select count() from ${indexTbName1} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304, 200) and status not in (1, 2,
304)); """
- qt_sql """ select count() from ${indexTbName2} where ((request
match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'jpg'
and clientip match_phrase '2')) or (((request match_phrase 'images' and
clientip match_phrase '3') or (request match_phrase 'english' and clientip
match_phrase '4')) and status in (1, 2, 304, 200) and status not in (1, 2,
304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName1} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 304, 200) and status not in (1, 2, 304)); """
+ qt_sql """ select /*+ SET_VAR(inverted_index_skip_threshold = 0) */
count() from ${indexTbName2} where ((request match_phrase 'hm' and clientip
match_phrase '1') or (request match_phrase 'jpg' and clientip match_phrase
'2')) or (((request match_phrase 'images' and clientip match_phrase '3') or
(request match_phrase 'english' and clientip match_phrase '4')) and status in
(1, 2, 304, 200) and status not in (1, 2, 304)); """
} finally {
//try_sql("DROP TABLE IF EXISTS ${testTable}")
diff --git a/regression-test/suites/inverted_index_p0/test_stopwords.groovy
b/regression-test/suites/inverted_index_p0/test_stopwords.groovy
new file mode 100644
index 00000000000..4f7c577dc57
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_stopwords.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+// Regression suite for the "stopwords" inverted-index property. It builds one
+// table with two unicode-parser indexes over identical text: b_idx with the
+// default stop-word handling and c_idx with "stopwords" = "none", then runs
+// the same match / match_phrase queries against both columns.
+suite("test_stopwords", "p0"){
+ def indexTbName = "test_stopwords"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName}"
+
+ // Columns b and c hold the same values; only the index properties differ.
+ sql """
+ CREATE TABLE ${indexTbName} (
+ `a` int(11) NULL COMMENT "",
+ `b` text NULL COMMENT "",
+ `c` text NULL COMMENT "",
+ INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "unicode")
COMMENT '',
+ INDEX c_idx (`c`) USING INVERTED PROPERTIES("parser" = "unicode",
"stopwords" = "none") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`a`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ // Two rows: a Chinese string ending in "A", and an English sentence
+ // containing the words "a", "are" and "to" that the queries below search for.
+ sql """ INSERT INTO ${indexTbName} VALUES (1, "华夏智胜新税股票A", "华夏智胜新税股票A");
"""
+ sql """ INSERT INTO ${indexTbName} VALUES (2, "Life is like a box of
chocolates, you never know what you are going to get. ", "Life is like a box of
chocolates, you never know what you are going to get. "); """
+
+ try {
+ sql "sync"
+
+ // Default stop words (column b): the recorded test_stopwords.out
+ // expects no rows for these three queries.
+ qt_sql """ select * from ${indexTbName} where b match 'a'; """
+ qt_sql """ select * from ${indexTbName} where b match 'are'; """
+ qt_sql """ select * from ${indexTbName} where b match 'to'; """
+
+ // Stop words disabled (column c): the recorded output expects rows 1 and 2
+ // for 'a', and row 2 for 'are' and 'to'.
+ qt_sql """ select * from ${indexTbName} where c match 'a'; """
+ qt_sql """ select * from ${indexTbName} where c match 'are'; """
+ qt_sql """ select * from ${indexTbName} where c match 'to'; """
+
+ // Phrase containing the word 'a': the recorded output expects row 2 for
+ // both columns.
+ qt_sql """ select * from ${indexTbName} where b match_phrase 'like a
box'; """
+ qt_sql """ select * from ${indexTbName} where c match_phrase 'like a
box'; """
+
+ } finally {
+ // Cleanup is disabled. NOTE(review): the commented-out statement names
+ // ${testTable}, which this suite never defines (the table is ${indexTbName}).
+ //try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+}
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 5b5c4f02a44..2fd825d934f 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -93,4 +93,7 @@ suite("test_tokenize"){
qt_sql "SELECT TOKENIZE(c,
\"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3";
qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0
test:abc=bcd','"parser"="unicode","char_filter_type" =
"char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = "
"');"""
+
+ qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A',
'"parser"="unicode","stopwords" = "none"');"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]