(doris) branch master updated: [opt](inverted index) the "unicode" tokenizer can be configured to disable stop words. (#33982)

jianliangqi Mon, 06 May 2024 01:08:29 -0700

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new fa4d035ccaa [opt](inverted index) the "unicode" tokenizer can be 
configured to disable stop words. (#33982)
fa4d035ccaa is described below

commit fa4d035ccaab07b3c7f7b8a10c9dd3043bd2a90f
Author: zzzxl <[email protected]>
AuthorDate: Mon May 6 16:08:17 2024 +0800

    [opt](inverted index) the "unicode" tokenizer can be configured to disable 
stop words. (#33982)
    
    1. properties: "parser" = "unicode", "stopwords" = "none" disables stop 
words.
---
 be/src/clucene                                     |  2 +-
 be/src/olap/inverted_index_parser.cpp              |  9 ++++
 be/src/olap/inverted_index_parser.h                |  5 ++
 .../char_filter/char_replace_char_filter.h         |  4 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    | 30 ++++++++---
 .../olap/rowset/segment_v2/inverted_index_reader.h |  5 ++
 .../rowset/segment_v2/inverted_index_writer.cpp    | 14 ++++-
 be/src/vec/functions/function_tokenize.cpp         |  4 ++
 .../apache/doris/analysis/InvertedIndexUtil.java   | 11 +++-
 .../data/inverted_index_p0/test_stopwords.out      | 23 +++++++++
 .../data/inverted_index_p0/test_tokenize.out       |  6 +++
 .../suites/inverted_index_p0/test_stopwords.groovy | 60 ++++++++++++++++++++++
 .../suites/inverted_index_p0/test_tokenize.groovy  |  3 ++
 13 files changed, 165 insertions(+), 11 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 9f849a47f70..d3de160871d 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 9f849a47f70625a57fedbaa1f5a6f89bc8f32967
+Subproject commit d3de160871dc1e2e293e5702e5b870e220ed42e4
diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index 07a587dd2dd..a9ed7ec062e 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -126,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties(
     }
 }
 
+std::string get_parser_stopwords_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    if (properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY) != 
properties.end()) {
+        return properties.at(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
+    } else {
+        return "";
+    }
+}
+
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index 2eff4d7caf3..87ea7267237 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -79,6 +79,8 @@ const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = 
"256";
 
 const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
 
+const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType 
parser_type);
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const 
std::string& parser_str);
@@ -112,4 +114,7 @@ std::string get_parser_lowercase_from_properties(
     }
 }
 
+std::string get_parser_stopwords_from_properties(
+        const std::map<std::string, std::string>& properties);
+
 } // namespace doris
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
index 2867890b3e0..d9e5080d2d5 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
@@ -28,12 +28,14 @@ class CharReplaceCharFilter : public 
lucene::analysis::CharFilter {
 public:
     CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern,
                           const std::string& replacement);
-    virtual ~CharReplaceCharFilter() = default;
+    ~CharReplaceCharFilter() override = default;
 
     void init(const void* _value, int32_t _length, bool copyData) override;
     int32_t read(const void** start, int32_t min, int32_t max) override;
     int32_t readCopy(void* start, int32_t off, int32_t len) override;
 
+    size_t size() override { return _buf.size(); }
+
 private:
     void fill();
     void process_pattern(std::string& buf);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index c831f086deb..dc4aa0870b6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -329,12 +329,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
                     
get_parser_mode_string_from_properties(_index_meta.properties()),
                     
get_parser_char_filter_map_from_properties(_index_meta.properties()));
             auto analyzer = create_analyzer(inverted_index_ctx.get());
-            auto lowercase = 
get_parser_lowercase_from_properties(_index_meta.properties());
-            if (lowercase == "true") {
-                analyzer->set_lowercase(true);
-            } else if (lowercase == "false") {
-                analyzer->set_lowercase(false);
-            }
+            setup_analyzer_lowercase(analyzer, _index_meta.properties());
+            setup_analyzer_use_stopwords(analyzer, _index_meta.properties());
             inverted_index_ctx->analyzer = analyzer.get();
             auto reader = create_reader(inverted_index_ctx.get(), search_str);
             get_analyse_result(query_info.terms, reader.get(), analyzer.get(), 
column_name,
@@ -423,6 +419,28 @@ InvertedIndexReaderType FullTextIndexReader::type() {
     return InvertedIndexReaderType::FULLTEXT;
 }
 
+void FullTextIndexReader::setup_analyzer_lowercase(
+        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+        const std::map<string, string>& properties) {
+    auto lowercase = get_parser_lowercase_from_properties(properties);
+    if (lowercase == INVERTED_INDEX_PARSER_TRUE) {
+        analyzer->set_lowercase(true);
+    } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) {
+        analyzer->set_lowercase(false);
+    }
+}
+
+void FullTextIndexReader::setup_analyzer_use_stopwords(
+        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+        const std::map<string, string>& properties) {
+    auto stop_words = get_parser_stopwords_from_properties(properties);
+    if (stop_words == "none") {
+        analyzer->set_stopwords(nullptr);
+    } else {
+        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+    }
+}
+
 Status StringTypeInvertedIndexReader::new_iterator(
         OlapReaderStatistics* stats, RuntimeState* runtime_state,
         std::unique_ptr<InvertedIndexIterator>* iterator) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 48450b974ac..8ea430cd1da 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -175,6 +175,11 @@ public:
 
     InvertedIndexReaderType type() override;
 
+    static void 
setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
+                                         const std::map<string, string>& 
properties);
+    static void 
setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& 
analyzer,
+                                             const std::map<string, string>& 
properties);
+
 private:
     Status match_index_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,
                               InvertedIndexQueryType query_type,
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 7774dc0c1dd..bc8fc1b5045 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -224,6 +224,7 @@ public:
                 break;
             }
             setup_analyzer_lowercase(analyzer);
+            setup_analyzer_use_stopwords(analyzer);
             return Status::OK();
         } catch (CLuceneError& e) {
             return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
@@ -233,13 +234,22 @@ public:
 
     void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& 
analyzer) {
         auto lowercase = 
get_parser_lowercase_from_properties<true>(_index_meta->properties());
-        if (lowercase == "true") {
+        if (lowercase == INVERTED_INDEX_PARSER_TRUE) {
             analyzer->set_lowercase(true);
-        } else if (lowercase == "false") {
+        } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) {
             analyzer->set_lowercase(false);
         }
     }
 
+    void 
setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& 
analyzer) {
+        auto stop_words = 
get_parser_stopwords_from_properties(_index_meta->properties());
+        if (stop_words == "none") {
+            analyzer->set_stopwords(nullptr);
+        } else {
+            analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
+        }
+    }
+
     Status init_fulltext_index() {
         RETURN_IF_ERROR(open_index_directory());
         RETURN_IF_ERROR(create_char_string_reader(_char_string_reader));
diff --git a/be/src/vec/functions/function_tokenize.cpp 
b/be/src/vec/functions/function_tokenize.cpp
index ddb509ddf49..e7dc2debe62 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -26,6 +26,7 @@
 #include "CLucene/StdHeader.h"
 #include "CLucene/config/repl_wchar.h"
 #include "olap/inverted_index_parser.h"
+#include "olap/rowset/segment_v2/inverted_index_reader.h"
 #include "vec/columns/column.h"
 #include "vec/common/string_ref.h"
 #include "vec/core/block.h"
@@ -151,6 +152,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* 
/*context*/, Block& block
                 return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
                         "inverted index create analyzer failed: {}", e.what());
             }
+            
doris::segment_v2::FullTextIndexReader::setup_analyzer_lowercase(analyzer, 
properties);
+            
doris::segment_v2::FullTextIndexReader::setup_analyzer_use_stopwords(analyzer,
+                                                                               
  properties);
 
             inverted_index_ctx.analyzer = analyzer.get();
             _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, 
dest_offsets,
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index b57eae7746b..a2b0aa623c4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -52,6 +52,8 @@ public class InvertedIndexUtil {
 
     public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
 
+    public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+
     public static String getInvertedIndexParser(Map<String, String> 
properties) {
         String parser = properties == null ? null : 
properties.get(INVERTED_INDEX_PARSER_KEY);
         // default is "none" if not set
@@ -136,7 +138,8 @@ public class InvertedIndexUtil {
                 INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
                 INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
                 INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
-                INVERTED_INDEX_PARSER_LOWERCASE_KEY
+                INVERTED_INDEX_PARSER_LOWERCASE_KEY,
+                INVERTED_INDEX_PARSER_STOPWORDS_KEY
         ));
 
         for (String key : properties.keySet()) {
@@ -152,6 +155,7 @@ public class InvertedIndexUtil {
         String charFilterPattern = 
properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
         String ignoreAbove = 
properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
         String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+        String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
 
         if (parser != null && 
!parser.matches("none|english|unicode|chinese|standard")) {
             throw new AnalysisException("Invalid inverted index 'parser' 
value: " + parser
@@ -194,5 +198,10 @@ public class InvertedIndexUtil {
             throw new AnalysisException(
                     "Invalid inverted index 'lower_case' value: " + lowerCase 
+ ", lower_case must be true or false");
         }
+
+        if (stopWords != null && !stopWords.matches("none")) {
+            throw new AnalysisException("Invalid inverted index 'stopWords' 
value: " + stopWords
+                    + ", stopWords must be none");
+        }
     }
 }
diff --git a/regression-test/data/inverted_index_p0/test_stopwords.out 
b/regression-test/data/inverted_index_p0/test_stopwords.out
new file mode 100644
index 00000000000..ba4940bcc5b
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_stopwords.out
@@ -0,0 +1,23 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1      华夏智胜新税股票A       华夏智胜新税股票A
+2      Life is like a box of chocolates, you never know what you are going to 
get.     Life is like a box of chocolates, you never know what you are going to 
get. 
+
+-- !sql --
+2      Life is like a box of chocolates, you never know what you are going to 
get.     Life is like a box of chocolates, you never know what you are going to 
get. 
+
+-- !sql --
+2      Life is like a box of chocolates, you never know what you are going to 
get.     Life is like a box of chocolates, you never know what you are going to 
get. 
+
+-- !sql --
+2      Life is like a box of chocolates, you never know what you are going to 
get.     Life is like a box of chocolates, you never know what you are going to 
get. 
+
+-- !sql --
+2      Life is like a box of chocolates, you never know what you are going to 
get.     Life is like a box of chocolates, you never know what you are going to 
get. 
+
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out 
b/regression-test/data/inverted_index_p0/test_tokenize.out
index 96902bef1c0..ae22daffe15 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -31,3 +31,9 @@
 -- !tokenize_sql --
 ["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
 
+-- !tokenize_sql --
+["华", "夏", "智", "胜", "新", "税", "股", "票"]
+
+-- !tokenize_sql --
+["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]
+
diff --git a/regression-test/suites/inverted_index_p0/test_stopwords.groovy 
b/regression-test/suites/inverted_index_p0/test_stopwords.groovy
new file mode 100644
index 00000000000..4f7c577dc57
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_stopwords.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_stopwords", "p0"){
+    def indexTbName = "test_stopwords"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName}"
+
+    sql """
+      CREATE TABLE ${indexTbName} (
+        `a` int(11) NULL COMMENT "",
+        `b` text NULL COMMENT "",
+        `c` text NULL COMMENT "",
+        INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "unicode") 
COMMENT '',
+        INDEX c_idx (`c`) USING INVERTED PROPERTIES("parser" = "unicode", 
"stopwords" = "none") COMMENT ''
+      ) ENGINE=OLAP
+        DUPLICATE KEY(`a`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY RANDOM BUCKETS 1
+        PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    sql """ INSERT INTO ${indexTbName} VALUES (1, "华夏智胜新税股票A", "华夏智胜新税股票A"); 
"""
+    sql """ INSERT INTO ${indexTbName} VALUES (2, "Life is like a box of 
chocolates, you never know what you are going to get. ", "Life is like a box of 
chocolates, you never know what you are going to get. "); """
+
+    try {
+        sql "sync"
+
+        qt_sql """ select * from ${indexTbName} where b match 'a'; """
+        qt_sql """ select * from ${indexTbName} where b match 'are'; """
+        qt_sql """ select * from ${indexTbName} where b match 'to'; """
+
+        qt_sql """ select * from ${indexTbName} where c match 'a'; """
+        qt_sql """ select * from ${indexTbName} where c match 'are'; """
+        qt_sql """ select * from ${indexTbName} where c match 'to'; """
+
+        qt_sql """ select * from ${indexTbName} where b match_phrase 'like a 
box'; """
+        qt_sql """ select * from ${indexTbName} where c match_phrase 'like a 
box'; """
+
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy 
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index bf3d958f8e2..8d7e2dac42e 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -95,4 +95,7 @@ suite("test_tokenize"){
 
     qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 
test:abc=bcd','"parser"="unicode","char_filter_type" = 
"char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = " 
"');"""
     qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 
test:abc=bcd', '"parser"="unicode","char_filter_type" = "char_replace", 
"char_filter_pattern" = "._=:,", "char_filter_replacement" = " "');"""
+
+    qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', 
'"parser"="unicode","stopwords" = "none"');"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch master updated: [opt](inverted index) the "unicode" tokenizer can be configured to disable stop words. (#33982)

Reply via email to