This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d42fd68d6b8 [opt](invert index) Empty strings are not written to the index in the case of TOKENIZED (#28822)
d42fd68d6b8 is described below

commit d42fd68d6b870b217322bd143767772df1110d05
Author: zzzxl <[email protected]>
AuthorDate: Mon Dec 25 10:23:07 2023 +0800

    [opt](invert index) Empty strings are not written to the index in the case of TOKENIZED (#28822)
---
 .../rowset/segment_v2/inverted_index_writer.cpp    | 23 ++++------
 .../inverted_index_p0/test_index_empty_string.out  |  7 +++
 .../test_index_empty_string.groovy                 | 53 ++++++++++++++++++++++
 3 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index aab2a5a73f0..c2cc0bbbefa 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -258,7 +258,6 @@ public:
             }
 
             for (int i = 0; i < count; ++i) {
-                new_fulltext_field(empty_value.c_str(), 0);
                 RETURN_IF_ERROR(add_null_document());
             }
         }
@@ -305,13 +304,10 @@ public:
                     
get_parser_ignore_above_value_from_properties(_index_meta->properties());
             auto ignore_above = std::stoi(ignore_above_value);
             for (int i = 0; i < count; ++i) {
-                // only ignore_above UNTOKENIZED strings
-                if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
-                    v->get_size() > ignore_above) {
-                    VLOG_DEBUG << "fulltext index value length can be at most "
-                               << ignore_above_value << ", but got "
-                               << "value length:" << v->get_size() << ", 
ignore this value";
-                    new_fulltext_field(empty_value.c_str(), 0);
+                // only ignore_above UNTOKENIZED strings and empty strings not 
tokenized
+                if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
+                     v->get_size() > ignore_above) ||
+                    (_parser_type != InvertedIndexParserType::PARSER_NONE && 
v->empty())) {
                     RETURN_IF_ERROR(add_null_document());
                 } else {
                     new_fulltext_field(v->get_data(), v->get_size());
@@ -358,13 +354,10 @@ public:
                 }
 
                 auto value = join(strings, " ");
-                // only ignore_above UNTOKENIZED strings
-                if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
-                    value.length() > ignore_above) {
-                    VLOG_DEBUG << "fulltext index value length can be at most "
-                               << ignore_above_value << ", but got "
-                               << "value length:" << value.length() << ", 
ignore this value";
-                    new_fulltext_field(empty_value.c_str(), 0);
+                // only ignore_above UNTOKENIZED strings and empty strings not 
tokenized
+                if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
+                     value.length() > ignore_above) ||
+                    (_parser_type != InvertedIndexParserType::PARSER_NONE && 
value.empty())) {
                     RETURN_IF_ERROR(add_null_document());
                 } else {
                     new_fulltext_field(value.c_str(), value.length());
diff --git a/regression-test/data/inverted_index_p0/test_index_empty_string.out 
b/regression-test/data/inverted_index_p0/test_index_empty_string.out
new file mode 100644
index 00000000000..70b6b599194
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_empty_string.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+1
+
+-- !sql --
+0
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy 
b/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy
new file mode 100644
index 00000000000..2cf1d844d2b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+// Regression suite "test_index_empty_string" (group "p0").
+// Creates a table with two inverted-indexed text columns, stores an empty
+// string in each of them, and counts the rows returned by `match ''`.
+suite("test_index_empty_string", "p0"){
+    // NOTE(review): these four locals are not referenced anywhere else in this suite.
+    def timeout = 60000
+    def delta_time = 1000
+    def alter_res = "null"
+    def useTime = 0
+
+    // Table name interpolated into every SQL statement below.
+    def indexTblName = "test_index_empty_string"
+
+    // Start from a clean slate so reruns do not see rows from a previous run.
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    // create 1 replica table
+    // Column `a` gets an inverted index with no "parser" property;
+    // column `b` gets an inverted index with "parser" = "english".
+    sql """
+       CREATE TABLE IF NOT EXISTS ${indexTblName}(
+           `id` int(11) NOT NULL,
+        `a` text NULL DEFAULT "",
+        `b` text NULL DEFAULT "",
+        INDEX a_idx(`a`) USING INVERTED COMMENT '',
+        INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "english") 
COMMENT ''
+       ) ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT 'OLAP'
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES(
+           "replication_allocation" = "tag.location.default: 1"
+       );
+    """
+    
+    // Two rows: row 1 has an empty `a`, row 2 has an empty `b`.
+    sql """ 
+        INSERT INTO $indexTblName VALUES 
+        (1, '', '1'), 
+        (2, '2', '');
+    """ 
+
+    // Count rows matching the empty string on each indexed column.
+    // The accompanying test_index_empty_string.out records 1 and then 0
+    // for its two sql result sections.
+    qt_sql "SELECT count() FROM $indexTblName WHERE a match '';"
+    qt_sql "SELECT count() FROM $indexTblName WHERE b match '';"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to