This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d42fd68d6b8 [opt](invert index) Empty strings are not written to the
index in the case of TOKENIZED (#28822)
d42fd68d6b8 is described below
commit d42fd68d6b870b217322bd143767772df1110d05
Author: zzzxl <[email protected]>
AuthorDate: Mon Dec 25 10:23:07 2023 +0800
[opt](invert index) Empty strings are not written to the index in the case
of TOKENIZED (#28822)
---
.../rowset/segment_v2/inverted_index_writer.cpp | 23 ++++------
.../inverted_index_p0/test_index_empty_string.out | 7 +++
.../test_index_empty_string.groovy | 53 ++++++++++++++++++++++
3 files changed, 68 insertions(+), 15 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index aab2a5a73f0..c2cc0bbbefa 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -258,7 +258,6 @@ public:
}
for (int i = 0; i < count; ++i) {
- new_fulltext_field(empty_value.c_str(), 0);
RETURN_IF_ERROR(add_null_document());
}
}
@@ -305,13 +304,10 @@ public:
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
- // only ignore_above UNTOKENIZED strings
- if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
- v->get_size() > ignore_above) {
- VLOG_DEBUG << "fulltext index value length can be at most "
- << ignore_above_value << ", but got "
- << "value length:" << v->get_size() << ", ignore this value";
- new_fulltext_field(empty_value.c_str(), 0);
+ // only ignore_above UNTOKENIZED strings and empty strings not tokenized
+ if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
+ v->get_size() > ignore_above) ||
+ (_parser_type != InvertedIndexParserType::PARSER_NONE && v->empty())) {
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(v->get_data(), v->get_size());
@@ -358,13 +354,10 @@ public:
}
auto value = join(strings, " ");
- // only ignore_above UNTOKENIZED strings
- if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
- value.length() > ignore_above) {
- VLOG_DEBUG << "fulltext index value length can be at most "
- << ignore_above_value << ", but got "
- << "value length:" << value.length() << ", ignore this value";
- new_fulltext_field(empty_value.c_str(), 0);
+ // only ignore_above UNTOKENIZED strings and empty strings not tokenized
+ if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
+ value.length() > ignore_above) ||
+ (_parser_type != InvertedIndexParserType::PARSER_NONE && value.empty())) {
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(value.c_str(), value.length());
diff --git a/regression-test/data/inverted_index_p0/test_index_empty_string.out b/regression-test/data/inverted_index_p0/test_index_empty_string.out
new file mode 100644
index 00000000000..70b6b599194
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_empty_string.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1
+
+-- !sql --
+0
+
diff --git a/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy b/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy
new file mode 100644
index 00000000000..2cf1d844d2b
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_empty_string.groovy
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_empty_string", "p0"){
+ def timeout = 60000
+ def delta_time = 1000
+ def alter_res = "null"
+ def useTime = 0
+
+ def indexTblName = "test_index_empty_string"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName}"
+ // create 1 replica table
+ sql """
+ CREATE TABLE IF NOT EXISTS ${indexTblName}(
+ `id` int(11) NOT NULL,
+ `a` text NULL DEFAULT "",
+ `b` text NULL DEFAULT "",
+ INDEX a_idx(`a`) USING INVERTED COMMENT '',
+ INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "english") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES(
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ INSERT INTO $indexTblName VALUES
+ (1, '', '1'),
+ (2, '2', '');
+ """
+
+ qt_sql "SELECT count() FROM $indexTblName WHERE a match '';"
+ qt_sql "SELECT count() FROM $indexTblName WHERE b match '';"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]