This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 5d576b41d70 [opt](invert index) use lowercase by default #32405 (#32940)
5d576b41d70 is described below
commit 5d576b41d70611c6724a6ea9b87c7b2c489125e8
Author: zzzxl <[email protected]>
AuthorDate: Fri Mar 29 14:37:40 2024 +0800
[opt](invert index) use lowercase by default #32405 (#32940)
---
be/src/olap/compaction.cpp | 41 +++++++++++-
be/src/olap/inverted_index_parser.cpp | 9 ---
be/src/olap/inverted_index_parser.h | 23 ++++++-
.../rowset/segment_v2/inverted_index_writer.cpp | 2 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 3 +-
be/src/olap/tablet_schema.cpp | 8 +++
.../test_index_lowercase_fault_injection.out | 13 ++++
.../data/inverted_index_p0/test_lowercase.out | 6 ++
.../test_index_lowercase_fault_injection.groovy | 76 ++++++++++++++++++++++
9 files changed, 166 insertions(+), 15 deletions(-)
diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index e852344688c..9bedbce11ab 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -460,9 +460,11 @@ Status Compaction::do_compaction_impl(int64_t permits) {
// src index files
// format: rowsetId_segmentId
std::vector<std::string> src_index_files(src_segment_num);
+ std::vector<RowsetId> src_rowset_ids;
for (const auto& m : src_seg_to_id_map) {
std::pair<RowsetId, uint32_t> p = m.first;
src_index_files[m.second] = p.first.to_string() + "_" +
std::to_string(p.second);
+ src_rowset_ids.push_back(p.first);
}
// dest index files
@@ -597,9 +599,36 @@ Status Compaction::do_compaction_impl(int64_t permits) {
}
};
+ Status status = Status::OK();
for (auto&& column_uniq_id : ctx.skip_inverted_index) {
auto col = _cur_tablet_schema->column_by_uid(column_uniq_id);
const auto* index_meta =
_cur_tablet_schema->get_inverted_index(col);
+
+ // if index properties are different, index compaction maybe needs to be skipped.
+ bool is_continue = false;
+ std::optional<std::map<std::string, std::string>>
first_properties;
+ for (const auto& rowset_id : src_rowset_ids) {
+ auto rowset_ptr = _tablet->get_rowset(rowset_id);
+ const auto* tablet_index =
rowset_ptr->tablet_schema()->get_inverted_index(col);
+ const auto& properties = tablet_index->properties();
+ if (!first_properties.has_value()) {
+ first_properties = properties;
+ } else {
+ if (properties != first_properties.value()) {
+ error_handler(index_meta->index_id(),
column_uniq_id);
+ status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(
+ "if index properties are different, index compaction needs to "
+ "be "
+ "skipped.");
+ is_continue = true;
+ break;
+ }
+ }
+ }
+ if (is_continue) {
+ continue;
+ }
+
std::vector<lucene::store::Directory*>
dest_index_dirs(dest_segment_num);
std::vector<lucene::store::Directory*>
src_index_dirs(src_segment_num);
try {
@@ -620,15 +649,21 @@ Status Compaction::do_compaction_impl(int64_t permits) {
fs, index_tmp_path, trans_vec,
dest_segment_num_rows);
if (!st.ok()) {
error_handler(index_meta->index_id(), column_uniq_id);
- return
Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
+ status =
Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
}
} catch (CLuceneError& e) {
error_handler(index_meta->index_id(), column_uniq_id);
- return
Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what());
+ status =
Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what());
}
}
for (auto& inverted_index_file_writer :
inverted_index_file_writers) {
- RETURN_IF_ERROR(inverted_index_file_writer->close());
+ if (Status st = inverted_index_file_writer->close(); !st.ok())
{
+ status =
Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
+ }
+ }
+ // check index compaction status. If status is not ok, we should return error and end this compaction round.
+ if (!status.ok()) {
+ return status;
}
LOG(INFO) << "succeed to do index compaction"
diff --git a/be/src/olap/inverted_index_parser.cpp
b/be/src/olap/inverted_index_parser.cpp
index 17cddc042f0..07a587dd2dd 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -126,13 +126,4 @@ std::string get_parser_ignore_above_value_from_properties(
}
}
-std::string get_parser_lowercase_from_properties(
- const std::map<std::string, std::string>& properties) {
- if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) !=
properties.end()) {
- return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
- } else {
- return "";
- }
-}
-
} // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 4a84823d14c..9df825bf69d 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -21,6 +21,8 @@
#include <memory>
#include <string>
+#include "util/debug_points.h"
+
namespace lucene {
namespace analysis {
class Analyzer;
@@ -49,6 +51,9 @@ struct InvertedIndexCtx {
using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
+const std::string INVERTED_INDEX_PARSER_TRUE = "true";
+const std::string INVERTED_INDEX_PARSER_FALSE = "false";
+
const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
@@ -90,6 +95,22 @@ CharFilterMap get_parser_char_filter_map_from_properties(
// get parser ignore_above value from properties
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);
+
+template <bool ReturnTrue = false>
std::string get_parser_lowercase_from_properties(
- const std::map<std::string, std::string>& properties);
+ const std::map<std::string, std::string>& properties) {
+ if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) !=
properties.end()) {
+ return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+ } else {
+
DBUG_EXECUTE_IF("inverted_index_parser.get_parser_lowercase_from_properties",
+ { return ""; })
+
+ if constexpr (ReturnTrue) {
+ return INVERTED_INDEX_PARSER_TRUE;
+ } else {
+ return "";
+ }
+ }
+}
+
} // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 54f3feb3c5d..8b1ae50433c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -222,7 +222,7 @@ public:
}
void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>&
analyzer) {
- auto lowercase =
get_parser_lowercase_from_properties(_index_meta->properties());
+ auto lowercase =
get_parser_lowercase_from_properties<true>(_index_meta->properties());
if (lowercase == "true") {
analyzer->set_lowercase(true);
} else if (lowercase == "false") {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 67a5b9393e0..dc692fa9bc0 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1275,9 +1275,10 @@ Status SegmentIterator::_init_inverted_index_iterators()
{
}
for (auto cid : _schema->column_ids()) {
if (_inverted_index_iterators[cid] == nullptr) {
+ // Use segment’s own index_meta, for compatibility with future indexing needs to default to lowercase.
RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
_opts.tablet_schema->column(cid),
-
_opts.tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
+
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
_opts, &_inverted_index_iterators[cid]));
}
}
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index b0410a2b341..0e9376e09cf 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -799,6 +799,14 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const
{
(*index->mutable_properties())[kv.first] = kv.second;
}
index->set_index_suffix_name(_escaped_index_suffix_path);
+
+ DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })
+
+ // lowercase by default
+ if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
+ (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
+ INVERTED_INDEX_PARSER_TRUE;
+ }
}
void TabletSchema::append_column(TabletColumn column, ColumnType col_type) {
diff --git
a/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out
b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out
new file mode 100644
index 00000000000..196077986ec
--- /dev/null
+++
b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+5
+
+-- !sql --
+0
+
+-- !sql --
+8
+
+-- !sql --
+3
+
diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out
b/regression-test/data/inverted_index_p0/test_lowercase.out
index 03c2f57468f..2ca46501026 100644
--- a/regression-test/data/inverted_index_p0/test_lowercase.out
+++ b/regression-test/data/inverted_index_p0/test_lowercase.out
@@ -31,11 +31,17 @@
-- !sql --
1 hello 我来到北京清华大学
+2 HELLO 我爱你中国
+3 Hello 人民可以得到更多实惠
-- !sql --
+1 hello 我来到北京清华大学
2 HELLO 我爱你中国
+3 Hello 人民可以得到更多实惠
-- !sql --
+1 hello 我来到北京清华大学
+2 HELLO 我爱你中国
3 Hello 人民可以得到更多实惠
-- !sql --
diff --git
a/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy
b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy
new file mode 100644
index 00000000000..0f522652bb4
--- /dev/null
+++
b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_lowercase_fault_injection") {
+ // define a sql table
+ def testTable = "httplogs_lowercase"
+
+ def create_httplogs_unique_table = {testTablex ->
+ // multi-line sql
+ def result = sql """
+ CREATE TABLE ${testTablex} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` string NULL COMMENT "",
+ `request` string NULL COMMENT "",
+ `status` string NULL COMMENT "",
+ `size` string NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"chinese", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ }
+
+ try {
+ sql "DROP TABLE IF EXISTS ${testTable}"
+ create_httplogs_unique_table.call(testTable)
+
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties")
+ GetDebugPoint().enableDebugPointForAllBEs("tablet_schema.to_schema_pb")
+
+ sql """ INSERT INTO ${testTable} VALUES (893964617, '40.135.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+ sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+ sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 304, 0); """
+ sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 304, 0); """
+ sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+
+ sql 'sync'
+ } finally {
+
GetDebugPoint().disableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties")
+
GetDebugPoint().disableDebugPointForAllBEs("tablet_schema.to_schema_pb")
+ }
+
+ qt_sql """ select count() from ${testTable} where (request match
'HTTP'); """
+ qt_sql """ select count() from ${testTable} where (request match
'http'); """
+
+ sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 304, 0); """
+ sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 304, 0); """
+ sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+
+ sql 'sync'
+
+ qt_sql """ select count() from ${testTable} where (request match
'HTTP'); """
+ qt_sql """ select count() from ${testTable} where (request match
'http'); """
+ } finally {
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]