This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a42538291a7 [fix](inverted index)Support Chinese column name with
inverted index (#36321)
a42538291a7 is described below
commit a42538291a7a2d2cd3281d5b1e324065c28660f2
Author: qiye <[email protected]>
AuthorDate: Sun Jun 16 10:02:14 2024 +0800
[fix](inverted index)Support Chinese column name with inverted index
(#36321)
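1. Building a `std::wstring` from a `std::string` iterator range
(`std::wstring(s.begin(), s.end())`) widens each byte separately, so it is
only correct for ASCII characters. Names containing non-ASCII characters
(e.g. Chinese column names) must be converted with
`StringUtil::string_to_wstring` instead.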
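2. Fix the index_tool `check_terms_stats_v2` operation so that it takes the
index id from `--idx_id` instead of hard-coding 1, and print the field name
alongside each term in the term statistics output.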
Issue Number: #34118
---
be/src/index-tools/index_tool.cpp | 12 ++++--
.../rowset/segment_v2/inverted_index_reader.cpp | 6 +--
.../rowset/segment_v2/inverted_index_writer.cpp | 2 +-
.../test_index_chinese_column.out | 7 ++++
.../test_index_chinese_column.groovy | 44 ++++++++++++++++++++++
5 files changed, 64 insertions(+), 7 deletions(-)
diff --git a/be/src/index-tools/index_tool.cpp
b/be/src/index-tools/index_tool.cpp
index 5f49bc268f4..d729cc3ff97 100644
--- a/be/src/index-tools/index_tool.cpp
+++ b/be/src/index-tools/index_tool.cpp
@@ -101,6 +101,9 @@ std::string get_usage(const std::string& progname) {
"--trans_vec_file=path/to/file\n";
ss << "./index_tool --operation=write_index_v2
--idx_file_path=path/to/index "
"--data_file_path=data/to/index\n";
+ ss << "./index_tool --operation=show_nested_files_v2
--idx_file_path=path/to/file\n";
+ ss << "./index_tool --operation=check_terms_stats_v2
--idx_file_path=path/to/file "
+ "--idx_id=index_id\n";
return ss.str();
}
@@ -205,7 +208,10 @@ void check_terms_stats(lucene::store::Directory* dir) {
/* empty */
std::string token =
lucene_wcstoutf8string(te->term(false)->text(),
te->term(false)->textLength());
+ std::string field = lucene_wcstoutf8string(te->term(false)->field(),
+
lenOfString(te->term(false)->field()));
+ printf("Field: %s ", field.c_str());
printf("Term: %s ", token.c_str());
printf("Freq: %d\n", te->docFreq());
if (FLAGS_print_doc_id) {
@@ -557,7 +563,7 @@ int main(int argc, char** argv) {
auto field_config = (int32_t)(lucene::document::Field::STORE_NO);
field_config |= (int32_t)(lucene::document::Field::INDEX_NONORMS);
field_config |= lucene::document::Field::INDEX_TOKENIZED;
- auto field_name = std::wstring(name.begin(), name.end());
+ auto field_name = StringUtil::string_to_wstring(name);
auto field = _CLNEW lucene::document::Field(field_name.c_str(),
field_config);
field->setOmitTermFreqAndPositions(false);
doc->add(*field);
@@ -632,7 +638,7 @@ int main(int argc, char** argv) {
std::cerr << "error occurred when show files: " << err.what() <<
std::endl;
}
} else if (FLAGS_operation == "check_terms_stats_v2") {
- if (FLAGS_idx_file_path == "") {
+ if (FLAGS_idx_file_path == "" || FLAGS_idx_id <= 0) {
std::cout << "no file flag for check " << std::endl;
return -1;
}
@@ -647,7 +653,7 @@ int main(int argc, char** argv) {
return -1;
}
std::vector<std::string> files;
- int64_t index_id = 1;
+ int64_t index_id = FLAGS_idx_id;
std::string index_suffix = "";
doris::TabletIndexPB index_pb;
index_pb.set_index_id(index_id);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 35286088a57..3639bff05c4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -164,7 +164,7 @@ void
InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r
bool drop_duplicates) {
analyse_result.clear();
- std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
+ std::wstring field_ws = StringUtil::string_to_wstring(field_name);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
analyzer->tokenStream(field_ws.c_str(), reader));
@@ -353,7 +353,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
}
std::unique_ptr<lucene::search::Query> query;
- query_info.field_name = std::wstring(column_name.begin(),
column_name.end());
+ query_info.field_name = StringUtil::string_to_wstring(column_name);
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
@@ -464,7 +464,7 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
// std::string search_str = reinterpret_cast<const
StringRef*>(query_value)->to_string();
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " <<
search_str;
- std::wstring column_name_ws = std::wstring(column_name.begin(),
column_name.end());
+ std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
// unique_ptr with custom deleter
std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index cc2c89bb116..9f51098a052 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -88,7 +88,7 @@ public:
_parser_type = get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(_index_meta->properties()));
_value_key_coder = get_key_coder(field_type);
- _field_name = std::wstring(field_name.begin(), field_name.end());
+ _field_name = StringUtil::string_to_wstring(field_name);
}
~InvertedIndexColumnWriterImpl() override {
diff --git
a/regression-test/data/inverted_index_p0/test_index_chinese_column.out
b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
new file mode 100644
index 00000000000..8b3ebab527e
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 json love anny json anny 2023-10-10T12:11:11
+
+-- !sql --
+1 json love anny json anny 2023-10-10T12:11:11
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
new file mode 100644
index 00000000000..21a94e1ffef
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_index_chinese_column", "inverted_index_select"){
+ def createAndInsertData = { table_name, inverted_index_storage_format ->
+ sql "DROP TABLE IF EXISTS ${table_name}"
+ sql """
+ CREATE TABLE ${table_name}
+ (
+ k1 int ,
+ 名称 string,
+ k3 char(50),
+ k4 varchar(200),
+ k5 datetime,
+ index index_str_k2 (`名称`) using inverted
properties("parser"="english","ignore_above"="257")
+ )
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES("replication_num" = "1","inverted_index_storage_format"
= "${inverted_index_storage_format}")
+ """
+ sql " insert into ${table_name} values(1, 'json love anny', 'json',
'anny', '2023-10-10 12:11:11') "
+ qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'"
+ }
+
+ def table_name_v1 = "test_index_chinese_column_v1"
+ def table_name_v2 = "test_index_chinese_column_v2"
+
+ sql "set enable_unicode_name_support=true"
+
+ createAndInsertData(table_name_v1, "V1")
+ createAndInsertData(table_name_v2, "V2")
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]