This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 90b4e127e3 [Feature](inverted index) add parser_mode properties for
inverted index parser (#20116)
90b4e127e3 is described below
commit 90b4e127e3e53cea3b1620e2c205cd0f00ffe7ff
Author: airborne12 <[email protected]>
AuthorDate: Mon May 29 23:21:52 2023 +0800
[Feature](inverted index) add parser_mode properties for inverted index
parser (#20116)
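This commit adds a `parser_mode` property for the inverted index parser.
Supported values are `fine_grained` (the default when the property is
omitted) and `coarse_grained`. Example usage: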
```
CREATE TABLE `inverted` (
`FIELD0` text NULL,
`FIELD1` text NULL,
`FIELD2` text NULL,
`FIELD3` text NULL,
INDEX idx_name1 (`FIELD0`) USING INVERTED PROPERTIES("parser" =
"chinese", "parser_mode" = "fine_grained") COMMENT '',
INDEX idx_name2 (`FIELD1`) USING INVERTED PROPERTIES("parser" =
"chinese", "parser_mode" = "coarse_grained") COMMENT ''
) ENGINE=OLAP;
```
---
be/src/clucene | 2 +-
be/src/olap/inverted_index_parser.cpp | 8 +++
be/src/olap/inverted_index_parser.h | 6 ++
be/src/olap/rowset/segment_v2/column_reader.cpp | 10 +--
.../rowset/segment_v2/inverted_index_reader.cpp | 74 ++++++++++---------
.../olap/rowset/segment_v2/inverted_index_reader.h | 63 +++++++----------
.../rowset/segment_v2/inverted_index_writer.cpp | 14 ++--
be/src/olap/tablet_schema.h | 8 +++
.../inverted_index_p0/test_chinese_analyzer.out | 35 +++++++++
.../inverted_index_p0/test_chinese_analyzer.groovy | 82 ++++++++++++++++++++++
10 files changed, 217 insertions(+), 85 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 3e493ab995..6033b8c33c 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 3e493ab99573cea5a7ed57f52d8fd9e03e2c17a9
+Subproject commit 6033b8c33c08fd45575d2799f93973d9ebd032ea
diff --git a/be/src/olap/inverted_index_parser.cpp
b/be/src/olap/inverted_index_parser.cpp
index 9407b52ee4..e920a4a930 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -62,4 +62,12 @@ std::string get_parser_string_from_properties(
}
}
+std::string get_parser_mode_string_from_properties(
+ const std::map<std::string, std::string>& properties) {
+ if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) {
+ return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
+ } else {
+ return INVERTED_INDEX_PARSER_FINE_GRANULARITY;
+ }
+}
} // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 0c870aa355..d36950e514 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -30,6 +30,10 @@ enum class InvertedIndexParserType {
PARSER_CHINESE = 4,
};
+const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
+const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
+const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
+
const std::string INVERTED_INDEX_PARSER_KEY = "parser";
const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
const std::string INVERTED_INDEX_PARSER_NONE = "none";
@@ -42,5 +46,7 @@ std::string
inverted_index_parser_type_to_string(InvertedIndexParserType parser_
InvertedIndexParserType get_inverted_index_parser_type_from_string(const
std::string& parser_str);
std::string get_parser_string_from_properties(const std::map<std::string,
std::string>& properties);
+std::string get_parser_mode_string_from_properties(
+ const std::map<std::string, std::string>& properties);
} // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index e55fc2a595..ed21161b71 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -230,7 +230,7 @@ Status ColumnReader::new_inverted_index_iterator(const
TabletIndex* index_meta,
InvertedIndexIterator**
iterator) {
RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta));
if (_inverted_index) {
- RETURN_IF_ERROR(_inverted_index->new_iterator(index_meta, stats,
iterator));
+ RETURN_IF_ERROR(_inverted_index->new_iterator(stats, iterator));
}
return Status::OK();
}
@@ -479,15 +479,15 @@ Status ColumnReader::_load_inverted_index_index(const
TabletIndex* index_meta) {
if (is_string_type(type)) {
if (parser_type != InvertedIndexParserType::PARSER_NONE) {
_inverted_index.reset(new FullTextIndexReader(
- _file_reader->fs(), _file_reader->path().native(),
index_meta->index_id()));
+ _file_reader->fs(), _file_reader->path().native(),
index_meta));
return Status::OK();
} else {
_inverted_index.reset(new StringTypeInvertedIndexReader(
- _file_reader->fs(), _file_reader->path().native(),
index_meta->index_id()));
+ _file_reader->fs(), _file_reader->path().native(),
index_meta));
}
} else if (is_numeric_type(type)) {
- _inverted_index.reset(new BkdIndexReader(_file_reader->fs(),
_file_reader->path().native(),
- index_meta->index_id()));
+ _inverted_index.reset(
+ new BkdIndexReader(_file_reader->fs(),
_file_reader->path().native(), index_meta));
} else {
_inverted_index.reset();
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 97a87c4309..544620e68f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -92,8 +92,8 @@ Status
InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach
// try to get query bitmap result from cache and return immediately on
cache hit
io::Path path(_path);
auto index_dir = path.parent_path();
- auto index_file_name =
- InvertedIndexDescriptor::get_index_file_name(path.filename(),
_index_id);
+ auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(path.filename(),
+
_index_meta.index_id());
auto index_file_path = index_dir / index_file_name;
InvertedIndexQueryCache::CacheKey cache_key {
index_file_path, "", InvertedIndexQueryType::UNKNOWN_QUERY,
L"null_bitmap"};
@@ -140,11 +140,13 @@ Status
InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach
}
std::vector<std::wstring> FullTextIndexReader::get_analyse_result(
- const std::wstring& field_name, const std::string& value,
InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type) {
+ const std::wstring& field_name, const std::string& value,
+ InvertedIndexQueryType query_type) {
std::vector<std::wstring> analyse_result;
std::shared_ptr<lucene::analysis::Analyzer> analyzer;
std::unique_ptr<lucene::util::Reader> reader;
+ auto analyser_type = get_inverted_index_parser_type_from_string(
+ get_parser_string_from_properties(_index_meta.properties()));
if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
reader.reset(
@@ -153,10 +155,18 @@ std::vector<std::wstring>
FullTextIndexReader::get_analyse_result(
auto chinese_analyzer =
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
chinese_analyzer->initDict(config::inverted_index_dict_path);
+ auto mode =
get_parser_mode_string_from_properties(_index_meta.properties());
+ if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+ } else {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+ }
analyzer = chinese_analyzer;
- reader.reset(new lucene::util::SimpleInputStreamReader(
- new lucene::util::AStringReader(value.c_str()),
- lucene::util::SimpleInputStreamReader::UTF8));
+ reader.reset(_CLNEW lucene::util::SStringReader<char>(value.c_str(),
strlen(value.c_str()),
+ false));
+ //reader.reset(new lucene::util::SimpleInputStreamReader(
+ // new lucene::util::AStringReader(value.c_str()),
+ // lucene::util::SimpleInputStreamReader::UTF8));
} else {
// default
analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
@@ -189,15 +199,14 @@ std::vector<std::wstring>
FullTextIndexReader::get_analyse_result(
return analyse_result;
}
-Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta,
OlapReaderStatistics* stats,
+Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats,
InvertedIndexIterator** iterator) {
- *iterator = new InvertedIndexIterator(index_meta, stats, this);
+ *iterator = new InvertedIndexIterator(stats, this);
return Status::OK();
}
Status FullTextIndexReader::query(OlapReaderStatistics* stats, const
std::string& column_name,
const void* query_value,
InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type,
roaring::Roaring* bit_map) {
SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
@@ -207,14 +216,16 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
io::Path path(_path);
auto index_dir = path.parent_path();
- auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+ auto index_file_name =
+ InvertedIndexDescriptor::get_index_file_name(path.filename(),
_index_meta.index_id());
auto index_file_path = index_dir / index_file_name;
std::unique_ptr<lucene::search::Query> query;
std::wstring field_ws = std::wstring(column_name.begin(),
column_name.end());
+
try {
std::vector<std::wstring> analyse_result =
- get_analyse_result(field_ws, search_str, query_type,
analyser_type);
+ get_analyse_result(field_ws, search_str, query_type);
if (analyse_result.empty()) {
LOG(WARNING) << "invalid input query_str: " << search_str
@@ -332,17 +343,15 @@ InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}
-Status StringTypeInvertedIndexReader::new_iterator(const TabletIndex*
index_meta,
- OlapReaderStatistics* stats,
+Status StringTypeInvertedIndexReader::new_iterator(OlapReaderStatistics* stats,
InvertedIndexIterator**
iterator) {
- *iterator = new InvertedIndexIterator(index_meta, stats, this);
+ *iterator = new InvertedIndexIterator(stats, this);
return Status::OK();
}
Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
const std::string& column_name,
const void* query_value,
InvertedIndexQueryType query_type,
- InvertedIndexParserType
analyser_type,
roaring::Roaring* bit_map) {
SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
@@ -362,7 +371,8 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
io::Path path(_path);
auto index_dir = path.parent_path();
- auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+ auto index_file_name =
+ InvertedIndexDescriptor::get_index_file_name(path.filename(),
_index_meta.index_id());
auto index_file_path = index_dir / index_file_name;
// try to get query bitmap result from cache and return immediately on
cache hit
@@ -451,12 +461,12 @@ InvertedIndexReaderType
StringTypeInvertedIndexReader::type() {
}
BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
- const uint32_t uniq_id)
- : InvertedIndexReader(fs, path, uniq_id), _compoundReader(nullptr) {
+ const TabletIndex* index_meta)
+ : InvertedIndexReader(fs, path, index_meta), _compoundReader(nullptr) {
io::Path io_path(_path);
auto index_dir = io_path.parent_path();
- auto index_file_name =
- InvertedIndexDescriptor::get_index_file_name(io_path.filename(),
_index_id);
+ auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(io_path.filename(),
+
index_meta->index_id());
// check index file existence
auto index_file = index_dir / index_file_name;
@@ -469,9 +479,8 @@ BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const
std::string& path,
config::inverted_index_read_buffer_size);
}
-Status BkdIndexReader::new_iterator(const TabletIndex* index_meta,
OlapReaderStatistics* stats,
- InvertedIndexIterator** iterator) {
- *iterator = new InvertedIndexIterator(index_meta, stats, this);
+Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats,
InvertedIndexIterator** iterator) {
+ *iterator = new InvertedIndexIterator(stats, this);
return Status::OK();
}
@@ -511,7 +520,7 @@ Status BkdIndexReader::bkd_query(OlapReaderStatistics*
stats, const std::string&
Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const
std::string& column_name,
const void* query_value,
InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type,
uint32_t* count) {
+ uint32_t* count) {
uint64_t start = UnixMillis();
auto visitor = std::make_unique<InvertedIndexVisitor>(nullptr, query_type,
true);
std::shared_ptr<lucene::util::bkd::bkd_reader> r;
@@ -537,12 +546,13 @@ Status BkdIndexReader::try_query(OlapReaderStatistics*
stats, const std::string&
Status BkdIndexReader::query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type,
roaring::Roaring* bit_map) {
+ roaring::Roaring* bit_map) {
SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
io::Path path(_path);
auto index_dir = path.parent_path();
- auto index_file_name =
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+ auto index_file_name =
+ InvertedIndexDescriptor::get_index_file_name(path.filename(),
_index_meta.index_id());
auto index_file_path = index_dir / index_file_name;
// std::string query_str {(const char *)query_value};
@@ -820,8 +830,7 @@ Status
InvertedIndexIterator::read_from_inverted_index(const std::string& column
}
}
- RETURN_IF_ERROR(
- _reader->query(_stats, column_name, query_value, query_type,
_analyser_type, bit_map));
+ RETURN_IF_ERROR(_reader->query(_stats, column_name, query_value,
query_type, bit_map));
return Status::OK();
}
@@ -835,16 +844,11 @@ Status
InvertedIndexIterator::try_read_from_inverted_index(const std::string& co
query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY ||
query_type == InvertedIndexQueryType::LESS_THAN_QUERY ||
query_type == InvertedIndexQueryType::EQUAL_QUERY) {
- RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value,
query_type,
- _analyser_type, count));
+ RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value,
query_type, count));
}
return Status::OK();
}
-InvertedIndexParserType
InvertedIndexIterator::get_inverted_index_analyser_type() const {
- return _analyser_type;
-}
-
InvertedIndexReaderType
InvertedIndexIterator::get_inverted_index_reader_type() const {
return _reader->type();
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index a9e263357d..80c653f418 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -78,19 +78,18 @@ enum class InvertedIndexQueryType {
class InvertedIndexReader {
public:
explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string&
path,
- const uint32_t index_id)
- : _fs(std::move(fs)), _path(path), _index_id(index_id) {}
+ const TabletIndex* index_meta)
+ : _fs(std::move(fs)), _path(path), _index_meta(*index_meta) {}
virtual ~InvertedIndexReader() = default;
// create a new column iterator. Client should delete returned iterator
- virtual Status new_iterator(const TabletIndex* index_meta,
OlapReaderStatistics* stats,
- InvertedIndexIterator** iterator) = 0;
+ virtual Status new_iterator(OlapReaderStatistics* stats,
InvertedIndexIterator** iterator) = 0;
virtual Status query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type,
roaring::Roaring* bit_map) = 0;
+ roaring::Roaring* bit_map) = 0;
virtual Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type, uint32_t*
count) = 0;
+ uint32_t* count) = 0;
Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle,
lucene::store::Directory* dir = nullptr);
@@ -98,56 +97,53 @@ public:
virtual InvertedIndexReaderType type() = 0;
bool indexExists(io::Path& index_file_path);
- uint32_t get_index_id() const { return _index_id; }
+ uint32_t get_index_id() const { return _index_meta.index_id(); }
protected:
bool _is_match_query(InvertedIndexQueryType query_type);
friend class InvertedIndexIterator;
io::FileSystemSPtr _fs;
std::string _path;
- uint32_t _index_id;
+ TabletIndex _index_meta;
};
class FullTextIndexReader : public InvertedIndexReader {
public:
explicit FullTextIndexReader(io::FileSystemSPtr fs, const std::string&
path,
- const int64_t uniq_id)
- : InvertedIndexReader(std::move(fs), path, uniq_id) {}
+ const TabletIndex* index_meta)
+ : InvertedIndexReader(std::move(fs), path, index_meta) {}
~FullTextIndexReader() override = default;
- Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics*
stats,
- InvertedIndexIterator** iterator) override;
+ Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator**
iterator) override;
Status query(OlapReaderStatistics* stats, const std::string& column_name,
const void* query_value, InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type, roaring::Roaring*
bit_map) override;
+ roaring::Roaring* bit_map) override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type, uint32_t* count)
override {
+ uint32_t* count) override {
return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>();
}
InvertedIndexReaderType type() override;
std::vector<std::wstring> get_analyse_result(const std::wstring&
field_name,
const std::string& value,
- InvertedIndexQueryType
query_type,
- InvertedIndexParserType
analyser_type);
+ InvertedIndexQueryType
query_type);
};
class StringTypeInvertedIndexReader : public InvertedIndexReader {
public:
explicit StringTypeInvertedIndexReader(io::FileSystemSPtr fs, const
std::string& path,
- const int64_t uniq_id)
- : InvertedIndexReader(std::move(fs), path, uniq_id) {}
+ const TabletIndex* index_meta)
+ : InvertedIndexReader(std::move(fs), path, index_meta) {}
~StringTypeInvertedIndexReader() override = default;
- Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics*
stats,
- InvertedIndexIterator** iterator) override;
+ Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator**
iterator) override;
Status query(OlapReaderStatistics* stats, const std::string& column_name,
const void* query_value, InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type, roaring::Roaring*
bit_map) override;
+ roaring::Roaring* bit_map) override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type, uint32_t* count)
override {
+ uint32_t* count) override {
return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>();
}
InvertedIndexReaderType type() override;
@@ -189,7 +185,8 @@ public:
class BkdIndexReader : public InvertedIndexReader {
public:
- explicit BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
const uint32_t uniq_id);
+ explicit BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
+ const TabletIndex* index_meta);
~BkdIndexReader() override {
if (_compoundReader != nullptr) {
_compoundReader->close();
@@ -198,15 +195,14 @@ public:
}
}
- Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics*
stats,
- InvertedIndexIterator** iterator) override;
+ Status new_iterator(OlapReaderStatistics* stats, InvertedIndexIterator**
iterator) override;
Status query(OlapReaderStatistics* stats, const std::string& column_name,
const void* query_value, InvertedIndexQueryType query_type,
- InvertedIndexParserType analyser_type, roaring::Roaring*
bit_map) override;
+ roaring::Roaring* bit_map) override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
- InvertedIndexParserType analyser_type, uint32_t* count)
override;
+ uint32_t* count) override;
Status bkd_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
std::shared_ptr<lucene::util::bkd::bkd_reader>& r,
@@ -223,13 +219,8 @@ private:
class InvertedIndexIterator {
public:
- InvertedIndexIterator(const TabletIndex* index_meta, OlapReaderStatistics*
stats,
- InvertedIndexReader* reader)
- : _index_meta(index_meta), _stats(stats), _reader(reader) {
- // TODO xk maybe change interface to use index
- _analyser_type = get_inverted_index_parser_type_from_string(
- get_parser_string_from_properties(_index_meta->properties()));
- }
+ InvertedIndexIterator(OlapReaderStatistics* stats, InvertedIndexReader*
reader)
+ : _stats(stats), _reader(reader) {}
Status read_from_inverted_index(const std::string& column_name, const
void* query_value,
InvertedIndexQueryType query_type,
uint32_t segment_num_rows,
@@ -242,15 +233,11 @@ public:
return _reader->read_null_bitmap(cache_handle, dir);
}
- InvertedIndexParserType get_inverted_index_analyser_type() const;
-
InvertedIndexReaderType get_inverted_index_reader_type() const;
private:
- const TabletIndex* _index_meta;
OlapReaderStatistics* _stats;
InvertedIndexReader* _reader;
- InvertedIndexParserType _analyser_type;
};
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 49e221212c..253a187fa4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -154,6 +154,12 @@ public:
auto chinese_analyzer = _CLNEW
lucene::analysis::LanguageBasedAnalyzer();
chinese_analyzer->setLanguage(L"chinese");
chinese_analyzer->initDict(config::inverted_index_dict_path);
+ auto mode =
get_parser_mode_string_from_properties(_index_meta->properties());
+ if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
+
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
+ } else {
+ chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
+ }
_analyzer.reset(chinese_analyzer);
} else {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
@@ -199,13 +205,9 @@ public:
}
void new_fulltext_field(const char* field_value_data, size_t
field_value_size) {
- if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+ if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
+ _parser_type == InvertedIndexParserType::PARSER_CHINESE) {
new_char_token_stream(field_value_data, field_value_size, _field);
- } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
- auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
- new lucene::util::AStringReader(field_value_data,
field_value_size),
- lucene::util::SimpleInputStreamReader::UTF8);
- _field->setValue(stringReader);
} else {
new_field_value(field_value_data, field_value_size, _field);
}
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index 3dad8ba875..a488ae82fd 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -152,6 +152,7 @@ class TabletSchema;
class TabletIndex {
public:
+ TabletIndex() = default;
void init_from_thrift(const TOlapTableIndex& index, const TabletSchema&
tablet_schema);
void init_from_thrift(const TOlapTableIndex& index, const
std::vector<int32_t>& column_uids);
void init_from_pb(const TabletIndexPB& index);
@@ -176,6 +177,13 @@ public:
return 0;
}
+ TabletIndex(const TabletIndex& other) {
+ _index_id = other._index_id;
+ _index_name = other._index_name;
+ _index_type = other._index_type;
+ _col_unique_ids = other._col_unique_ids;
+ _properties = other._properties;
+ }
private:
int64_t _index_id;
diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
new file mode 100644
index 0000000000..71489df784
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
@@ -0,0 +1,35 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+2 我爱你中国
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+3 人民可以得到更多实惠
+
+-- !sql --
+2 我爱你中国
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1 我来到北京清华大学
+
+-- !sql --
+3 人民可以得到更多实惠
+
diff --git
a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
new file mode 100644
index 0000000000..f779e0bfce
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_chinese_analyzer"){
+ // prepare test table
+
+
+ def timeout = 60000
+ def delta_time = 1000
+ def alter_res = "null"
+ def useTime = 0
+
+ def indexTblName = "chinese_analyzer_test"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName}"
+ // create 1 replica table
+ sql """
+ CREATE TABLE IF NOT EXISTS ${indexTblName}(
+ `id`int(11)NULL,
+ `c` text NULL,
+ INDEX c_idx(`c`) USING INVERTED
PROPERTIES("parser"="chinese","parser_mode"="fine_grained") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES(
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ def var_result = sql "show variables"
+ logger.info("show variales result: " + var_result )
+
+ sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠');"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '人民' ORDER BY id;"
+
+ def indexTblName2 = "chinese_analyzer_test2"
+
+ sql "DROP TABLE IF EXISTS ${indexTblName2}"
+ // create 1 replica table
+ sql """
+ CREATE TABLE IF NOT EXISTS ${indexTblName2}(
+ `id`int(11)NULL,
+ `c` text NULL,
+ INDEX c_idx(`c`) USING INVERTED
PROPERTIES("parser"="chinese","parser_mode"="coarse_grained") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES(
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠');"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
+ qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '人民' ORDER BY id;"
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]