This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 78bd61e6d5e [feat](inverted index) Adding Storage Format V3 for
Inverted Index (#44414)
78bd61e6d5e is described below
commit 78bd61e6d5e7fb1e869cbbd7c1f987a2dff01918
Author: zzzxl <[email protected]>
AuthorDate: Wed Dec 4 17:12:30 2024 +0800
[feat](inverted index) Adding Storage Format V3 for Inverted Index (#44414)
Problem Summary:
1. "Adds support for compressing the inverted index
position information and dictionary information."
2. "Position information compression must be enabled by setting
inverted_index_storage_format to v3 when creating the table."
e.g.
```
CREATE TABLE tbl (
...
) ENGINE=OLAP
DUPLICATE KEY(`x`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"inverted_index_storage_format" = "V3"
);
```
3. "The dictionary compression feature requires setting
inverted_index_storage_format to v3 and configuring dict_compression to
true in the properties."
e.g.
```
INDEX x_idx (`x`) USING INVERTED PROPERTIES("dict_compression" = "true")
COMMENT ''
```
---
be/src/clucene | 2 +-
be/src/olap/inverted_index_parser.cpp | 9 ++
be/src/olap/inverted_index_parser.h | 5 +
be/src/olap/rowset/beta_rowset.cpp | 18 +++-
.../char_filter/char_replace_char_filter.h | 2 +-
.../segment_v2/inverted_index_file_reader.cpp | 8 +-
.../rowset/segment_v2/inverted_index_file_reader.h | 2 +-
.../segment_v2/inverted_index_file_writer.cpp | 22 ++--
.../rowset/segment_v2/inverted_index_file_writer.h | 10 +-
.../rowset/segment_v2/inverted_index_writer.cpp | 22 ++++
be/src/olap/tablet_meta.cpp | 3 +
.../inverted_index/query/phrase_query_test.cpp | 1 -
.../segment_v2/inverted_index_file_writer_test.cpp | 8 +-
.../apache/doris/analysis/InvertedIndexUtil.java | 12 ++-
.../cloud/datasource/CloudInternalCatalog.java | 6 +-
.../apache/doris/common/util/PropertyAnalyzer.java | 6 ++
.../doris/analysis/InvertedIndexUtilTest.java | 47 +++++++++
.../apache/doris/common/PropertyAnalyzerTest.java | 50 +++++++++
gensrc/proto/olap_file.proto | 1 +
gensrc/thrift/Types.thrift | 3 +-
.../inverted_index_p0/test_inverted_index_v3.out | 25 +++++
.../test_inverted_index_v3_fault_injection.groovy | 60 +++++++++++
.../test_inverted_index_v3.groovy | 117 +++++++++++++++++++++
23 files changed, 406 insertions(+), 33 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 48fa9cc4ec3..a506dbb6c52 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf
+Subproject commit a506dbb6c523aa65044eb1c527a066d236172543
diff --git a/be/src/olap/inverted_index_parser.cpp
b/be/src/olap/inverted_index_parser.cpp
index f7e511970d9..f1de5a5e0c1 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -136,4 +136,13 @@ std::string get_parser_stopwords_from_properties(
}
}
+std::string get_parser_dict_compression_from_properties(
+ const std::map<std::string, std::string>& properties) {
+ if (properties.find(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY) !=
properties.end()) {
+ return properties.at(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY);
+ } else {
+ return "";
+ }
+}
+
} // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 0b8426d74c7..f1f85995a20 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -83,6 +83,8 @@ const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY =
"lower_case";
const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY =
"dict_compression";
+
std::string inverted_index_parser_type_to_string(InvertedIndexParserType
parser_type);
InvertedIndexParserType get_inverted_index_parser_type_from_string(const
std::string& parser_str);
@@ -119,4 +121,7 @@ std::string get_parser_lowercase_from_properties(
std::string get_parser_stopwords_from_properties(
const std::map<std::string, std::string>& properties);
+std::string get_parser_dict_compression_from_properties(
+ const std::map<std::string, std::string>& properties);
+
} // namespace doris
diff --git a/be/src/olap/rowset/beta_rowset.cpp
b/be/src/olap/rowset/beta_rowset.cpp
index bbb2ca72b4a..cd52deed0c8 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -703,10 +703,24 @@ Status
BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value,
rapidjson::Document::AllocatorType&
allocator) {
const auto& fs = _rowset_meta->fs();
auto storage_format = _schema->get_inverted_index_storage_format();
- auto format_str = storage_format == InvertedIndexStorageFormatPB::V1 ?
"V1" : "V2";
+ std::string format_str;
+ switch (storage_format) {
+ case InvertedIndexStorageFormatPB::V1:
+ format_str = "V1";
+ break;
+ case InvertedIndexStorageFormatPB::V2:
+ format_str = "V2";
+ break;
+ case InvertedIndexStorageFormatPB::V3:
+ format_str = "V3";
+ break;
+ default:
+ return Status::InternalError("inverted index storage format error");
+ break;
+ }
auto rs_id = rowset_id().to_string();
rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(),
allocator), allocator);
- rowset_value->AddMember("index_storage_format",
rapidjson::Value(format_str, allocator),
+ rowset_value->AddMember("index_storage_format",
rapidjson::Value(format_str.c_str(), allocator),
allocator);
rapidjson::Value segments(rapidjson::kArrayType);
for (int seg_id = 0; seg_id < num_segments(); ++seg_id) {
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
index d9e5080d2d5..1e5e6f5d5ce 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
@@ -17,7 +17,7 @@
#pragma once
-#include <CLucene.h>
+#include <CLucene.h> // IWYU pragma: keep
#include <CLucene/analysis/CharFilter.h>
#include <bitset>
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
index 113833d560f..8d480829a0c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
@@ -30,8 +30,8 @@ namespace doris::segment_v2 {
Status InvertedIndexFileReader::init(int32_t read_buffer_size) {
if (!_inited) {
_read_buffer_size = read_buffer_size;
- if (_storage_format == InvertedIndexStorageFormatPB::V2) {
- auto st = _init_from_v2(read_buffer_size);
+ if (_storage_format >= InvertedIndexStorageFormatPB::V2) {
+ auto st = _init_from(read_buffer_size);
if (!st.ok()) {
return st;
}
@@ -41,7 +41,7 @@ Status InvertedIndexFileReader::init(int32_t
read_buffer_size) {
return Status::OK();
}
-Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) {
+Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size) {
auto index_file_full_path =
InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
std::unique_lock<std::shared_mutex> lock(_mutex); // Lock for writing
@@ -79,7 +79,7 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t
read_buffer_size) {
// 3. read file
int32_t version = _stream->readInt(); // Read version number
- if (version == InvertedIndexStorageFormatPB::V2) {
+ if (version >= InvertedIndexStorageFormatPB::V2) {
DCHECK(version == _storage_format);
int32_t numIndices = _stream->readInt(); // Read number of indices
ReaderFileEntry* entry = nullptr;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
index 3b7161c7643..443d40cfaf0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
@@ -70,7 +70,7 @@ public:
int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 :
_stream->length(); }
private:
- Status _init_from_v2(int32_t read_buffer_size);
+ Status _init_from(int32_t read_buffer_size);
Result<std::unique_ptr<DorisCompoundReader>> _open(int64_t index_id,
const std::string&
index_suffix) const;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
index bb373be5ee9..4d6892aa785 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
@@ -150,7 +150,7 @@ Status InvertedIndexFileWriter::close() {
}
} else {
try {
- RETURN_IF_ERROR(write_v2());
+ RETURN_IF_ERROR(write());
for (const auto& entry : _indices_dirs) {
const auto& dir = entry.second;
// delete index path, which contains separated inverted index
files
@@ -293,7 +293,7 @@ Status InvertedIndexFileWriter::write_v1() {
return Status::OK();
}
-Status InvertedIndexFileWriter::write_v2() {
+Status InvertedIndexFileWriter::write() {
std::unique_ptr<lucene::store::Directory, DirectoryDeleter> out_dir =
nullptr;
std::unique_ptr<lucene::store::IndexOutput> compound_file_output = nullptr;
ErrorContext error_context;
@@ -301,10 +301,10 @@ Status InvertedIndexFileWriter::write_v2() {
// Calculate header length and initialize offset
int64_t current_offset = headerLength();
// Prepare file metadata
- auto file_metadata = prepare_file_metadata_v2(current_offset);
+ auto file_metadata = prepare_file_metadata(current_offset);
// Create output stream
- auto result = create_output_stream_v2();
+ auto result = create_output_stream();
out_dir = std::move(result.first);
compound_file_output = std::move(result.second);
@@ -315,7 +315,7 @@ Status InvertedIndexFileWriter::write_v2() {
write_index_headers_and_metadata(compound_file_output.get(),
file_metadata);
// Copy file data
- copy_files_data_v2(compound_file_output.get(), file_metadata);
+ copy_files_data(compound_file_output.get(), file_metadata);
_total_file_size = compound_file_output->getFilePointer();
_file_info.set_index_size(_total_file_size);
@@ -470,7 +470,7 @@ void
InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutpu
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
std::unique_ptr<lucene::store::IndexOutput>>
-InvertedIndexFileWriter::create_output_stream_v2() {
+InvertedIndexFileWriter::create_output_stream() {
io::Path index_path
{InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)};
auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs,
index_path.parent_path().c_str());
@@ -486,15 +486,15 @@ InvertedIndexFileWriter::create_output_stream_v2() {
void
InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput*
output) {
// Write the version number
- output->writeInt(InvertedIndexStorageFormatPB::V2);
+ output->writeInt(_storage_format);
// Write the number of indices
const auto num_indices = static_cast<uint32_t>(_indices_dirs.size());
output->writeInt(num_indices);
}
-std::vector<InvertedIndexFileWriter::FileMetadata>
-InvertedIndexFileWriter::prepare_file_metadata_v2(int64_t& current_offset) {
+std::vector<InvertedIndexFileWriter::FileMetadata>
InvertedIndexFileWriter::prepare_file_metadata(
+ int64_t& current_offset) {
std::vector<FileMetadata> file_metadata;
for (const auto& entry : _indices_dirs) {
@@ -546,8 +546,8 @@ void
InvertedIndexFileWriter::write_index_headers_and_metadata(
}
}
-void InvertedIndexFileWriter::copy_files_data_v2(lucene::store::IndexOutput*
output,
- const
std::vector<FileMetadata>& file_metadata) {
+void InvertedIndexFileWriter::copy_files_data(lucene::store::IndexOutput*
output,
+ const std::vector<FileMetadata>&
file_metadata) {
const int64_t buffer_length = 16384;
uint8_t buffer[buffer_length];
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
index ba42ffdceb1..ab7cdbff152 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
@@ -71,7 +71,7 @@ public:
Status delete_index(const TabletIndex* index_meta);
Status initialize(InvertedIndexDirectoryMap& indices_dirs);
virtual ~InvertedIndexFileWriter() = default;
- Status write_v2();
+ Status write();
Status write_v1();
Status close();
const InvertedIndexFileInfo* get_index_file_info() const {
@@ -122,7 +122,7 @@ private:
// Helper functions specific to write_v2
virtual std::pair<std::unique_ptr<lucene::store::Directory,
DirectoryDeleter>,
std::unique_ptr<lucene::store::IndexOutput>>
- create_output_stream_v2();
+ create_output_stream();
void write_version_and_indices_count(lucene::store::IndexOutput* output);
struct FileMetadata {
int64_t index_id;
@@ -141,11 +141,11 @@ private:
length(len),
directory(dir) {}
};
- std::vector<FileMetadata> prepare_file_metadata_v2(int64_t&
current_offset);
+ std::vector<FileMetadata> prepare_file_metadata(int64_t& current_offset);
virtual void write_index_headers_and_metadata(lucene::store::IndexOutput*
output,
const
std::vector<FileMetadata>& file_metadata);
- void copy_files_data_v2(lucene::store::IndexOutput* output,
- const std::vector<FileMetadata>& file_metadata);
+ void copy_files_data(lucene::store::IndexOutput* output,
+ const std::vector<FileMetadata>& file_metadata);
Status _insert_directory_into_map(int64_t index_id, const std::string&
index_suffix,
std::shared_ptr<DorisFSDirectory> dir);
// Member variables...
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 86a8f89e4c9..02edf2f1976 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -212,6 +212,28 @@ public:
(*field)->setOmitTermFreqAndPositions(
!(get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES));
+ DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_v3", {
+ if (_index_file_writer->get_storage_format() !=
InvertedIndexStorageFormatPB::V3) {
+ return
Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "debug point:
InvertedIndexColumnWriterImpl::create_field_v3 error");
+ }
+ })
+ if (_index_file_writer->get_storage_format() >=
InvertedIndexStorageFormatPB::V3) {
+ (*field)->setIndexVersion(IndexVersion::kV3);
+ // Only effective in v3
+ std::string dict_compression =
+
get_parser_dict_compression_from_properties(_index_meta->properties());
+
DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_dic_compression", {
+ if (dict_compression != INVERTED_INDEX_PARSER_TRUE) {
+ return
Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "debug point: "
+
"InvertedIndexColumnWriterImpl::create_field_dic_compression error");
+ }
+ })
+ if (dict_compression == INVERTED_INDEX_PARSER_TRUE) {
+ (*field)->updateFlag(FlagBits::DICT_COMPRESS);
+ }
+ }
return Status::OK();
}
diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp
index fc9fc034b0b..d1746836e23 100644
--- a/be/src/olap/tablet_meta.cpp
+++ b/be/src/olap/tablet_meta.cpp
@@ -203,6 +203,9 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t
partition_id, int64_t tablet_id
case TInvertedIndexFileStorageFormat::V2:
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
break;
+ case TInvertedIndexFileStorageFormat::V3:
+
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
+ break;
default:
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
break;
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
index f3fb9763c9b..b8e11bece7b 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
@@ -62,7 +62,6 @@ TEST_F(PhraseQueryTest, test_parser_info) {
EXPECT_EQ(query_info.slop, res3);
EXPECT_EQ(query_info.ordered, res4);
EXPECT_EQ(query_info.additional_terms.size(), res5);
- std::cout << "--- 1 ---: " << query_info.to_string() << std::endl;
};
// "english/history off.gif ~20+" sequential_opt = true
diff --git a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
index b454080434a..41703d49d5e 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
@@ -506,7 +506,7 @@ TEST_F(InvertedIndexFileWriterTest,
WriteV2ExceptionHandlingTest) {
EXPECT_CALL(writer_mock, write_index_headers_and_metadata(::testing::_,
::testing::_))
.WillOnce(::testing::Throw(CLuceneError(CL_ERR_IO, "Simulated
exception", false)));
- Status status = writer_mock.write_v2();
+ Status status = writer_mock.write();
ASSERT_FALSE(status.ok());
ASSERT_EQ(status.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR);
}
@@ -523,7 +523,7 @@ public:
MOCK_METHOD((std::pair<std::unique_ptr<lucene::store::Directory,
DirectoryDeleter>,
std::unique_ptr<lucene::store::IndexOutput>>),
- create_output_stream_v2, (), (override));
+ create_output_stream, (), (override));
};
class InvertedIndexFileWriterMockCreateOutputStreamV1 : public
InvertedIndexFileWriter {
@@ -622,7 +622,7 @@ TEST_F(InvertedIndexFileWriterTest, WriteV2OutputTest) {
auto compound_file_output =
std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2);
compound_file_output->init(file_writer.get());
- EXPECT_CALL(writer_mock, create_output_stream_v2())
+ EXPECT_CALL(writer_mock, create_output_stream())
.WillOnce(::testing::Invoke(
[&]() ->
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
std::unique_ptr<lucene::store::IndexOutput>> {
@@ -680,7 +680,7 @@ TEST_F(InvertedIndexFileWriterTest,
WriteV2OutputCloseErrorTest) {
auto compound_file_output =
std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2);
compound_file_output->init(file_writer.get());
- EXPECT_CALL(writer_mock, create_output_stream_v2())
+ EXPECT_CALL(writer_mock, create_output_stream())
.WillOnce(::testing::Invoke(
[&]() ->
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
std::unique_ptr<lucene::store::IndexOutput>> {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index abba2762d56..dd6a1a7612a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -54,6 +54,8 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+ public static String INVERTED_INDEX_DICT_COMPRESSION_KEY =
"dict_compression";
+
public static String getInvertedIndexParser(Map<String, String>
properties) {
String parser = properties == null ? null :
properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
@@ -157,7 +159,8 @@ public class InvertedIndexUtil {
INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
INVERTED_INDEX_PARSER_LOWERCASE_KEY,
- INVERTED_INDEX_PARSER_STOPWORDS_KEY
+ INVERTED_INDEX_PARSER_STOPWORDS_KEY,
+ INVERTED_INDEX_DICT_COMPRESSION_KEY
));
for (String key : properties.keySet()) {
@@ -174,6 +177,7 @@ public class InvertedIndexUtil {
String ignoreAbove =
properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
+ String dictCompression =
properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY);
if (parser != null &&
!parser.matches("none|english|unicode|chinese|standard")) {
throw new AnalysisException("Invalid inverted index 'parser'
value: " + parser
@@ -221,5 +225,11 @@ public class InvertedIndexUtil {
throw new AnalysisException("Invalid inverted index 'stopWords'
value: " + stopWords
+ ", stopWords must be none");
}
+
+ if (dictCompression != null && !dictCompression.matches("true|false"))
{
+ throw new AnalysisException(
+ "Invalid inverted index 'dict_compression' value: "
+ + dictCompression + ", dict_compression must be
true or false");
+ }
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
index b8a364f9449..e14b4efb0d5 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
@@ -354,8 +354,12 @@ public class CloudInternalCatalog extends InternalCatalog {
if (invertedIndexFileStorageFormat != null) {
if (invertedIndexFileStorageFormat ==
TInvertedIndexFileStorageFormat.V1) {
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1);
- } else {
+ } else if (invertedIndexFileStorageFormat ==
TInvertedIndexFileStorageFormat.V2) {
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2);
+ } else if (invertedIndexFileStorageFormat ==
TInvertedIndexFileStorageFormat.V3) {
+
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3);
+ } else {
+ throw new DdlException("invalid inverted index storage
format");
}
}
schemaBuilder.setRowStorePageSize(pageSize);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
index 5721db0c27e..915b5d48f01 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
@@ -1126,6 +1126,8 @@ public class PropertyAnalyzer {
} else {
if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
return TInvertedIndexFileStorageFormat.V1;
+ } else if
(Config.inverted_index_storage_format.equalsIgnoreCase("V3")) {
+ return TInvertedIndexFileStorageFormat.V3;
} else {
return TInvertedIndexFileStorageFormat.V2;
}
@@ -1135,9 +1137,13 @@ public class PropertyAnalyzer {
return TInvertedIndexFileStorageFormat.V1;
} else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v2")) {
return TInvertedIndexFileStorageFormat.V2;
+ } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) {
+ return TInvertedIndexFileStorageFormat.V3;
} else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default"))
{
if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
return TInvertedIndexFileStorageFormat.V1;
+ } else if
(Config.inverted_index_storage_format.equalsIgnoreCase("V3")) {
+ return TInvertedIndexFileStorageFormat.V3;
} else {
return TInvertedIndexFileStorageFormat.V2;
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java
b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java
new file mode 100644
index 00000000000..a9be242cf3f
--- /dev/null
+++
b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.common.AnalysisException;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class InvertedIndexUtilTest {
+ @Test
+ public void testCheckInvertedIndexProperties() throws AnalysisException {
+ Map<String, String> properties = new HashMap<>();
+ properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY,
"true");
+
+ InvertedIndexUtil.checkInvertedIndexProperties(properties);
+
+ properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY,
"invalid_value");
+ try {
+ InvertedIndexUtil.checkInvertedIndexProperties(properties);
+ Assertions.fail("Expected AnalysisException was not thrown");
+ } catch (AnalysisException e) {
+ Assertions.assertEquals(
+ "errCode = 2, detailMessage = Invalid inverted index
'dict_compression' value: invalid_value, "
+ + "dict_compression must be true or false",
+ e.getMessage());
+ }
+ }
+}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
index 041ca89bfc5..6d708aa0826 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
@@ -28,6 +28,7 @@ import org.apache.doris.catalog.Type;
import org.apache.doris.common.util.PropertyAnalyzer;
import org.apache.doris.common.util.TimeUtils;
import org.apache.doris.resource.Tag;
+import org.apache.doris.thrift.TInvertedIndexFileStorageFormat;
import org.apache.doris.thrift.TStorageFormat;
import org.apache.doris.thrift.TStorageMedium;
@@ -37,6 +38,7 @@ import com.google.common.collect.Sets;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
+import org.junit.jupiter.api.Assertions;
import org.junit.rules.ExpectedException;
import java.time.Instant;
@@ -236,4 +238,52 @@ public class PropertyAnalyzerTest {
Assert.assertTrue(e.getMessage().contains("Storage page size must
be between 4KB and 10MB"));
}
}
+
+ @Test
+ public void testAnalyzeInvertedIndexFileStorageFormat() throws
AnalysisException {
+ TInvertedIndexFileStorageFormat result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(null);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+ Config.inverted_index_storage_format = "V1";
+ result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(new
HashMap<>());
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+ Map<String, String> propertiesWithV1 = new HashMap<>();
+
propertiesWithV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"v1");
+ result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV1);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+ Map<String, String> propertiesWithV2 = new HashMap<>();
+
propertiesWithV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"v2");
+ result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV2);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+ Map<String, String> propertiesWithV3 = new HashMap<>();
+
propertiesWithV3.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"v3");
+ result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV3);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V3, result);
+
+ Config.inverted_index_storage_format = "V1";
+ Map<String, String> propertiesWithDefaultV1 = new HashMap<>();
+
propertiesWithDefaultV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"default");
+ result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV1);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+ Config.inverted_index_storage_format = "V2";
+ Map<String, String> propertiesWithDefaultV2 = new HashMap<>();
+
propertiesWithDefaultV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"default");
+ result =
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV2);
+ Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+ Map<String, String> propertiesWithUnknown = new HashMap<>();
+
propertiesWithUnknown.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
"unknown_format");
+ try {
+
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithUnknown);
+ Assertions.fail("Expected AnalysisException was not thrown");
+ } catch (AnalysisException e) {
+ Assertions.assertEquals(
+ "errCode = 2, detailMessage = unknown inverted index
storage format: unknown_format",
+ e.getMessage());
+ }
+ }
}
diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto
index 259f9f2861a..41f8727ed1b 100644
--- a/gensrc/proto/olap_file.proto
+++ b/gensrc/proto/olap_file.proto
@@ -343,6 +343,7 @@ enum IndexType {
enum InvertedIndexStorageFormatPB {
V1 = 0;
V2 = 1;
+ V3 = 2;
}
message TabletIndexPB {
diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift
index 1912f950587..623da9ce067 100644
--- a/gensrc/thrift/Types.thrift
+++ b/gensrc/thrift/Types.thrift
@@ -124,7 +124,8 @@ enum TStorageBackendType {
enum TInvertedIndexFileStorageFormat {
DEFAULT = 0, // Default format, unspecified storage method.
V1 = 1, // Index per idx: Each index is stored separately based on
its identifier.
- V2 = 2 // Segment id per idx: Indexes are organized based on segment
identifiers, grouping indexes by their associated segment.
+ V2 = 2 // Segment id per idx: Indexes are organized based on segment
identifiers, grouping indexes by their associated segment.
+ V3 = 3 // Position and dictionary compression
}
struct TScalarType {
diff --git a/regression-test/data/inverted_index_p0/test_inverted_index_v3.out
b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out
new file mode 100644
index 00000000000..9dc20f3e0e0
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+238
+
+-- !sql --
+104
+
+-- !sql --
+104
+
+-- !sql --
+105
+
+-- !sql --
+238
+
+-- !sql --
+104
+
+-- !sql --
+104
+
+-- !sql --
+105
+
diff --git
a/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy
b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy
new file mode 100644
index 00000000000..98c0e110964
--- /dev/null
+++
b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_v3_fault_injection", "nonConcurrent"){
+ def indexTbName1 = "test_inverted_index_v3_fault_injection"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX clientip_idx (`clientip`) COMMENT '',
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "inverted_index_storage_format" = "V3"
+ );
+ """
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+
+ sql """ INSERT INTO ${indexTbName1} VALUES (1, '40.135.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+ }
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+ GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+
+ sql """ INSERT INTO ${indexTbName1} VALUES (2, '40.135.0.0', 'GET
/images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+ GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+ }
+}
\ No newline at end of file
diff --git a/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
new file mode 100644
index 00000000000..ea7dd0b595f
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_v3", "p0"){
+ def indexTbName1 = "test_inverted_index_v3_1"
+ def indexTbName2 = "test_inverted_index_v3_2"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+ sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "inverted_index_storage_format" = "V2"
+ );
+ """
+
+ sql """
+ CREATE TABLE ${indexTbName2} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "inverted_index_storage_format" = "V3"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // if declared a check callback, the default check condition will ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json',
'documents-1000.json')
+ load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json',
'documents-1000.json')
+
+ sql "sync"
+
+ qt_sql """ select count() from ${indexTbName1} where request match_any
'hm bg'; """
+ qt_sql """ select count() from ${indexTbName1} where request match_all
'hm bg'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'hm bg'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase_prefix 'hm bg'; """
+
+ qt_sql """ select count() from ${indexTbName2} where request match_any
'hm bg'; """
+ qt_sql """ select count() from ${indexTbName2} where request match_all
'hm bg'; """
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase 'hm bg'; """
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase_prefix 'hm bg'; """
+
+ } finally {
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]