This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new fea57a03ee2 branch-4.0: [refact](inverted index) refact compound idx writer #59219 (#59364)
fea57a03ee2 is described below
commit fea57a03ee2e0fde2936f161706f2c9e3c55c2ba
Author: Jack <[email protected]>
AuthorDate: Fri Dec 26 18:15:51 2025 +0800
branch-4.0: [refact](inverted index) refact compound idx writer #59219 (#59364)
cherry pick from #59219
---
.../rowset/segment_v2/index_storage_format_v2.cpp | 39 +++++++++-------------
.../rowset/segment_v2/index_storage_format_v2.h | 7 ++--
.../segment_v2/inverted_index_fs_directory.cpp | 20 +++++++++++
.../segment_v2/inverted_index_fs_directory.h | 5 +++
.../segment_v2/inverted_index_file_writer_test.cpp | 23 +++++--------
5 files changed, 53 insertions(+), 41 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/index_storage_format_v2.cpp b/be/src/olap/rowset/segment_v2/index_storage_format_v2.cpp
index 2342e15d34b..0b0e81da191 100644
--- a/be/src/olap/rowset/segment_v2/index_storage_format_v2.cpp
+++ b/be/src/olap/rowset/segment_v2/index_storage_format_v2.cpp
@@ -39,7 +39,6 @@ IndexStorageFormatV2::IndexStorageFormatV2(IndexFileWriter*
index_file_writer)
: IndexStorageFormat(index_file_writer) {}
Status IndexStorageFormatV2::write() {
- std::unique_ptr<lucene::store::Directory, DirectoryDeleter> out_dir =
nullptr;
std::unique_ptr<lucene::store::IndexOutput> compound_file_output = nullptr;
ErrorContext error_context;
try {
@@ -48,11 +47,13 @@ Status IndexStorageFormatV2::write() {
// Prepare file metadata
auto file_metadata = prepare_file_metadata(current_offset);
- // Create output stream
- auto result = create_output_stream();
- out_dir = std::move(result.first);
- compound_file_output = std::move(result.second);
- VLOG_DEBUG << fmt::format("Output compound index file to streams: {}",
out_dir->toString());
+ // Create output stream directly without directory operations.
+ // This is important for cloud storage (like S3) where directory operations are not
+ // supported or unnecessary.
+ compound_file_output = create_output_stream();
+ auto index_path = InvertedIndexDescriptor::get_index_file_path_v2(
+ _index_file_writer->_index_path_prefix);
+ VLOG_DEBUG << fmt::format("Output compound index file to: {}",
index_path);
// Write version and number of indices
write_version_and_indices_count(compound_file_output.get());
@@ -75,10 +76,7 @@ Status IndexStorageFormatV2::write() {
error_context.err_msg.append(err.what());
LOG(ERROR) << error_context.err_msg;
}
- FINALLY({
- FINALLY_CLOSE(compound_file_output);
- FINALLY_CLOSE(out_dir);
- })
+ FINALLY({ FINALLY_CLOSE(compound_file_output); })
return Status::OK();
}
@@ -177,21 +175,16 @@ std::vector<FileMetadata>
IndexStorageFormatV2::prepare_file_metadata(int64_t& c
return file_metadata;
}
-std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
- std::unique_ptr<lucene::store::IndexOutput>>
-IndexStorageFormatV2::create_output_stream() {
- io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(
- _index_file_writer->_index_path_prefix)};
-
- auto* out_dir =
DorisFSDirectoryFactory::getDirectory(_index_file_writer->_fs,
-
index_path.parent_path().c_str());
- out_dir->set_file_writer_opts(_index_file_writer->_opts);
- std::unique_ptr<lucene::store::Directory, DirectoryDeleter>
out_dir_ptr(out_dir);
-
+std::unique_ptr<lucene::store::IndexOutput>
IndexStorageFormatV2::create_output_stream() {
+ // For V2 format, we create the output stream directly using the file writer,
+ // bypassing the directory layer entirely. This optimization is especially important
+ // for cloud storage (like S3) where:
+ // 1. Directory operations (exists, create_directory) are unnecessary overhead
+ // 2. S3 doesn't have a real directory concept - directories are just key prefixes
+ // 3. The file writer is already created and ready to use
DCHECK(_index_file_writer->_idx_v2_writer != nullptr)
<< "inverted index file writer v2 is nullptr";
- auto compound_file_output =
out_dir->createOutputV2(_index_file_writer->_idx_v2_writer.get());
- return {std::move(out_dir_ptr), std::move(compound_file_output)};
+ return
DorisFSDirectory::FSIndexOutputV2::create(_index_file_writer->_idx_v2_writer.get());
}
void
IndexStorageFormatV2::write_version_and_indices_count(lucene::store::IndexOutput*
output) {
diff --git a/be/src/olap/rowset/segment_v2/index_storage_format_v2.h b/be/src/olap/rowset/segment_v2/index_storage_format_v2.h
index 9b899afac28..1684855af62 100644
--- a/be/src/olap/rowset/segment_v2/index_storage_format_v2.h
+++ b/be/src/olap/rowset/segment_v2/index_storage_format_v2.h
@@ -43,9 +43,10 @@ public:
private:
int64_t header_length();
std::vector<FileMetadata> prepare_file_metadata(int64_t& current_offset);
- virtual std::pair<std::unique_ptr<lucene::store::Directory,
DirectoryDeleter>,
- std::unique_ptr<lucene::store::IndexOutput>>
- create_output_stream();
+ // Creates the output stream for writing the compound file.
+ // For V2 format, we directly create FSIndexOutputV2 using the file writer,
+ // avoiding unnecessary directory operations (important for cloud storage like S3).
+ virtual std::unique_ptr<lucene::store::IndexOutput> create_output_stream();
void write_version_and_indices_count(lucene::store::IndexOutput* output);
virtual void write_index_headers_and_metadata(lucene::store::IndexOutput*
output,
const
std::vector<FileMetadata>& file_metadata);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp
index 936dda66e85..33a0ad801a8 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp
@@ -453,6 +453,26 @@ int64_t DorisFSDirectory::FSIndexOutputV2::length() const {
return _index_v2_file_writer->bytes_appended();
}
+std::unique_ptr<lucene::store::IndexOutput>
DorisFSDirectory::FSIndexOutputV2::create(
+ io::FileWriter* file_writer) {
+ auto ret = std::make_unique<FSIndexOutputV2>();
+ ErrorContext error_context;
+ try {
+ ret->init(file_writer);
+ } catch (CLuceneError& err) {
+ error_context.eptr = std::current_exception();
+ error_context.err_msg.append("FSIndexOutputV2::create init error: ");
+ error_context.err_msg.append(err.what());
+ LOG(ERROR) << error_context.err_msg;
+ }
+ FINALLY_EXCEPTION({
+ if (error_context.eptr) {
+ FINALLY_CLOSE(ret);
+ }
+ })
+ return ret;
+}
+
DorisFSDirectory::DorisFSDirectory() {
filemode = 0644;
this->lockFactory = nullptr;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h
index ec606e52911..7b1ac0bdf55 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h
@@ -246,6 +246,11 @@ public:
~FSIndexOutputV2() override;
void close() override;
int64_t length() const override;
+
+ // Static factory method to create FSIndexOutputV2 directly without Directory object.
+ // This is useful for compound file creation where we already have a FileWriter
+ // and don't need directory operations (especially for cloud storage like S3).
+ static std::unique_ptr<lucene::store::IndexOutput> create(io::FileWriter*
file_writer);
};
/**
diff --git a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
index 61d7be3099c..5ac8b773aef 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
@@ -692,9 +692,8 @@ public:
IndexStorageFormatV2MockCreateOutputStream(IndexFileWriter*
index_file_writer)
: IndexStorageFormatV2(index_file_writer) {}
- MOCK_METHOD((std::pair<std::unique_ptr<lucene::store::Directory,
DirectoryDeleter>,
- std::unique_ptr<lucene::store::IndexOutput>>),
- create_output_stream, (), (override));
+ MOCK_METHOD((std::unique_ptr<lucene::store::IndexOutput>),
create_output_stream, (),
+ (override));
};
class IndexFileWriterMockCreateOutputStreamV1 : public IndexFileWriter {
@@ -808,12 +807,9 @@ TEST_F(IndexFileWriterTest, WriteV2OutputTest) {
EXPECT_CALL(
*(IndexStorageFormatV2MockCreateOutputStream*)writer_mock._index_storage_format.get(),
create_output_stream())
- .WillOnce(::testing::Invoke(
- [&]() ->
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
-
std::unique_ptr<lucene::store::IndexOutput>> {
- return std::make_pair(std::move(out_dir_ptr),
- std::move(compound_file_output));
- }));
+ .WillOnce(::testing::Invoke([&]() ->
std::unique_ptr<lucene::store::IndexOutput> {
+ return std::move(compound_file_output);
+ }));
int64_t index_id = 1;
std::string index_suffix = "suffix1";
@@ -871,12 +867,9 @@ TEST_F(IndexFileWriterTest, WriteV2OutputCloseErrorTest) {
EXPECT_CALL(
*(IndexStorageFormatV2MockCreateOutputStream*)writer_mock._index_storage_format.get(),
create_output_stream())
- .WillOnce(::testing::Invoke(
- [&]() ->
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
-
std::unique_ptr<lucene::store::IndexOutput>> {
- return std::make_pair(std::move(out_dir_ptr),
- std::move(compound_file_output));
- }));
+ .WillOnce(::testing::Invoke([&]() ->
std::unique_ptr<lucene::store::IndexOutput> {
+ return std::move(compound_file_output);
+ }));
int64_t index_id = 1;
std::string index_suffix = "suffix1";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]