gavinchou commented on code in PR #52316:
URL: https://github.com/apache/doris/pull/52316#discussion_r2179951325


##########
cloud/src/common/document_message.cpp:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "common/document_message.h"
+
+#include <gen_cpp/olap_file.pb.h>
+#include <glog/logging.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "common/config.h"
+#include "common/util.h"
+#include "meta-service/codec.h"
+#include "meta-service/txn_kv.h"
+#include "meta-service/txn_kv_error.h"
+
+namespace doris::cloud {
+namespace details {
+
+static std::string lexical_next(std::string_view str) {
+    return std::string(str) + '\0';
+}
+
+static std::string lexical_end(std::string_view str) {
+    // Since '\xff' is not used in the key encoding, we can safely use it as a 
sentinel
+    // to indicate the end of a key range.
+    return std::string(str) + '\xff';
+}
+
+void document_delete_single(Transaction* txn, std::string_view key) {
+    txn->remove(key);
+}
+
+void document_delete_range(Transaction* txn, std::string_view prefix) {
+    txn->remove(prefix, lexical_end(prefix));
+}
+
+} // namespace details
+
+static bool is_splitable_field(const google::protobuf::FieldDescriptor* field) 
{
+    return field && field->is_repeated() &&
+           field->cpp_type() == 
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE;

Review Comment:
   Why do we have this restriction? Can we relax this check in the future to
support splitting non-repeated fields,
   e.g. to split a single large non-repeated field?
   
   message M1 {
     ...
     M2 non_repeated_large_field;
     ...
   }



##########
cloud/src/common/document_message.cpp:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "common/document_message.h"
+
+#include <gen_cpp/olap_file.pb.h>
+#include <glog/logging.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "common/config.h"
+#include "common/util.h"
+#include "meta-service/codec.h"
+#include "meta-service/txn_kv.h"
+#include "meta-service/txn_kv_error.h"
+
+namespace doris::cloud {
+namespace details {
+
+static std::string lexical_next(std::string_view str) {
+    return std::string(str) + '\0';
+}
+
+static std::string lexical_end(std::string_view str) {
+    // Since '\xff' is not used in the key encoding, we can safely use it as a 
sentinel
+    // to indicate the end of a key range.
+    return std::string(str) + '\xff';
+}
+
+void document_delete_single(Transaction* txn, std::string_view key) {
+    txn->remove(key);
+}
+
+void document_delete_range(Transaction* txn, std::string_view prefix) {
+    txn->remove(prefix, lexical_end(prefix));
+}
+
+} // namespace details
+
+static bool is_splitable_field(const google::protobuf::FieldDescriptor* field) 
{
+    return field && field->is_repeated() &&
+           field->cpp_type() == 
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE;
+}
+
+static bool ensure_fields_are_splitable(const google::protobuf::Message& msg,
+                                        const std::vector<int32_t>& 
split_field_ids) {
+    for (int32_t field_id : split_field_ids) {
+        const google::protobuf::FieldDescriptor* field =
+                msg.GetDescriptor()->FindFieldByNumber(field_id);
+        if (!is_splitable_field(field)) {
+            LOG(ERROR) << "Field with id " << field_id
+                       << " is not a valid message field in message "
+                       << msg.GetDescriptor()->full_name();
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool verify_split_fields(const google::protobuf::Message& msg,
+                                const SplitSchemaPB& split_schema) {
+    for (int32_t field_id : split_schema.split_field_ids()) {
+        const google::protobuf::FieldDescriptor* field =
+                msg.GetDescriptor()->FindFieldByNumber(field_id);
+        if (!is_splitable_field(field)) {
+            LOG(ERROR) << "Field with id " << field_id
+                       << " is not a valid message field in message "
+                       << msg.GetDescriptor()->full_name();
+            return false;
+        }
+        const google::protobuf::Reflection* reflection = msg.GetReflection();
+        if (field->is_repeated()) {
+            int field_size = reflection->FieldSize(msg, field);
+            if (field_size == 0) {
+                LOG(ERROR) << "Repeated field with id " << field_id << " is 
empty in message "
+                           << msg.GetDescriptor()->full_name()
+                           << ", but it is expected to be present according to 
the split schema.";
+                return false;
+            }
+        } else {
+            DCHECK(field->cpp_type() != 
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE)
+                    << "Field with id " << field_id << " is not a message 
field in message "
+                    << msg.GetDescriptor()->full_name();
+            if (!reflection->HasField(msg, field)) {
+                LOG(ERROR) << "Field with id " << field_id << " is not set in 
message "
+                           << msg.GetDescriptor()->full_name()
+                           << ", but it is expected to be present according to 
the split schema.";
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+// Split the fields of a message into multiple keys in the transaction.
+// Only supports repeated fields or message fields. The split fields and
+// the number of split keys are recorded in the `split_schema` field of the 
message.
+//
+// If a field is not set (or empty for repeated fields), it will be skipped.
+static bool document_put_split_fields(Transaction* txn, std::string_view key,
+                                      google::protobuf::Message& msg,
+                                      const std::vector<int32_t>& 
split_field_ids,
+                                      SplitSchemaPB* split_schema) {
+    using google::protobuf::FieldDescriptor;
+    using google::protobuf::Reflection;
+    using google::protobuf::Message;
+
+    DCHECK(ensure_fields_are_splitable(msg, split_field_ids))
+            << "The split fields must be a repeated or message field";
+
+    size_t num_put_keys = txn->num_put_keys();
+    for (int32_t field_id : split_field_ids) {
+        const FieldDescriptor* field = 
msg.GetDescriptor()->FindFieldByNumber(field_id);
+        const Reflection* reflection = msg.GetReflection();
+        if (field->is_repeated()) {
+            int field_size = reflection->FieldSize(msg, field);
+            if (field_size == 0) {
+                continue; // Skip empty repeated fields
+            }
+            // Split the repeated message field
+            for (int i = 0; i < field_size; ++i) {
+                std::string split_key(key);
+                encode_bytes(field->name(), &split_key);

Review Comment:
   Why do we accept split_field_ids but encode the field name?
   The drawback is that we cannot rename the fields in the future; otherwise we
should add a mechanism that refuses renaming of split fields (e.g. a UT).



##########
cloud/src/common/document_message.cpp:
##########
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "common/document_message.h"
+
+#include <gen_cpp/olap_file.pb.h>
+#include <glog/logging.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "common/config.h"
+#include "common/util.h"
+#include "meta-service/codec.h"
+#include "meta-service/txn_kv.h"
+#include "meta-service/txn_kv_error.h"
+
+namespace doris::cloud {
+namespace details {
+
+static std::string lexical_next(std::string_view str) {
+    return std::string(str) + '\0';
+}
+
+static std::string lexical_end(std::string_view str) {
+    // Since '\xff' is not used in the key encoding, we can safely use it as a 
sentinel
+    // to indicate the end of a key range.
+    return std::string(str) + '\xff';
+}
+
+void document_delete_single(Transaction* txn, std::string_view key) {
+    txn->remove(key);
+}
+
+void document_delete_range(Transaction* txn, std::string_view prefix) {
+    txn->remove(prefix, lexical_end(prefix));
+}
+
+} // namespace details
+
+static bool is_splitable_field(const google::protobuf::FieldDescriptor* field) 
{
+    return field && field->is_repeated() &&
+           field->cpp_type() == 
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE;
+}
+
+static bool ensure_fields_are_splitable(const google::protobuf::Message& msg,
+                                        const std::vector<int32_t>& 
split_field_ids) {
+    for (int32_t field_id : split_field_ids) {
+        const google::protobuf::FieldDescriptor* field =
+                msg.GetDescriptor()->FindFieldByNumber(field_id);
+        if (!is_splitable_field(field)) {
+            LOG(ERROR) << "Field with id " << field_id
+                       << " is not a valid message field in message "
+                       << msg.GetDescriptor()->full_name();
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool verify_split_fields(const google::protobuf::Message& msg,
+                                const SplitSchemaPB& split_schema) {
+    for (int32_t field_id : split_schema.split_field_ids()) {
+        const google::protobuf::FieldDescriptor* field =
+                msg.GetDescriptor()->FindFieldByNumber(field_id);
+        if (!is_splitable_field(field)) {
+            LOG(ERROR) << "Field with id " << field_id
+                       << " is not a valid message field in message "
+                       << msg.GetDescriptor()->full_name();
+            return false;
+        }
+        const google::protobuf::Reflection* reflection = msg.GetReflection();
+        if (field->is_repeated()) {
+            int field_size = reflection->FieldSize(msg, field);
+            if (field_size == 0) {
+                LOG(ERROR) << "Repeated field with id " << field_id << " is 
empty in message "
+                           << msg.GetDescriptor()->full_name()
+                           << ", but it is expected to be present according to 
the split schema.";
+                return false;
+            }
+        } else {
+            DCHECK(field->cpp_type() != 
google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE)
+                    << "Field with id " << field_id << " is not a message 
field in message "
+                    << msg.GetDescriptor()->full_name();
+            if (!reflection->HasField(msg, field)) {
+                LOG(ERROR) << "Field with id " << field_id << " is not set in 
message "
+                           << msg.GetDescriptor()->full_name()
+                           << ", but it is expected to be present according to 
the split schema.";
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+// Split the fields of a message into multiple keys in the transaction.
+// Only supports repeated fields or message fields. The split fields and
+// the number of split keys are recorded in the `split_schema` field of the 
message.
+//
+// If a field is not set (or empty for repeated fields), it will be skipped.
+static bool document_put_split_fields(Transaction* txn, std::string_view key,
+                                      google::protobuf::Message& msg,
+                                      const std::vector<int32_t>& 
split_field_ids,
+                                      SplitSchemaPB* split_schema) {
+    using google::protobuf::FieldDescriptor;
+    using google::protobuf::Reflection;
+    using google::protobuf::Message;
+
+    DCHECK(ensure_fields_are_splitable(msg, split_field_ids))
+            << "The split fields must be a repeated or message field";
+
+    size_t num_put_keys = txn->num_put_keys();
+    for (int32_t field_id : split_field_ids) {
+        const FieldDescriptor* field = 
msg.GetDescriptor()->FindFieldByNumber(field_id);
+        const Reflection* reflection = msg.GetReflection();
+        if (field->is_repeated()) {
+            int field_size = reflection->FieldSize(msg, field);
+            if (field_size == 0) {
+                continue; // Skip empty repeated fields
+            }
+            // Split the repeated message field
+            for (int i = 0; i < field_size; ++i) {
+                std::string split_key(key);
+                encode_bytes(field->name(), &split_key);
+                encode_int64(static_cast<int64_t>(i), &split_key);
+                Message* repeated_field = 
reflection->MutableRepeatedMessage(&msg, field, i);
+                if (!document_put(txn, split_key, std::move(*repeated_field))) 
{
+                    return false;
+                }
+            }
+        } else {
+            DCHECK(field->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE);
+            if (!reflection->HasField(msg, field)) {
+                // Skip if the field is not set
+                continue;
+            }
+
+            std::string split_key(key);
+            encode_bytes(field->name(), &split_key);
+            Message* message_field = reflection->MutableMessage(&msg, field);
+            if (!document_put(txn, split_key, std::move(*message_field))) {
+                return false;
+            }
+        }
+
+        // Clear the field after putting it to avoid duplication
+        reflection->ClearField(&msg, field);
+    }
+
+    split_schema->Clear();
+    split_schema->set_num_split_keys(txn->num_put_keys() - num_put_keys);
+    split_schema->mutable_split_field_ids()->Add(split_field_ids.begin(), 
split_field_ids.end());
+    return true;
+}
+
+bool document_put(Transaction* txn, std::string_view key, 
google::protobuf::Message&& msg) {
+    const google::protobuf::Descriptor* descriptor = msg.GetDescriptor();
+    const std::string& full_name = descriptor->full_name();
+
+    if (full_name == RowsetMetaCloudPB::descriptor()->full_name() &&
+        config::enable_split_rowset_meta_pb &&
+        config::split_rowset_meta_pb_size < msg.ByteSizeLong()) {
+        auto& rowset_meta = static_cast<RowsetMetaCloudPB&>(msg);
+        std::vector<int32_t> split_field_ids 
{RowsetMetaCloudPB::kSegmentsKeyBoundsFieldNumber};
+        if (!document_put_split_fields(txn, key, rowset_meta, split_field_ids,
+                                       rowset_meta.mutable_split_schema())) {
+            return false;
+        }
+    }
+
+    std::string value;
+    if (!msg.SerializeToString(&value)) {
+        LOG(ERROR) << "Failed to serialize message, key: " << hex(key)
+                   << ", message size: " << msg.ByteSizeLong() << ", message 
type: " << full_name;
+        return false;
+    }
+
+    txn->put(key, value);
+    return true;
+}
+
+// Get split fields from the iterator and populate the message.
+//
+// This function will iterate through the keys that start with the given prefix
+// and extract the fields based on the split schema defined in the message.
+TxnErrorCode document_get_split_fields(FullRangeGetIterator* iter, 
std::string_view key_prefix,
+                                       const SplitSchemaPB& split_schema,
+                                       google::protobuf::Message* msg, bool 
snapshot) {
+    using google::protobuf::FieldDescriptor;
+    using google::protobuf::Reflection;
+    using google::protobuf::Message;
+    using google::protobuf::Descriptor;
+
+    const Reflection* reflection = msg->GetReflection();
+    const Descriptor* descriptor = doris::RowsetMetaCloudPB::GetDescriptor();
+    for (auto&& kv = iter->peek(); kv.has_value(); kv = iter->peek()) {
+        auto&& [key, value] = *kv;
+        if (!key.starts_with(key_prefix)) {
+            break; // Stop if the key does not match the prefix
+        }
+
+        std::string_view suffix(key);
+        suffix.remove_prefix(key_prefix.size());
+
+        std::string field_name;
+        if (decode_bytes(&suffix, &field_name)) {
+            LOG(ERROR) << "Failed to decode the field name from key: " << 
hex(key)
+                       << ", key prefix: " << hex(key_prefix);
+            return TxnErrorCode::TXN_INVALID_DATA;
+        }
+
+        const FieldDescriptor* field = descriptor->FindFieldByName(field_name);

Review Comment:
   Ditto: this lookup is by field name, so consider what happens if the field is renamed.



##########
gensrc/proto/olap_file.proto:
##########
@@ -234,6 +241,8 @@ message RowsetMetaCloudPB {
 
     optional bool enable_inverted_index_file_info = 106;
     repeated InvertedIndexFileInfo inverted_index_file_info = 107;
+
+    optional SplitSchemaPB split_schema = 108;

Review Comment:
   Consider naming this field with a special prefix such as `__`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to