Copilot commented on code in PR #87: URL: https://github.com/apache/paimon-cpp/pull/87#discussion_r3418754675
########## src/paimon/core/manifest/index_manifest_file_handler.cpp: ########## @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/manifest/index_manifest_file_handler.h" + +#include <set> +#include <unordered_map> +#include <utility> + +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" +namespace paimon { + +using BucketIdentifier = std::tuple<BinaryRow, int32_t, std::string>; + +std::vector<IndexManifestEntry> IndexManifestFileHandler::BucketedCombiner::Combine( + const std::vector<IndexManifestEntry>& prev_index_files, + const std::vector<IndexManifestEntry>& new_index_files) const { + std::unordered_map<BucketIdentifier, IndexManifestEntry> index_entries; + for (const auto& entry : prev_index_files) { + index_entries.emplace( + BucketIdentifier(entry.partition, entry.bucket, entry.index_file->IndexType()), entry); + } Review Comment: This uses `std::unordered_map` with `BucketIdentifier` as a `std::tuple<...>` key, but the standard library does not provide `std::hash<std::tuple<...>>` (and may also require a hash for `BinaryRow`). This will fail to compile unless the project defines a custom hasher. Fix by either (mandatory): (1) switching these maps to `std::map<BucketIdentifier, ...>` (tuple has `operator<`), or (2) providing an explicit hasher (and equality, if needed) for `BucketIdentifier` (and ensuring `BinaryRow` is hashable) and passing it as the third template argument to `std::unordered_map`. ########## src/paimon/core/manifest/partition_entry.cpp: ########## @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/manifest/partition_entry.h" + +#include <algorithm> +#include <map> +#include <string> +#include <tuple> +#include <utility> + +#include "paimon/common/utils/binary_row_partition_computer.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_kind.h" + +namespace paimon { +PartitionEntry::PartitionEntry(const BinaryRow& partition, int64_t record_count, + int64_t file_size_in_bytes, int64_t file_count, + int64_t last_file_creation_time, int32_t total_buckets) + : partition_(partition), + record_count_(record_count), + file_size_in_bytes_(file_size_in_bytes), + file_count_(file_count), + last_file_creation_time_(last_file_creation_time), + total_buckets_(total_buckets) {} + +PartitionEntry PartitionEntry::Merge(const PartitionEntry& entry) const { + return PartitionEntry( + partition_, record_count_ + entry.record_count_, + file_size_in_bytes_ + entry.file_size_in_bytes_, file_count_ + entry.file_count_, + std::max(last_file_creation_time_, entry.last_file_creation_time_), entry.total_buckets_); +} Review Comment: `Merge` takes `total_buckets_` from the *rhs* (`entry.total_buckets_`). If `Merge` is called in the opposite order, the result can differ even when all other fields merge commutatively. If `total_buckets_` is expected to be identical for the same partition, use `total_buckets_` from `this` (or assert/validate equality and keep one). If there are valid cases where they differ, define a deterministic merge rule (e.g., max, or prefer non-negative) so the result is order-independent. ########## src/paimon/core/manifest/manifest_list.h: ########## @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> +#include <functional> +#include <memory> +#include <optional> +#include <string> +#include <utility> +#include <vector> + +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/utils/objects_file.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class FileFormat; +class FileSystem; +class FileStorePathFactory; +class ManifestFileMeta; +class MemoryPool; +class PathFactory; +class ReaderBuilder; +class WriterBuilder; + +/// This file includes several `ManifestFileMeta`, representing all data of the whole +/// table at the corresponding snapshot. +class ManifestList : public ObjectsFile<ManifestFileMeta> { + public: + static Result<std::unique_ptr<ManifestList>> Create( + const std::shared_ptr<FileSystem>& file_system, + const std::shared_ptr<FileFormat>& file_format, const std::string& compression, + const std::shared_ptr<FileStorePathFactory>& path_factory, + const std::shared_ptr<MemoryPool>& pool); + + /// Write several `ManifestFileMeta`s into a manifest list. + /// + /// @note This method is atomic. + Result<std::pair<std::string, int64_t>> Write(const std::vector<ManifestFileMeta>& metas); + + /// Return all `ManifestFileMeta` instances for either data or changelog manifests in this + /// snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadAllManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + PAIMON_RETURN_NOT_OK(ReadDataManifests(snapshot, manifests)); + return ReadChangelogManifests(snapshot, manifests); + } + + /// Return a `ManifestFileMeta` for each data manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadDataManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + PAIMON_RETURN_NOT_OK(ReadBaseManifests(snapshot, manifests)); + return ReadDeltaManifests(snapshot, manifests); + } + + /// Return a `ManifestFileMeta` for each base manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadBaseManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + return Read(snapshot.BaseManifestList(), /*filter=*/nullptr, manifests); + } + + /// Return a `ManifestFileMeta` for each delta manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadDeltaManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + return Read(snapshot.DeltaManifestList(), /*filter=*/nullptr, manifests); + } + + /// Return a `ManifestFileMeta` for each changelog manifest in this snapshot. Review Comment: The docstring states this method returns a `ManifestFileMeta` for each changelog manifest, but the implementation returns `NotImplemented` whenever a changelog manifest list is present and does not populate `manifests`. Update the documentation to match the current behavior (e.g., explicitly state it's not supported yet), or implement the commented-out read path and keep the method contract. ########## src/paimon/core/manifest/index_manifest_file_handler.cpp: ########## @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/manifest/index_manifest_file_handler.h" + +#include <set> +#include <unordered_map> +#include <utility> + +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" +namespace paimon { + +using BucketIdentifier = std::tuple<BinaryRow, int32_t, std::string>; Review Comment: This uses `std::unordered_map` with `BucketIdentifier` as a `std::tuple<...>` key, but the standard library does not provide `std::hash<std::tuple<...>>` (and may also require a hash for `BinaryRow`). This will fail to compile unless the project defines a custom hasher. Fix by either (mandatory): (1) switching these maps to `std::map<BucketIdentifier, ...>` (tuple has `operator<`), or (2) providing an explicit hasher (and equality, if needed) for `BucketIdentifier` (and ensuring `BinaryRow` is hashable) and passing it as the third template argument to `std::unordered_map`. ########## src/paimon/core/manifest/manifest_list.h: ########## @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> +#include <functional> +#include <memory> +#include <optional> +#include <string> +#include <utility> +#include <vector> + +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/utils/objects_file.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class FileFormat; +class FileSystem; +class FileStorePathFactory; +class ManifestFileMeta; +class MemoryPool; +class PathFactory; +class ReaderBuilder; +class WriterBuilder; + +/// This file includes several `ManifestFileMeta`, representing all data of the whole +/// table at the corresponding snapshot. +class ManifestList : public ObjectsFile<ManifestFileMeta> { + public: + static Result<std::unique_ptr<ManifestList>> Create( + const std::shared_ptr<FileSystem>& file_system, + const std::shared_ptr<FileFormat>& file_format, const std::string& compression, + const std::shared_ptr<FileStorePathFactory>& path_factory, + const std::shared_ptr<MemoryPool>& pool); + + /// Write several `ManifestFileMeta`s into a manifest list. + /// + /// @note This method is atomic. + Result<std::pair<std::string, int64_t>> Write(const std::vector<ManifestFileMeta>& metas); + + /// Return all `ManifestFileMeta` instances for either data or changelog manifests in this + /// snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadAllManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + PAIMON_RETURN_NOT_OK(ReadDataManifests(snapshot, manifests)); + return ReadChangelogManifests(snapshot, manifests); + } + + /// Return a `ManifestFileMeta` for each data manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadDataManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + PAIMON_RETURN_NOT_OK(ReadBaseManifests(snapshot, manifests)); + return ReadDeltaManifests(snapshot, manifests); + } + + /// Return a `ManifestFileMeta` for each base manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadBaseManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + return Read(snapshot.BaseManifestList(), /*filter=*/nullptr, manifests); + } + + /// Return a `ManifestFileMeta` for each delta manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadDeltaManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + return Read(snapshot.DeltaManifestList(), /*filter=*/nullptr, manifests); + } + + /// Return a `ManifestFileMeta` for each changelog manifest in this snapshot. + /// + /// @param snapshot The input snapshot. + /// @param manifests The vector to store the manifest file metadata. + /// @return Status indicating whether the operation was successful or not. + Status ReadChangelogManifests(const Snapshot& snapshot, + std::vector<ManifestFileMeta>* manifests) const { + const std::optional<std::string>& changelog_manifest_list = + snapshot.ChangelogManifestList(); + if (changelog_manifest_list) { + return Status::NotImplemented("do not support read changelog manifest list"); + // return Read(changelog_manifest_list.value(), /*filter=*/nullptr, manifests); + } else { + return Status::OK(); + } + } Review Comment: The docstring states this method returns a `ManifestFileMeta` for each changelog manifest, but the implementation returns `NotImplemented` whenever a changelog manifest list is present and does not populate `manifests`. Update the documentation to match the current behavior (e.g., explicitly state it's not supported yet), or implement the commented-out read path and keep the method contract. ########## src/paimon/core/manifest/index_manifest_entry.h: ########## @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> +#include <memory> +#include <string> + +#include "paimon/common/data/binary_row.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/manifest/file_kind.h" Review Comment: This header uses Arrow schema builders (`arrow::struct_`, `arrow::field`, `arrow::int8()`, etc.) and `fmt::format`, but does not include the corresponding Arrow and fmt headers. Relying on transitive includes can break builds when include order changes. Add the direct includes needed by this header (e.g., an Arrow header that provides `arrow::field/struct_` and `fmt/format.h`). ########## src/paimon/core/manifest/index_manifest_entry.h: ########## @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cstdint> +#include <memory> +#include <string> + +#include "paimon/common/data/binary_row.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/manifest/file_kind.h" + +namespace paimon { +/// Manifest entry for index file. +struct IndexManifestEntry { + static const std::shared_ptr<arrow::DataType>& DataType() { + static std::shared_ptr<arrow::DataType> data_type = arrow::struct_({ + arrow::field("_KIND", arrow::int8(), false), + arrow::field("_PARTITION", arrow::binary(), false), + arrow::field("_BUCKET", arrow::int32(), false), + arrow::field("_INDEX_TYPE", arrow::utf8(), false), + arrow::field("_FILE_NAME", arrow::utf8(), false), + arrow::field("_FILE_SIZE", arrow::int64(), false), + arrow::field("_ROW_COUNT", arrow::int64(), false), + arrow::field("_DELETIONS_VECTORS_RANGES", + arrow::list(arrow::field("item", DeletionVectorMeta::DataType(), true)), + true), + arrow::field("_EXTERNAL_PATH", arrow::utf8(), true), + arrow::field("_GLOBAL_INDEX", GlobalIndexMeta::DataType(), true), + }); + return data_type; + } + + IndexManifestEntry(const FileKind& _kind, const BinaryRow& _partition, int32_t _bucket, + const std::shared_ptr<IndexFileMeta>& _index_file) + : kind(_kind), partition(_partition), bucket(_bucket), index_file(_index_file) {} + + bool operator==(const IndexManifestEntry& other) const { + if (this == &other) { + return true; + } + return kind == other.kind && partition == other.partition && bucket == other.bucket && + *index_file == *(other.index_file); + } + + std::string ToString() const { + return fmt::format("IndexManifestEntry{{kind={}, partition={}, bucket={}, indexFile={}}}", + kind.ToByteValue(), partition.ToString(), bucket, + index_file->ToString()); + } Review Comment: This header uses Arrow schema builders (`arrow::struct_`, `arrow::field`, `arrow::int8()`, etc.) and `fmt::format`, but does not include the corresponding Arrow and fmt headers. Relying on transitive includes can break builds when include order changes. Add the direct includes needed by this header (e.g., an Arrow header that provides `arrow::field/struct_` and `fmt/format.h`). ########## src/paimon/core/manifest/manifest_list_test.cpp: ########## @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/manifest/manifest_list.h" + +#include <map> +#include <variant> + +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/core/utils/file_store_path_factory.h" +#include "paimon/format/file_format.h" +#include "paimon/format/file_format_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/testing/utils/binary_row_generator.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { +class ManifestListTest : public testing::Test { + public: + std::unique_ptr<ManifestList> CreateManifestList( + const std::string& file_format_str, const std::string& root_path, + const std::shared_ptr<MemoryPool>& pool) const { + std::shared_ptr<FileSystem> file_system = std::make_shared<LocalFileSystem>(); + EXPECT_OK_AND_ASSIGN(std::shared_ptr<FileFormat> file_format, + FileFormatFactory::Get(file_format_str, {})); + auto unused_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())})); + EXPECT_OK_AND_ASSIGN(std::shared_ptr<FileStorePathFactory> path_factory, + FileStorePathFactory::Create( + root_path, unused_schema, /*partition_keys=*/{}, + /*default_part_value=*/"", file_format->Identifier(), + /*data_file_prefix=*/"data-", + /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, + /*index_file_in_data_file_dir=*/false, pool)); + EXPECT_OK_AND_ASSIGN(auto manifest_list, ManifestList::Create(file_system, file_format, + "zstd", path_factory, pool)); + return manifest_list; + } + + std::vector<ManifestFileMeta> ReadManifestFileMeta( + const std::string& file_format_str, const std::string& root_path, + const std::string& file_name, const std::shared_ptr<MemoryPool>& pool) const { + auto manifest_list = CreateManifestList(file_format_str, root_path, pool); + std::vector<ManifestFileMeta> manifest_file_metas; + EXPECT_OK(manifest_list->Read(file_name, /*filter=*/nullptr, &manifest_file_metas)); + return manifest_file_metas; + } +}; + +TEST_F(ManifestListTest, TestSimple) { + auto pool = GetDefaultPool(); + auto manifest_file_metas = + ReadManifestFileMeta("orc", paimon::test::GetDataDir() + "/orc/append_09.db/append_09", + "manifest-list-f2d59cb8-3ec6-4860-b34b-050b1a533416-2", pool); + ASSERT_EQ(manifest_file_metas.size(), 4); + + std::vector<ManifestFileMeta> expected_manifest_file_metas; + auto expected_meta1 = + ManifestFileMeta("manifest-f8b15cfc-437a-4d21-a6a0-e45b639ae7ed-0", /*file_size=*/2666, + /*num_added_files=*/3, /*num_deleted_files=*/0, + BinaryRowGenerator::GenerateStats({10}, {20}, {0}, pool.get()), + /*schema_id=*/0, /*min_bucket=*/std::nullopt, /*max_bucket=*/std::nullopt, + /*min_level=*/std::nullopt, /*max_level=*/std::nullopt, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + auto expected_meta2 = + ManifestFileMeta("manifest-3a44a0da-1008-463c-914e-28d271375e24-0", /*file_size=*/2617, + /*num_added_files=*/2, /*num_deleted_files=*/0, + BinaryRowGenerator::GenerateStats({10}, {20}, {0}, pool.get()), + /*schema_id=*/0, /*min_bucket=*/std::nullopt, /*max_bucket=*/std::nullopt, + /*min_level=*/std::nullopt, /*max_level=*/std::nullopt, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + auto expected_meta3 = + ManifestFileMeta("manifest-c5904353-0236-46a2-891f-62a326dd8e5e-0", /*file_size=*/2360, + /*num_added_files=*/1, /*num_deleted_files=*/0, + BinaryRowGenerator::GenerateStats({10}, {10}, {0}, pool.get()), + /*schema_id=*/0, /*min_bucket=*/std::nullopt, /*max_bucket=*/std::nullopt, + /*min_level=*/std::nullopt, /*max_level=*/std::nullopt, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + auto expected_meta4 = + ManifestFileMeta("manifest-3ea5ee21-d399-4f1c-a749-2fc63dbf0852-0", /*file_size=*/2366, + /*num_added_files=*/1, /*num_deleted_files=*/0, + BinaryRowGenerator::GenerateStats({10}, {10}, {0}, pool.get()), + /*schema_id=*/0, /*min_bucket=*/std::nullopt, /*max_bucket=*/std::nullopt, + /*min_level=*/std::nullopt, /*max_level=*/std::nullopt, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + expected_manifest_file_metas.emplace_back(expected_meta1); + expected_manifest_file_metas.emplace_back(expected_meta2); + expected_manifest_file_metas.emplace_back(expected_meta3); + expected_manifest_file_metas.emplace_back(expected_meta4); + ASSERT_EQ(manifest_file_metas, expected_manifest_file_metas); +} + +TEST_F(ManifestListTest, TestReadWithBucketsAndLevel) { + auto pool = GetDefaultPool(); + auto manifest_file_metas = + ReadManifestFileMeta("orc", + paimon::test::GetDataDir() + + "/orc/pk_table_with_total_buckets.db/pk_table_with_total_buckets", + "manifest-list-673be11d-f405-4921-84dc-6f53028c55ea-1", pool); + ASSERT_EQ(manifest_file_metas.size(), 1); + + std::vector<ManifestFileMeta> expected_manifest_file_metas; + auto expected_meta1 = + ManifestFileMeta("manifest-2026dc88-7f67-4944-8c33-ea775d34108c-0", /*file_size=*/3046, + /*num_added_files=*/2, /*num_deleted_files=*/0, + BinaryRowGenerator::GenerateStats({10}, {10}, {0}, pool.get()), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/1, + /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + expected_manifest_file_metas.emplace_back(expected_meta1); + ASSERT_EQ(manifest_file_metas, expected_manifest_file_metas); +} + +TEST_F(ManifestListTest, TestEmptyManifestList) { + auto pool = GetDefaultPool(); + auto manifest_file_metas = + ReadManifestFileMeta("orc", paimon::test::GetDataDir() + "/orc/append_09.db/append_09", + "manifest-list-616d1847-a02c-495f-9cca-2c8b7def0fec-0", pool); + ASSERT_EQ(manifest_file_metas.size(), 0); +} + +TEST_F(ManifestListTest, TestManifestListCompatibleWithJavaPaimon09) { + auto pool = GetDefaultPool(); + auto manifest_file_metas = ReadManifestFileMeta("avro", paimon::test::GetDataDir() + "/avro", + "avro_manifest_list_09", pool); + ASSERT_EQ(manifest_file_metas.size(), 1); + + std::vector<ManifestFileMeta> expected_manifest_file_metas; + auto expected_meta = + ManifestFileMeta("manifest-b09d5588-5614-46e2-b441-f196d29e60dc-0", /*file_size=*/2010, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/std::nullopt, /*max_bucket=*/std::nullopt, + /*min_level=*/std::nullopt, /*max_level=*/std::nullopt, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + expected_manifest_file_metas.emplace_back(expected_meta); + ASSERT_EQ(manifest_file_metas, expected_manifest_file_metas); +} + +TEST_F(ManifestListTest, TestManifestListCompatibleWithJavaPaimon11) { + auto pool = GetDefaultPool(); + auto manifest_file_metas = ReadManifestFileMeta("avro", paimon::test::GetDataDir() + "/avro", + "avro_manifest_list_11", pool); + ASSERT_EQ(manifest_file_metas.size(), 1); + + std::vector<ManifestFileMeta> expected_manifest_file_metas; + auto expected_meta = ManifestFileMeta( + "manifest-3c977409-ebff-4d68-8265-86d237e24e9a-0", /*file_size=*/2081, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + expected_manifest_file_metas.emplace_back(expected_meta); + ASSERT_EQ(manifest_file_metas, expected_manifest_file_metas); +} + +TEST_F(ManifestListTest, TestReadWithMinAndMaxRowId) { + auto pool = GetDefaultPool(); + // test read meta from java + auto manifest_file_metas = ReadManifestFileMeta( + "orc", + paimon::test::GetDataDir() + "orc/append_with_global_index.db/append_with_global_index/", + "manifest-list-2bccccf8-9f5e-48f2-b706-5b33f8c3bfc0-0", pool); Review Comment: This path concatenation is missing a separator compared to other tests in this file (most use `GetDataDir() + \"/orc/...\"`). If `GetDataDir()` does not end with `/`, this will produce an invalid path and the test will fail. Make the separator consistent (either ensure `GetDataDir()` always ends with `/` or include the leading `/` here). ########## src/paimon/core/manifest/index_manifest_file_handler.h: ########## @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <map> +#include <memory> +#include <optional> +#include <string> +#include <vector> + +#include "paimon/core/manifest/index_manifest_file.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { +/// IndexManifestFile Handler. +class IndexManifestFileHandler { + public: + IndexManifestFileHandler() = delete; + ~IndexManifestFileHandler() = delete; + + static Result<std::string> Write(const std::optional<std::string>& previous_index_manifest, + const std::vector<IndexManifestEntry>& new_index_entries, + int32_t bucket_mode, IndexManifestFile* index_manifest_file); + + private: + class IndexManifestFileCombiner { + public: + virtual ~IndexManifestFileCombiner() = default; + virtual std::vector<IndexManifestEntry> Combine( + const std::vector<IndexManifestEntry>& prev_index_files, + const std::vector<IndexManifestEntry>& new_index_files) const = 0; + }; + + /// Combine previous and new global index files by file `BucketIdentifier`. + class BucketedCombiner : public IndexManifestFileCombiner { + public: + std::vector<IndexManifestEntry> Combine( + const std::vector<IndexManifestEntry>& prev_index_files, + const std::vector<IndexManifestEntry>& new_index_files) const override; + }; + + /// Combine previous and new global index files by file name. + class GlobalFileNameCombiner : public IndexManifestFileCombiner { + public: + std::vector<IndexManifestEntry> Combine( + const std::vector<IndexManifestEntry>& prev_index_files, + const std::vector<IndexManifestEntry>& new_index_files) const override; + }; + + static std::map<std::string, std::vector<IndexManifestEntry>> SeparateIndexEntries( + const std::vector<IndexManifestEntry>& index_entries); + + static Result<std::unique_ptr<IndexManifestFileCombiner>> GetIndexManifestFileCombine( + const std::string& index_type, int32_t bucket_mode); Review Comment: The method name `GetIndexManifestFileCombine` reads like a verb phrase and is inconsistent with the returned type (`IndexManifestFileCombiner`). Consider renaming to `GetIndexManifestFileCombiner` (or similar) to make the intent clearer. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
