Copilot commented on code in PR #59: URL: https://github.com/apache/paimon-cpp/pull/59#discussion_r3377265726
########## src/paimon/core/options/expire_config.h: ########## @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <cstdint> + +namespace paimon { + +class ExpireConfig { + public: + ExpireConfig() = default; + ExpireConfig(int32_t snapshot_retain_max, int32_t snapshot_retain_min, + int64_t snapshot_time_retain_ms, int32_t snapshot_max_deletes, + bool snapshot_clean_empty_directories) + : snapshot_retain_max_(snapshot_retain_max), + snapshot_retain_min_(snapshot_retain_min), + snapshot_time_retain_ms_(snapshot_time_retain_ms), + snapshot_max_deletes_(snapshot_max_deletes), + snapshot_clean_empty_directories_(snapshot_clean_empty_directories) {} Review Comment: The default constructor leaves all member fields uninitialized, so calling any getter after `ExpireConfig()` is undefined behavior. Add in-class default initializers (or initialize in the default constructor) to safe defaults, or remove the default constructor if it should always be explicitly configured. ########## src/paimon/core/utils/partition_path_utils_test.cpp: ########## @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/core/utils/partition_path_utils.h" + +#include "gtest/gtest.h" +#include "paimon/status.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +TEST(PartitionPathUtilsTest, TestEmptyInput) { + std::vector<std::pair<std::string, std::string>> partition_spec; + ASSERT_OK_AND_ASSIGN(std::string partition_path_str, + PartitionPathUtils::GeneratePartitionPath(partition_spec)); + ASSERT_EQ(partition_path_str, ""); +} + +TEST(PartitionPathUtilsTest, TestSimple) { + std::vector<std::pair<std::string, std::string>> partition_spec = { + {"f1", "v1"}, + {"f2", "这是一段不是特别长的中文"}, + {"f0", "v0"}, + }; + ASSERT_OK_AND_ASSIGN(std::string partition_path_str, + PartitionPathUtils::GeneratePartitionPath(partition_spec)); + ASSERT_EQ(partition_path_str, "f1=v1/f2=这是一段不是特别长的中文/f0=v0/"); +} + +TEST(PartitionPathUtilsTest, TestCharToEscape) { + std::vector<std::pair<std::string, std::string>> partition_spec = { + {"f0", "v0"}, + {"f1", "v1="}, + {"/f2?", "这是一段不是特别长\n的[中文]"}, + }; + ASSERT_OK_AND_ASSIGN(std::string partition_path_str, + PartitionPathUtils::GeneratePartitionPath(partition_spec)); + ASSERT_EQ(partition_path_str, "f0=v0/f1=v1%3D/%2Ff2%3F=这是一段不是特别长%0A的%5B中文%5D/"); +} + +TEST(PartitionPathUtilsTest, testGenerateHierarchicalPartitionPaths) { + std::vector<std::pair<std::string, std::string>> partition_spec = { + {"f2", "这是一段不是特别长的中文"}, + {"f0", "v0"}, + {"f1", "v1"}, + }; + ASSERT_OK_AND_ASSIGN(std::vector<std::string> partition_path_strs, + PartitionPathUtils::GenerateHierarchicalPartitionPaths(partition_spec)); + ASSERT_EQ(partition_path_strs.size(), 3u); + ASSERT_EQ(partition_path_strs[0], "f2=这是一段不是特别长的中文/"); + ASSERT_EQ(partition_path_strs[1], "f2=这是一段不是特别长的中文/f0=v0/"); + ASSERT_EQ(partition_path_strs[2], "f2=这是一段不是特别长的中文/f0=v0/f1=v1/"); +} + +TEST(PartitionPathUtilsTest, EscapeChar) { + std::stringstream ss; + PartitionPathUtils::EscapeChar(' ', &ss); + ASSERT_EQ(ss.str(), "%20"); Review Comment: `PartitionPathUtils::EscapeChar` is declared `private` in `partition_path_utils.h`, so this test will not compile. Prefer testing escaping via the public API (`EscapePathName`) or make `EscapeChar` public/test-accessible (e.g., a public test helper or `friend` for the test). ########## src/paimon/core/utils/file_utils.cpp: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/core/utils/file_utils.h" + +#include <algorithm> +#include <optional> +#include <utility> + +#include "fmt/format.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/fs/file_system.h" +#include "paimon/result.h" + +namespace paimon { + +Status FileUtils::ListVersionedFiles(const std::shared_ptr<FileSystem>& fs, const std::string& dir, + const std::string& prefix, std::vector<int64_t>* files) { + std::vector<std::string> file_strs; + PAIMON_RETURN_NOT_OK(ListOriginalVersionedFiles(fs, dir, prefix, &file_strs)); + for (const auto& file_str : file_strs) { + std::optional<int64_t> file_number = StringUtils::StringToValue<int64_t>(file_str); + if (file_number == std::nullopt) { + return Status::Invalid(fmt::format("fail to convert {} to number", file_str)); + } + files->emplace_back(file_number.value()); + } + return Status::OK(); +} Review Comment: `ListVersionedFiles` returns versions in filesystem iteration order, which can be nondeterministic across filesystems/runs. If callers expect increasing versions (common for snapshot/manifest listing), sort `*files` before returning (and consider also sorting `file_status_list` or `file_strs` in the other helpers for consistent behavior). ########## src/paimon/core/utils/offset_row.h: ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> + +#include "fmt/format.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/data/decimal.h" +#include "paimon/data/timestamp.h" +#include "paimon/memory/bytes.h" +#include "paimon/result.h" + +namespace paimon { +class Bytes; +class InternalArray; +class InternalMap; +class RowKind; + +/// A `InternalRow` to wrap row with offset. +class OffsetRow : public InternalRow { + public: + OffsetRow(const InternalRow& row, int32_t arity, int32_t offset) + : row_(row), arity_(arity), offset_(offset) {} + + int32_t GetFieldCount() const override { + return arity_; + } + + Result<const RowKind*> GetRowKind() const override { + return row_.GetRowKind(); + } + + void SetRowKind(const RowKind* kind) override {} Review Comment: `SetRowKind` is a silent no-op, which can break callers expecting `InternalRow` semantics (setting row kind through the wrapper has no effect). If mutation should be supported, store a non-const `InternalRow&` (or pointer) and forward `SetRowKind(kind)`; otherwise, consider documenting the wrapper as read-only and failing fast (e.g., assert) to avoid hidden behavior differences. ########## src/paimon/core/utils/partition_path_utils.cpp: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/core/utils/partition_path_utils.h" + +#include <array> +#include <cstdint> +#include <optional> + +#include "paimon/status.h" + +namespace paimon { + +const std::bitset<128>& PartitionPathUtils::CharToEscape() { + constexpr auto char_to_escape = []() { + std::bitset<128> bitset; + for (char c = 0; c < ' '; c++) { + bitset.set(static_cast<unsigned char>(c)); + } + std::array<char, 48> clist = { + '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', + '\u0009', '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010', + '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', + '\u0019', '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', '"', + '#', '%', '\'', '*', '/', ':', '=', '?', + '\\', '\u007F', '{', '}', '[', ']', '^'}; + for (char c : clist) { + bitset.set(static_cast<unsigned char>(c)); + } + return bitset; + }; + static std::bitset<128> bitset = char_to_escape(); + return bitset; +} + +Result<std::string> PartitionPathUtils::GeneratePartitionPath( + const std::vector<std::pair<std::string, std::string>>& partition_spec) { + if (partition_spec.empty()) { + return std::string(); + } + std::stringstream ss; + int32_t i = 0; + for (const auto& [key, value] : partition_spec) { + if (i > 0) { + ss << PATH_SEPARATOR; + } + PAIMON_ASSIGN_OR_RAISE(std::string key_esc, EscapePathName(key)); + PAIMON_ASSIGN_OR_RAISE(std::string value_esc, EscapePathName(value)); + ss << key_esc << "=" << value_esc; + i++; + } + ss << PATH_SEPARATOR; + return ss.str(); +} + +Result<std::string> PartitionPathUtils::EscapePathName(const std::string& path) { + if (path.empty()) { + return Status::Invalid("path should not be empty"); + } + + std::optional<std::stringstream> ss; + for (size_t i = 0; i < path.size(); i++) { + char c = path[i]; + if (NeedsEscaping(c)) { + if (ss == std::nullopt) { + ss = std::stringstream(); + for (size_t j = 0; j < i; j++) { + ss.value() << path[j]; + } + } + EscapeChar(c, &ss.value()); + } else if (ss != std::nullopt) { + ss.value() << c; + } + } + if (ss == std::nullopt) { + return path; + } + return ss.value().str(); +} + +void PartitionPathUtils::EscapeChar(char c, std::stringstream* ss_ptr) { + auto& ss = *ss_ptr; + ss << '%'; + auto uc = static_cast<unsigned char>(c); + if (uc < 16) { + ss << '0'; + } + std::stringstream hex_ss; + hex_ss << std::hex << std::uppercase << static_cast<int32_t>(uc); + ss << hex_ss.str(); +} Review Comment: This allocates a new `std::stringstream` for every escaped character. For long paths or many partitions this can be unnecessarily expensive; consider formatting directly into `ss` (e.g., using `std::hex` + `std::setw(2)` + `std::setfill('0')`, or a small lookup table for byte-to-hex) to avoid per-character stream construction. ########## src/paimon/core/options/compress_options.h: ########## @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once +#include <cstdint> +#include <string> + +namespace paimon { +/// Options of compression. +struct CompressOptions { + std::string compress; + int32_t zstd_level; +}; Review Comment: `zstd_level` is left uninitialized by default construction of `CompressOptions`, which can lead to undefined behavior if read before assignment. Provide a default value (e.g., `int32_t zstd_level = 0;`) and consider defaulting `compress` as well if consumers expect a known default. ########## src/paimon/core/options/lookup_strategy.h: ########## @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +namespace paimon { +/// Strategy for lookup. +struct LookupStrategy { + public: + bool operator==(const LookupStrategy& other) const { + if (this == &other) { + return true; + } + return is_first_row == other.is_first_row && produce_changelog == other.produce_changelog && + deletion_vector == other.deletion_vector && need_lookup == other.need_lookup; + } + + static LookupStrategy From(bool is_first_row, bool produce_changelog, bool deletion_vector, + bool force_lookup) { + return LookupStrategy(is_first_row, produce_changelog, deletion_vector, force_lookup); + } + + const bool need_lookup; + const bool is_first_row; + const bool produce_changelog; + const bool deletion_vector; Review Comment: Using `const` data members makes `LookupStrategy` non-assignable, which can be unnecessarily restrictive for a small value-type (e.g., storing/updating it in containers/structs). Consider removing `const` qualifiers and keeping the API-level immutability by constructing via `From(...)` (or make `need_lookup` a computed method instead of stored state). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
