wgtmac commented on code in PR #603: URL: https://github.com/apache/iceberg-cpp/pull/603#discussion_r3022851060
########## src/iceberg/puffin/json_serde_internal.h: ########## @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/json_serde_internal.h +/// JSON serialization/deserialization for Puffin file metadata. + +#include <string> +#include <string_view> + +#include <nlohmann/json_fwd.hpp> + +#include "iceberg/iceberg_export.h" +#include "iceberg/puffin/file_metadata.h" Review Comment: Can we remove this and add an `iceberg/puffin/type_fwd.h` for forward declarations? ########## src/iceberg/util/endian.h: ########## @@ -94,4 +95,19 @@ constexpr T FromBigEndian(T value) { } } +/// \brief Write a value in little-endian format to a buffer. +template <EndianConvertible T> +void WriteLittleEndian(T value, void* output) { + auto le = ToLittleEndian(value); + std::memcpy(output, &le, sizeof(le)); +} + +/// \brief Read a value in little-endian format from a buffer. 
+template <EndianConvertible T> +T ReadLittleEndian(const void* input) { Review Comment: ditto ########## src/iceberg/puffin/file_metadata.h: ########## @@ -107,6 +108,10 @@ struct ICEBERG_EXPORT BlobMetadata { ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata); +inline std::ostream& operator<<(std::ostream& os, const BlobMetadata& b) { Review Comment: We don't have any precedent for adding this. It is preferable to use std::format wherever possible. ########## src/iceberg/puffin/puffin_format.h: ########## @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/puffin_format.h +/// Puffin file format constants and utilities. + +#include <array> +#include <cstdint> +#include <span> + +#include "iceberg/iceberg_export.h" +#include "iceberg/puffin/file_metadata.h" +#include "iceberg/result.h" + +namespace iceberg::puffin { + +/// \brief Puffin file format constants. 
+struct ICEBERG_EXPORT PuffinFormat { + /// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1) + static constexpr std::array<uint8_t, 4> kMagicV1 = {0x50, 0x46, 0x41, 0x31}; + + static constexpr int32_t kMagicLength = 4; + static constexpr int32_t kFooterStartMagicOffset = 0; + static constexpr int32_t kFooterStartMagicLength = kMagicLength; + static constexpr int32_t kFooterStructPayloadSizeOffset = 0; + static constexpr int32_t kFooterStructFlagsOffset = kFooterStructPayloadSizeOffset + 4; + static constexpr int32_t kFooterStructFlagsLength = 4; + static constexpr int32_t kFooterStructMagicOffset = + kFooterStructFlagsOffset + kFooterStructFlagsLength; + + /// Total length of the footer struct: payload_size(4) + flags(4) + magic(4) + static constexpr int32_t kFooterStructLength = kFooterStructMagicOffset + kMagicLength; + + /// Default compression codec for footer payload. + static constexpr PuffinCompressionCodec kDefaultFooterCompressionCodec = + PuffinCompressionCodec::kLz4; +}; + +/// \brief Footer flags for Puffin files. +enum class PuffinFlag : uint8_t { + /// Whether the footer payload is compressed. + kFooterPayloadCompressed = 0, +}; + +/// \brief Check if a flag is set in the flags bytes. +ICEBERG_EXPORT bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag flag); + +/// \brief Set a flag in the flags bytes. +ICEBERG_EXPORT void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag); + +/// \brief Read a 32-bit integer from a buffer at the given offset in little-endian +/// format. +ICEBERG_EXPORT Result<int32_t> ReadInt32LittleEndian(std::span<const uint8_t> data, Review Comment: Should we move this to `endian.h`? If not, I'd be inclined to remove this for now and then add it back as an internal function in the source file where it is called. ########## src/iceberg/puffin/json_serde_internal.h: ########## @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/json_serde_internal.h +/// JSON serialization/deserialization for Puffin file metadata. + +#include <string> +#include <string_view> + +#include <nlohmann/json_fwd.hpp> + +#include "iceberg/iceberg_export.h" +#include "iceberg/puffin/file_metadata.h" +#include "iceberg/result.h" + +namespace iceberg::puffin { + +/// \brief Serialize a BlobMetadata to JSON. +ICEBERG_EXPORT nlohmann::json ToJson(const BlobMetadata& blob_metadata); + +/// \brief Deserialize a BlobMetadata from JSON. +ICEBERG_EXPORT Result<BlobMetadata> BlobMetadataFromJson(const nlohmann::json& json); + +/// \brief Serialize a FileMetadata to JSON. +ICEBERG_EXPORT nlohmann::json ToJson(const FileMetadata& file_metadata); + +/// \brief Deserialize a FileMetadata from JSON. +ICEBERG_EXPORT Result<FileMetadata> FileMetadataFromJson(const nlohmann::json& json); + +/// \brief Serialize a FileMetadata to a JSON string. +ICEBERG_EXPORT std::string ToJsonString(const FileMetadata& file_metadata, Review Comment: Do we really need to specialize this? ########## src/iceberg/test/puffin_format_test.cc: ########## @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/puffin/puffin_format.h" + +#include <array> +#include <cstdint> + +#include <gtest/gtest.h> + +#include "iceberg/util/endian.h" + +namespace iceberg::puffin { + +TEST(PuffinFormatTest, ByteOrderRoundTrip) { + std::array<uint8_t, 4> buf{}; Review Comment: This test seems unrelated to puffin. Move it to `endian_test.cc`? ########## src/iceberg/test/puffin_json_test.cc: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <string> + +#include <gtest/gtest.h> +#include <nlohmann/json.hpp> + +#include "iceberg/puffin/file_metadata.h" +#include "iceberg/puffin/json_serde_internal.h" +#include "iceberg/test/matchers.h" + +namespace iceberg::puffin { + +// ==================== BlobMetadata Parameterized Tests ==================== + +struct BlobMetadataJsonParam { + std::string name; + BlobMetadata blob; + std::string expected_json; +}; + +class BlobMetadataJsonTest : public ::testing::TestWithParam<BlobMetadataJsonParam> {}; + +TEST_P(BlobMetadataJsonTest, RoundTrip) { + const auto& param = GetParam(); + auto expected = nlohmann::json::parse(param.expected_json); + + EXPECT_EQ(ToJson(param.blob), expected); + EXPECT_THAT(BlobMetadataFromJson(expected), HasValue(::testing::Eq(param.blob))); +} + +INSTANTIATE_TEST_SUITE_P(PuffinJson, BlobMetadataJsonTest, + ::testing::Values( + BlobMetadataJsonParam{ + .name = "AllFields", + .blob = {.type = "apache-datasketches-theta-v1", + .input_fields = {1, 2}, + .snapshot_id = 12345, + .sequence_number = 67, + .offset = 100, + .length = 200, + .compression_codec = "zstd", + .properties = {{"key", "value"}}}, + .expected_json = R"({ + "type": "apache-datasketches-theta-v1", + "fields": [1, 2], + "snapshot-id": 12345, + "sequence-number": 67, + "offset": 100, + "length": 200, + "compression-codec": "zstd", + "properties": {"key": "value"} + })"}, + BlobMetadataJsonParam{.name = "MinimalFields", + .blob = {.type = "test-type", + .input_fields = {1}, + .snapshot_id = 100, + .sequence_number = 1, + .offset = 0, + .length = 50}, + .expected_json = R"({ + "type": "test-type", + "fields": [1], + "snapshot-id": 100, + "sequence-number": 1, + "offset": 0, + "length": 50 + })"}), + [](const ::testing::TestParamInfo<BlobMetadataJsonParam>& info) { + return info.param.name; + }); + +// ==================== BlobMetadata Invalid JSON Tests ==================== + +struct InvalidBlobMetadataJsonParam { + std::string name; + std::string json; +}; + 
+class InvalidBlobMetadataJsonTest + : public ::testing::TestWithParam<InvalidBlobMetadataJsonParam> {}; + +TEST_P(InvalidBlobMetadataJsonTest, DeserializeFails) { + auto json = nlohmann::json::parse(GetParam().json); + EXPECT_THAT(BlobMetadataFromJson(json), IsError(ErrorKind::kJsonParseError)); +} + +INSTANTIATE_TEST_SUITE_P( + PuffinJson, InvalidBlobMetadataJsonTest, + ::testing::Values( + InvalidBlobMetadataJsonParam{.name = "MissingType", Review Comment: Add a MissingSequenceNumber test case for completeness. ########## src/iceberg/util/endian.h: ########## @@ -94,4 +95,19 @@ constexpr T FromBigEndian(T value) { } } +/// \brief Write a value in little-endian format to a buffer. Review Comment: It would be good to add a comment saying that it is the caller's responsibility to guarantee that `output` has sufficient length; otherwise the behavior is undefined (UB). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
