This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 743c3185 feat(puffin): add basic data structures and constants of puffin (#588)
743c3185 is described below
commit 743c3185848dc01f5de990075f65eed904e970e4
Author: ZhaoXuan <[email protected]>
AuthorDate: Tue Mar 24 09:55:45 2026 +0800
feat(puffin): add basic data structures and constants of puffin (#588)
Add the foundational types for Puffin file format support:
- Blob, BlobMetadata, FileMetadata structs
- PuffinCompressionCodec enum with codec name conversion
- StandardBlobTypes and StandardPuffinProperties constants
- ToString functions for all types
---
src/iceberg/CMakeLists.txt | 2 +
src/iceberg/meson.build | 2 +
src/iceberg/puffin/CMakeLists.txt | 18 ++++++
src/iceberg/puffin/file_metadata.cc | 118 +++++++++++++++++++++++++++++++++++
src/iceberg/puffin/file_metadata.h | 120 ++++++++++++++++++++++++++++++++++++
src/iceberg/puffin/meson.build | 18 ++++++
6 files changed, 278 insertions(+)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index ada9b473..b503a41e 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -62,6 +62,7 @@ set(ICEBERG_SOURCES
partition_field.cc
partition_spec.cc
partition_summary.cc
+ puffin/file_metadata.cc
row/arrow_array_wrapper.cc
row/manifest_wrapper.cc
row/partition_values.cc
@@ -167,6 +168,7 @@ add_subdirectory(catalog)
add_subdirectory(data)
add_subdirectory(expression)
add_subdirectory(manifest)
+add_subdirectory(puffin)
add_subdirectory(row)
add_subdirectory(update)
add_subdirectory(util)
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 81af8dc3..2cf1065b 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -80,6 +80,7 @@ iceberg_sources = files(
'partition_field.cc',
'partition_spec.cc',
'partition_summary.cc',
+ 'puffin/file_metadata.cc',
'row/arrow_array_wrapper.cc',
'row/manifest_wrapper.cc',
'row/partition_values.cc',
@@ -222,6 +223,7 @@ install_headers(
subdir('catalog')
subdir('expression')
subdir('manifest')
+subdir('puffin')
subdir('row')
subdir('update')
subdir('util')
diff --git a/src/iceberg/puffin/CMakeLists.txt b/src/iceberg/puffin/CMakeLists.txt
new file mode 100644
index 00000000..087ea09c
--- /dev/null
+++ b/src/iceberg/puffin/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE(review): iceberg_install_all_headers is a project helper defined elsewhere;
+# presumably it installs this directory's headers under iceberg/puffin - confirm.
+iceberg_install_all_headers(iceberg/puffin)
diff --git a/src/iceberg/puffin/file_metadata.cc b/src/iceberg/puffin/file_metadata.cc
new file mode 100644
index 00000000..748329fc
--- /dev/null
+++ b/src/iceberg/puffin/file_metadata.cc
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/puffin/file_metadata.h"
+
+#include <format>
+#include <utility>
+
+#include "iceberg/util/formatter_internal.h"
+
+namespace iceberg::puffin {
+
+namespace {
+constexpr std::string_view kLz4CodecName = "lz4";    // Codec name of kLz4.
+constexpr std::string_view kZstdCodecName = "zstd";  // Codec name of kZstd.
+}  // namespace
+
+/// \brief Return the codec name string for \p codec.
+///
+/// kNone maps to the empty string; kLz4 and kZstd map to "lz4" and "zstd".
+/// A value that is not one of the enumerators reaches std::unreachable(),
+/// which is undefined behavior.
+std::string_view CodecName(PuffinCompressionCodec codec) {
+  using enum PuffinCompressionCodec;
+  switch (codec) {
+    case kNone:
+      return "";
+    case kLz4:
+      return kLz4CodecName;
+    case kZstd:
+      return kZstdCodecName;
+  }
+  // Every enumerator returned above. There is deliberately no default label, so
+  // a newly added enumerator without a case can still be diagnosed by the compiler.
+  std::unreachable();
+}
+
+/// \brief Parse a codec name into a PuffinCompressionCodec.
+///
+/// \param codec_name Codec name to look up; the comparison is exact
+/// (case-sensitive).
+/// \return kNone for an empty name, kLz4 for "lz4", kZstd for "zstd"; for any
+/// other name, the InvalidArgument error naming the unrecognized codec.
+Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
+    std::string_view codec_name) {
+  // An empty name stands for "not compressed".
+  if (codec_name.empty()) {
+    return PuffinCompressionCodec::kNone;
+  }
+
+  static constexpr std::pair<std::string_view, PuffinCompressionCodec> kNamedCodecs[] = {
+      {kLz4CodecName, PuffinCompressionCodec::kLz4},
+      {kZstdCodecName, PuffinCompressionCodec::kZstd},
+  };
+  for (const auto& [name, named_codec] : kNamedCodecs) {
+    if (codec_name == name) {
+      return named_codec;
+    }
+  }
+  return InvalidArgument("Unknown codec name: {}", codec_name);
+}
+
+/// \brief Return the codec name of \p codec as an owning std::string.
+///
+/// The text is exactly what CodecName() returns, so kNone yields an empty string.
+std::string ToString(PuffinCompressionCodec codec) {
+  const std::string_view name = CodecName(codec);
+  return std::string(name);
+}
+
+/// \brief Build a one-line, human-readable description of \p blob.
+///
+/// The payload bytes are never printed, only their count (`dataSize`).
+/// `requestedCompression` is emitted only when a codec was requested (for kNone
+/// the value text is empty), and `properties` only when the map is non-empty.
+///
+/// NOTE(review): formatting of the vector/map members presumably relies on the
+/// formatters from "iceberg/util/formatter_internal.h" - confirm. Since
+/// `properties` is an unordered_map, the order of its entries is not stable.
+std::string ToString(const Blob& blob) {
+  std::string repr = "Blob[";
+  // A back_insert_iterator only appends to `repr`, so one copy serves every call.
+  const auto out = std::back_inserter(repr);
+  std::format_to(out, "type='{}',inputFields={},", blob.type, blob.input_fields);
+  std::format_to(out, "snapshotId={},sequenceNumber={},", blob.snapshot_id,
+                 blob.sequence_number);
+  std::format_to(out, "dataSize={}", blob.data.size());
+  if (blob.requested_compression.has_value()) {
+    std::format_to(out, ",requestedCompression={}",
+                   ToString(*blob.requested_compression));
+  }
+  if (!blob.properties.empty()) {
+    std::format_to(out, ",properties={}", blob.properties);
+  }
+  repr += ']';
+  return repr;
+}
+
+/// \brief Build a one-line, human-readable description of \p blob_metadata.
+///
+/// `compressionCodec` is emitted only when the codec name is non-empty and
+/// `properties` only when the map is non-empty.
+///
+/// NOTE(review): formatting of the vector/map members presumably relies on the
+/// formatters from "iceberg/util/formatter_internal.h" - confirm. Since
+/// `properties` is an unordered_map, the order of its entries is not stable.
+std::string ToString(const BlobMetadata& blob_metadata) {
+  std::string repr = "BlobMetadata[";
+  // A back_insert_iterator only appends to `repr`, so one copy serves every call.
+  const auto out = std::back_inserter(repr);
+  std::format_to(out, "type='{}',inputFields={},", blob_metadata.type,
+                 blob_metadata.input_fields);
+  std::format_to(out, "snapshotId={},sequenceNumber={},", blob_metadata.snapshot_id,
+                 blob_metadata.sequence_number);
+  std::format_to(out, "offset={},length={}", blob_metadata.offset,
+                 blob_metadata.length);
+  if (!blob_metadata.compression_codec.empty()) {
+    std::format_to(out, ",compressionCodec='{}'", blob_metadata.compression_codec);
+  }
+  if (!blob_metadata.properties.empty()) {
+    std::format_to(out, ",properties={}", blob_metadata.properties);
+  }
+  repr += ']';
+  return repr;
+}
+
+/// \brief Build a one-line, human-readable description of \p file_metadata.
+///
+/// Blobs are rendered with ToString(const BlobMetadata&) and joined with ','
+/// inside `blobs=[...]`. `properties` is emitted only when the map is non-empty.
+///
+/// NOTE(review): formatting of the properties map presumably relies on the
+/// formatters from "iceberg/util/formatter_internal.h" - confirm.
+std::string ToString(const FileMetadata& file_metadata) {
+  std::string repr = "FileMetadata[blobs=[";
+  bool first = true;
+  for (const auto& blob_metadata : file_metadata.blobs) {
+    // Separator goes before every element except the first.
+    if (!first) {
+      repr += ',';
+    }
+    first = false;
+    repr += ToString(blob_metadata);
+  }
+  repr += ']';
+  if (!file_metadata.properties.empty()) {
+    std::format_to(std::back_inserter(repr), ",properties={}",
+                   file_metadata.properties);
+  }
+  repr += ']';
+  return repr;
+}
+
+} // namespace iceberg::puffin
diff --git a/src/iceberg/puffin/file_metadata.h b/src/iceberg/puffin/file_metadata.h
new file mode 100644
index 00000000..17ddad77
--- /dev/null
+++ b/src/iceberg/puffin/file_metadata.h
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/file_metadata.h
+/// Data structures for Puffin files.
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Compression codecs supported by Puffin files.
+enum class PuffinCompressionCodec {
+  kNone,  ///< No compression; CodecName() maps this to the empty string.
+  kLz4,   ///< LZ4; CodecName() maps this to "lz4".
+  kZstd,  ///< Zstandard; CodecName() maps this to "zstd".
+};
+
+ICEBERG_EXPORT std::string_view CodecName(PuffinCompressionCodec codec);
+
+ICEBERG_EXPORT Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
+ std::string_view codec_name);
+
+ICEBERG_EXPORT std::string ToString(PuffinCompressionCodec codec);
+
+/// \brief Standard blob type names defined by the Iceberg specification.
+struct StandardBlobTypes {
+  /// Blob type name for a serialized form of a "compact" Theta sketch produced
+  /// by the Apache DataSketches library.
+  static constexpr std::string_view kApacheDatasketchesThetaV1 =
+      "apache-datasketches-theta-v1";
+
+  /// Blob type name for a serialized deletion vector according to the Iceberg spec.
+  static constexpr std::string_view kDeletionVectorV1 = "deletion-vector-v1";
+};
+
+/// \brief Standard file-level property keys for Puffin files.
+struct StandardPuffinProperties {
+  /// Key whose value is a human-readable identification of the application
+  /// writing the file, along with its version.
+  static constexpr std::string_view kCreatedBy = "created-by";
+};
+
+/// \brief A blob in a Puffin file.
+///
+/// Plain aggregate. Scalar members carry default initializers so that a
+/// default-initialized Blob (`Blob b;`) has well-defined values and can be
+/// compared with operator== without reading indeterminate memory.
+struct ICEBERG_EXPORT Blob {
+  /// See StandardBlobTypes for known types.
+  std::string type;
+  /// Ordered list of field IDs the blob was computed from.
+  std::vector<int32_t> input_fields;
+  /// ID of the Iceberg table's snapshot the blob was computed from.
+  int64_t snapshot_id = 0;
+  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
+  int64_t sequence_number = 0;
+  /// Blob payload bytes.
+  /// NOTE(review): presumably the uncompressed payload handed to a writer - confirm.
+  std::vector<uint8_t> data;
+  /// If not set, the writer's default codec will be used.
+  std::optional<PuffinCompressionCodec> requested_compression;
+  /// Arbitrary string key/value properties attached to the blob.
+  std::unordered_map<std::string, std::string> properties;
+
+  /// Member-wise equality over all fields, including the payload bytes.
+  friend bool operator==(const Blob& lhs, const Blob& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const Blob& blob);
+
+/// \brief Metadata about a blob stored in a Puffin file footer.
+///
+/// Plain aggregate. Scalar members carry default initializers so that a
+/// default-initialized BlobMetadata has well-defined values and can be compared
+/// with operator== without reading indeterminate memory.
+struct ICEBERG_EXPORT BlobMetadata {
+  /// See StandardBlobTypes for known types.
+  std::string type;
+  /// Ordered list of field IDs the blob was computed from.
+  std::vector<int32_t> input_fields;
+  /// ID of the Iceberg table's snapshot the blob was computed from.
+  int64_t snapshot_id = 0;
+  /// Sequence number of the Iceberg table's snapshot the blob was computed from.
+  int64_t sequence_number = 0;
+  /// Offset of the blob.
+  /// NOTE(review): presumably a byte offset from the start of the file, as in
+  /// the Puffin spec's BlobMetadata - confirm.
+  int64_t offset = 0;
+  /// Length of the blob.
+  /// NOTE(review): presumably the stored (possibly compressed) size in bytes, as
+  /// in the Puffin spec's BlobMetadata - confirm.
+  int64_t length = 0;
+  /// Codec name (e.g. "lz4", "zstd"), or empty if not compressed.
+  std::string compression_codec;
+  /// Arbitrary string key/value properties attached to the blob.
+  std::unordered_map<std::string, std::string> properties;
+
+  /// Member-wise equality over all fields.
+  friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata);
+
+/// \brief Metadata about a Puffin file.
+struct ICEBERG_EXPORT FileMetadata {
+  /// Metadata of the blobs in the file, one entry per blob.
+  std::vector<BlobMetadata> blobs;
+  /// File-level string key/value properties.
+  /// NOTE(review): StandardPuffinProperties presumably lists the well-known keys - confirm.
+  std::unordered_map<std::string, std::string> properties;
+
+  /// Member-wise equality; `blobs` is compared element by element, in order.
+  friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default;
+};
+
+ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata);
+
+} // namespace iceberg::puffin
diff --git a/src/iceberg/puffin/meson.build b/src/iceberg/puffin/meson.build
new file mode 100644
index 00000000..0655156e
--- /dev/null
+++ b/src/iceberg/puffin/meson.build
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Install the public Puffin header into the 'iceberg/puffin' include subdirectory.
+install_headers(['file_metadata.h'], subdir: 'iceberg/puffin')