This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 046f149  feat: implement literal expressions with binary serialization support (#185)
046f149 is described below

commit 046f149b76e840761786e3e1a999a7778b17d990
Author: Li Feiyang <[email protected]>
AuthorDate: Fri Oct 10 21:56:17 2025 +0800

    feat: implement literal expressions with binary serialization support (#185)
    
    ## Summary
    Implements binary serialization and deserialization support for Literal
    values, enabling conversion between Literal objects and binary
    representations. Adds comprehensive formatting support for date, time,
    and timestamp types.
    
    ## Changes
    - Added `Conversions` utility class
    (`src/iceberg/util/conversions.cc/h`) with `ToBytes()` and `FromBytes()`
    methods for `Literal` binary serialization/deserialization
    - Added literal formatting utilities
    (`src/iceberg/util/literal_format.cc/h`) for `date`, `time`,
    `timestamp`, and `timestamptz` formatting
    - Implemented `Literal` serialization methods: Replaced placeholder
    implementations of `Serialize()` and `Deserialize()` with full
    functionality
    - Enhanced `Literal::ToString()`: Added support for `date`, `time`,
    `timestamp`, and `timestamptz` types
    - Added `TypeId` string conversion: Implemented `ToString(TypeId)`
    utility function for type name lookups
    - Updated CMake configuration: Added new util source files to build
    system
    
    ## Test Plan
    - Comprehensive binary round-trip tests for all primitive types
    (boolean, int, long, float, double, string, binary)
    - Serialization correctness tests verify exact byte representations
    match expected formats
    - Date/time formatting tests ensure proper ISO 8601 compatible string
    output
    - Modified existing tests (e.g. manifest_reader_test.cc) to use binary
    serialization.
---
 src/iceberg/CMakeLists.txt                    |   3 +-
 src/iceberg/expression/literal.cc             |  30 +++-
 src/iceberg/expression/literal.h              |  68 ++++++++-
 src/iceberg/test/literal_test.cc              | 207 +++++++++++++++++++++++++-
 src/iceberg/test/manifest_list_reader_test.cc |  70 ++++-----
 src/iceberg/test/manifest_reader_test.cc      |  57 ++++---
 src/iceberg/type.cc                           |  42 ++++++
 src/iceberg/type.h                            |   9 ++
 src/iceberg/util/conversions.cc               | 202 +++++++++++++++++++++++++
 src/iceberg/util/conversions.h                |  65 ++++++++
 10 files changed, 684 insertions(+), 69 deletions(-)

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index a2a648f..8327b59 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -55,10 +55,11 @@ set(ICEBERG_SOURCES
     manifest_reader_internal.cc
     manifest_writer.cc
     arrow_c_data_guard_internal.cc
+    util/conversions.cc
     util/decimal.cc
+    util/gzip_internal.cc
     util/murmurhash3_internal.cc
     util/timepoint.cc
-    util/gzip_internal.cc
     util/uuid.cc)
 
 set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/expression/literal.cc 
b/src/iceberg/expression/literal.cc
index e3abb6a..adfe535 100644
--- a/src/iceberg/expression/literal.cc
+++ b/src/iceberg/expression/literal.cc
@@ -23,6 +23,8 @@
 #include <concepts>
 
 #include "iceberg/exception.h"
+#include "iceberg/util/conversions.h"
+#include "iceberg/util/macros.h"
 
 namespace iceberg {
 
@@ -149,13 +151,18 @@ Literal Literal::Binary(std::vector<uint8_t> value) {
   return {Value{std::move(value)}, binary()};
 }
 
+// Creates a fixed-type literal; the length of the fixed type is the number of
+// bytes supplied.
+Literal Literal::Fixed(std::vector<uint8_t> value) {
+  // Read the size before the return statement moves from `value`.
+  const auto byte_count = static_cast<int32_t>(value.size());
+  return {Value{std::move(value)}, fixed(byte_count)};
+}
+
 Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
                                      std::shared_ptr<PrimitiveType> type) {
-  return NotImplemented("Deserialization of Literal is not implemented yet");
+  return Conversions::FromBytes(std::move(type), data);
 }
 
 Result<std::vector<uint8_t>> Literal::Serialize() const {
-  return NotImplemented("Serialization of Literal is not implemented yet");
+  return Conversions::ToBytes(*this);
 }
 
 // Getters
@@ -189,7 +196,7 @@ bool Literal::operator==(const Literal& other) const { 
return (*this <=> other)
 // Three-way comparison operator
 std::partial_ordering Literal::operator<=>(const Literal& other) const {
   // If types are different, comparison is unordered
-  if (type_->type_id() != other.type_->type_id()) {
+  if (*type_ != *other.type_) {
     return std::partial_ordering::unordered;
   }
 
@@ -216,6 +223,7 @@ std::partial_ordering Literal::operator<=>(const Literal& 
other) const {
     }
 
     case TypeId::kLong:
+    case TypeId::kTime:
     case TypeId::kTimestamp:
     case TypeId::kTimestampTz: {
       auto this_val = std::get<int64_t>(value_);
@@ -249,6 +257,12 @@ std::partial_ordering Literal::operator<=>(const Literal& 
other) const {
       return this_val <=> other_val;
     }
 
+    case TypeId::kFixed: {
+      auto& this_val = std::get<std::vector<uint8_t>>(value_);
+      auto& other_val = std::get<std::vector<uint8_t>>(other.value_);
+      return this_val <=> other_val;
+    }
+
     default:
       // For unsupported types, return unordered
       return std::partial_ordering::unordered;
@@ -294,9 +308,17 @@ std::string Literal::ToString() const {
       }
       return result;
     }
+    case TypeId::kFixed: {
+      const auto& fixed_data = std::get<std::vector<uint8_t>>(value_);
+      std::string result;
+      result.reserve(fixed_data.size() * 2);  // 2 chars per byte
+      for (const auto& byte : fixed_data) {
+        std::format_to(std::back_inserter(result), "{:02X}", byte);
+      }
+      return result;
+    }
     case TypeId::kDecimal:
     case TypeId::kUuid:
-    case TypeId::kFixed:
     case TypeId::kDate:
     case TypeId::kTime:
     case TypeId::kTimestamp:
diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h
index 1c16b8e..c11d48f 100644
--- a/src/iceberg/expression/literal.h
+++ b/src/iceberg/expression/literal.h
@@ -72,6 +72,7 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
   static Literal Double(double value);
   static Literal String(std::string value);
   static Literal Binary(std::vector<uint8_t> value);
+  static Literal Fixed(std::vector<uint8_t> value);
 
   /// \brief Create a literal representing a null value.
   static Literal Null(std::shared_ptr<PrimitiveType> type) {
@@ -144,11 +145,76 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
  private:
   Literal(Value value, std::shared_ptr<PrimitiveType> type);
 
+  friend class Conversions;
   friend class LiteralCaster;
 
- private:
   Value value_;
   std::shared_ptr<PrimitiveType> type_;
 };
 
+/// \brief Compile-time mapping from a TypeId to a C++ value type.
+///
+/// The primary template maps every TypeId to `void`; the explicit
+/// specializations below override `ValueType` for individual type ids.
+/// NOTE(review): presumably `void` marks type ids that have no literal value
+/// representation yet -- confirm against the users of this trait.
+template <TypeId type_id>
+struct LiteralTraits {
+  using ValueType = void;
+};
+
+/// kBoolean maps to bool.
+template <>
+struct LiteralTraits<TypeId::kBoolean> {
+  using ValueType = bool;
+};
+
+/// kInt maps to int32_t.
+template <>
+struct LiteralTraits<TypeId::kInt> {
+  using ValueType = int32_t;
+};
+
+/// kDate maps to int32_t, the same value type as kInt.
+template <>
+struct LiteralTraits<TypeId::kDate> {
+  using ValueType = int32_t;
+};
+
+/// kLong maps to int64_t.
+template <>
+struct LiteralTraits<TypeId::kLong> {
+  using ValueType = int64_t;
+};
+
+/// kTime maps to int64_t, the same value type as kLong.
+template <>
+struct LiteralTraits<TypeId::kTime> {
+  using ValueType = int64_t;
+};
+
+/// kTimestamp maps to int64_t, the same value type as kLong.
+template <>
+struct LiteralTraits<TypeId::kTimestamp> {
+  using ValueType = int64_t;
+};
+
+/// kTimestampTz maps to int64_t, the same value type as kLong.
+template <>
+struct LiteralTraits<TypeId::kTimestampTz> {
+  using ValueType = int64_t;
+};
+
+/// kFloat maps to float.
+template <>
+struct LiteralTraits<TypeId::kFloat> {
+  using ValueType = float;
+};
+
+/// kDouble maps to double.
+template <>
+struct LiteralTraits<TypeId::kDouble> {
+  using ValueType = double;
+};
+
+/// kString maps to std::string.
+template <>
+struct LiteralTraits<TypeId::kString> {
+  using ValueType = std::string;
+};
+
+/// kBinary maps to a byte vector.
+template <>
+struct LiteralTraits<TypeId::kBinary> {
+  using ValueType = std::vector<uint8_t>;
+};
+
+/// kFixed maps to a byte vector, the same value type as kBinary.
+template <>
+struct LiteralTraits<TypeId::kFixed> {
+  using ValueType = std::vector<uint8_t>;
+};
+
 }  // namespace iceberg
diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc
index e9ddd47..bd7544b 100644
--- a/src/iceberg/test/literal_test.cc
+++ b/src/iceberg/test/literal_test.cc
@@ -81,7 +81,7 @@ TEST(LiteralTest, IntCastTo) {
   auto long_result = int_literal.CastTo(iceberg::int64());
   ASSERT_THAT(long_result, IsOk());
   EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong);
-  EXPECT_EQ(long_result->ToString(), "42");
+  EXPECT_EQ(std::get<int64_t>(long_result->value()), 42L);
 
   // Cast to Float
   auto float_result = int_literal.CastTo(iceberg::float32());
@@ -137,7 +137,6 @@ TEST(LiteralTest, LongCastTo) {
 }
 
 TEST(LiteralTest, LongCastToIntOverflow) {
-  // Test overflow cases
   auto max_long =
       Literal::Long(static_cast<int64_t>(std::numeric_limits<int32_t>::max()) 
+ 1);
   auto min_long =
@@ -383,4 +382,208 @@ TEST(LiteralTest, DoubleZeroComparison) {
   EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less);
 }
 
+// One serialization test vector used by the parameterized round-trip test.
+struct LiteralParam {
+  // Name reported for this case; returned by the name generator passed to
+  // INSTANTIATE_TEST_SUITE_P.
+  std::string test_name;
+  // Bytes handed to Literal::Deserialize and expected back from Serialize().
+  std::vector<uint8_t> serialized;
+  // Literal the bytes are expected to decode to.
+  Literal value;
+  // Type passed to Literal::Deserialize alongside the bytes.
+  std::shared_ptr<PrimitiveType> type;
+};
+
+// Fixture for value-parameterized tests over LiteralParam vectors.
+class LiteralSerDeParam : public ::testing::TestWithParam<LiteralParam> {};
+
+// Decodes the vector's bytes, re-encodes the result, and decodes once more,
+// checking each step against the expected literal and bytes.
+TEST_P(LiteralSerDeParam, RoundTrip) {
+  const LiteralParam& tc = GetParam();
+
+  // Step 1: bytes -> literal must succeed and equal the expected literal.
+  auto decoded = Literal::Deserialize(tc.serialized, tc.type);
+  ASSERT_TRUE(decoded.has_value())
+      << "Deserialization failed: " << decoded.error().message;
+  EXPECT_EQ(*decoded, tc.value);
+
+  // Step 2: literal -> bytes must reproduce the input bytes exactly.
+  auto encoded = decoded->Serialize();
+  ASSERT_TRUE(encoded.has_value())
+      << "Serialization failed: " << encoded.error().message;
+  EXPECT_EQ(*encoded, tc.serialized);
+
+  // Step 3: decoding the re-encoded bytes must again give the expected literal.
+  auto redecoded = Literal::Deserialize(*encoded, tc.type);
+  ASSERT_TRUE(redecoded.has_value())
+      << "Final deserialization failed: " << redecoded.error().message;
+  EXPECT_EQ(*redecoded, tc.value);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BinarySerialization, LiteralSerDeParam,
+    ::testing::Values(
+        // Basic types
+        LiteralParam{"BooleanTrue", {1}, Literal::Boolean(true), boolean()},
+        LiteralParam{"BooleanFalse", {0}, Literal::Boolean(false), boolean()},
+
+        LiteralParam{"Int", {32, 0, 0, 0}, Literal::Int(32), int32()},
+        LiteralParam{
+            "IntMaxValue", {255, 255, 255, 127}, Literal::Int(2147483647), 
int32()},
+        LiteralParam{"IntMinValue", {0, 0, 0, 128}, Literal::Int(-2147483648), 
int32()},
+        LiteralParam{"NegativeInt", {224, 255, 255, 255}, Literal::Int(-32), 
int32()},
+
+        LiteralParam{"Long", {32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), 
int64()},
+        LiteralParam{"LongMaxValue",
+                     {255, 255, 255, 255, 255, 255, 255, 127},
+                     Literal::Long(std::numeric_limits<int64_t>::max()),
+                     int64()},
+        LiteralParam{"LongMinValue",
+                     {0, 0, 0, 0, 0, 0, 0, 128},
+                     Literal::Long(std::numeric_limits<int64_t>::min()),
+                     int64()},
+        LiteralParam{"NegativeLong",
+                     {224, 255, 255, 255, 255, 255, 255, 255},
+                     Literal::Long(-32),
+                     int64()},
+
+        LiteralParam{"Float", {0, 0, 128, 63}, Literal::Float(1.0f), 
float32()},
+        LiteralParam{"FloatNegativeInfinity",
+                     {0, 0, 128, 255},
+                     Literal::Float(-std::numeric_limits<float>::infinity()),
+                     float32()},
+        LiteralParam{"FloatMaxValue",
+                     {255, 255, 127, 127},
+                     Literal::Float(std::numeric_limits<float>::max()),
+                     float32()},
+        LiteralParam{"FloatMinValue",
+                     {255, 255, 127, 255},
+                     Literal::Float(std::numeric_limits<float>::lowest()),
+                     float32()},
+
+        LiteralParam{
+            "Double", {0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), 
float64()},
+        LiteralParam{"DoubleNegativeInfinity",
+                     {0, 0, 0, 0, 0, 0, 240, 255},
+                     Literal::Double(-std::numeric_limits<double>::infinity()),
+                     float64()},
+        LiteralParam{"DoubleMaxValue",
+                     {255, 255, 255, 255, 255, 255, 239, 127},
+                     Literal::Double(std::numeric_limits<double>::max()),
+                     float64()},
+        LiteralParam{"DoubleMinValue",
+                     {255, 255, 255, 255, 255, 255, 239, 255},
+                     Literal::Double(std::numeric_limits<double>::lowest()),
+                     float64()},
+
+        LiteralParam{"String",
+                     {105, 99, 101, 98, 101, 114, 103},
+                     Literal::String("iceberg"),
+                     string()},
+        LiteralParam{"StringLong",
+                     {65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 
65, 65},
+                     Literal::String("AAAAAAAAAAAAAAAA"),
+                     string()},
+
+        LiteralParam{"BinaryData",
+                     {0x01, 0x02, 0x03, 0xFF},
+                     Literal::Binary({0x01, 0x02, 0x03, 0xFF}),
+                     binary()},
+        LiteralParam{"BinarySingleByte", {42}, Literal::Binary({42}), 
binary()},
+
+        // Fixed type
+        LiteralParam{"FixedLength4",
+                     {0x01, 0x02, 0x03, 0x04},
+                     Literal::Fixed({0x01, 0x02, 0x03, 0x04}),
+                     fixed(4)},
+        LiteralParam{"FixedLength8",
+                     {0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 0x11},
+                     Literal::Fixed({0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 
0x11}),
+                     fixed(8)},
+        LiteralParam{"FixedLength16",
+                     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 
0x09, 0x0A,
+                      0x0B, 0x0C, 0x0D, 0x0E, 0x0F},
+                     Literal::Fixed({0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
0x07, 0x08,
+                                     0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 
0x0F}),
+                     fixed(16)},
+        LiteralParam{"FixedSingleByte", {0xFF}, Literal::Fixed({0xFF}), 
fixed(1)},
+
+        // Temporal types
+        LiteralParam{"DateEpoch", {0, 0, 0, 0}, Literal::Date(0), date()},
+        LiteralParam{"DateNextDay", {1, 0, 0, 0}, Literal::Date(1), date()},
+        LiteralParam{"DateY2K", {205, 42, 0, 0}, Literal::Date(10957), date()},
+        LiteralParam{"DateNegative", {255, 255, 255, 255}, Literal::Date(-1), 
date()},
+
+        LiteralParam{"TimeMidnight", {0, 0, 0, 0, 0, 0, 0, 0}, 
Literal::Time(0), time()},
+        LiteralParam{"TimeNoon",
+                     {128, 9, 230, 124, 10, 0, 0, 0},
+                     Literal::Time(45045123456),
+                     time()},
+        LiteralParam{
+            "TimeOneSecond", {64, 66, 15, 0, 0, 0, 0, 0}, 
Literal::Time(1000000), time()},
+
+        LiteralParam{"TimestampEpoch",
+                     {0, 0, 0, 0, 0, 0, 0, 0},
+                     Literal::Timestamp(0),
+                     timestamp()},
+        LiteralParam{"TimestampOneSecond",
+                     {64, 66, 15, 0, 0, 0, 0, 0},
+                     Literal::Timestamp(1000000),
+                     timestamp()},
+        LiteralParam{"TimestampNoon2024",
+                     {128, 9, 230, 124, 10, 0, 0, 0},
+                     Literal::Timestamp(45045123456),
+                     timestamp()},
+
+        LiteralParam{"TimestampTzEpoch",
+                     {0, 0, 0, 0, 0, 0, 0, 0},
+                     Literal::TimestampTz(0),
+                     timestamp_tz()},
+        LiteralParam{"TimestampTzOneHour",
+                     {0, 164, 147, 214, 0, 0, 0, 0},
+                     Literal::TimestampTz(3600000000),
+                     timestamp_tz()},
+
+        // Empty values
+        LiteralParam{"EmptyString", {}, Literal::String(""), string()},
+        LiteralParam{"EmptyBinary", {}, Literal::Binary({}), binary()}),
+
+    [](const testing::TestParamInfo<LiteralSerDeParam::ParamType>& info) {
+      return info.param.test_name;
+    });
+
+// An empty string literal serializes to zero bytes, and zero bytes decode
+// back to an empty string.
+TEST(LiteralSerDeTest, EmptyString) {
+  auto literal = Literal::String("");
+  auto encoded = literal.Serialize();
+  ASSERT_TRUE(encoded.has_value());
+  EXPECT_TRUE(encoded->empty());
+
+  auto decoded = Literal::Deserialize(*encoded, string());
+  ASSERT_THAT(decoded, IsOk());
+  const auto& text = std::get<std::string>(decoded->value());
+  EXPECT_TRUE(text.empty());
+}
+
+// An empty binary literal serializes to zero bytes, and zero bytes decode
+// back to an empty byte vector.
+TEST(LiteralSerDeTest, EmptyBinary) {
+  auto literal = Literal::Binary({});
+  auto encoded = literal.Serialize();
+  ASSERT_TRUE(encoded.has_value());
+  EXPECT_TRUE(encoded->empty());
+
+  auto decoded = Literal::Deserialize(*encoded, binary());
+  ASSERT_THAT(decoded, IsOk());
+  const auto& payload = std::get<std::vector<uint8_t>>(decoded->value());
+  EXPECT_TRUE(payload.empty());
+}
+
+// Promotion on read: 4-byte encodings requested as the wider 8-byte type.
+TEST(LiteralSerDeTest, TypePromotion) {
+  // The 4-byte encoding of int 32, deserialized with a long target type.
+  const std::vector<uint8_t> int32_bytes = {32, 0, 0, 0};
+  auto as_long = Literal::Deserialize(int32_bytes, int64());
+  ASSERT_TRUE(as_long.has_value());
+  EXPECT_EQ(as_long->type()->type_id(), TypeId::kLong);
+  EXPECT_EQ(std::get<int64_t>(as_long->value()), 32L);
+
+  // The 4-byte encoding of float 1.0f, deserialized with a double target type.
+  const std::vector<uint8_t> float32_bytes = {0, 0, 128, 63};
+  auto as_double = Literal::Deserialize(float32_bytes, float64());
+  ASSERT_TRUE(as_double.has_value());
+  EXPECT_EQ(as_double->type()->type_id(), TypeId::kDouble);
+  EXPECT_DOUBLE_EQ(std::get<double>(as_double->value()), 1.0);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/test/manifest_list_reader_test.cc 
b/src/iceberg/test/manifest_list_reader_test.cc
index a3c08c3..9fd6e4c 100644
--- a/src/iceberg/test/manifest_list_reader_test.cc
+++ b/src/iceberg/test/manifest_list_reader_test.cc
@@ -23,6 +23,7 @@
 
 #include "iceberg/arrow/arrow_fs_file_io_internal.h"
 #include "iceberg/avro/avro_register.h"
+#include "iceberg/expression/literal.h"
 #include "iceberg/manifest_list.h"
 #include "iceberg/manifest_reader.h"
 #include "temp_file_test_base.h"
@@ -76,43 +77,38 @@ class ManifestListReaderV1Test : public 
ManifestListReaderTestBase {
     std::vector<int64_t> file_size = {6185, 6113};
     std::vector<int64_t> snapshot_id = {7532614258660258098, 
7532614258660258098};
 
-    std::vector<std::vector<std::uint8_t>> lower_bounds = {
-        {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
-        {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
-    std::vector<std::vector<std::uint8_t>> upper_bounds = {
-        {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
-        {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
-    return {{.manifest_path = paths[0],
-             .manifest_length = file_size[0],
-             .partition_spec_id = 0,
-             .added_snapshot_id = snapshot_id[0],
-             .added_files_count = 4,
-             .existing_files_count = 0,
-             .deleted_files_count = 0,
-             .added_rows_count = 6,
-             .existing_rows_count = 0,
-             .deleted_rows_count = 0,
-             .partitions = {{.contains_null = false,
-                             .contains_nan = false,
-                             .lower_bound = lower_bounds[0],
-                             .upper_bound = upper_bounds[0]}}},
-
-            {.manifest_path = paths[1],
-             .manifest_length = file_size[1],
-             .partition_spec_id = 0,
-             .added_snapshot_id = snapshot_id[1],
-             .added_files_count = 0,
-             .existing_files_count = 0,
-             .deleted_files_count = 2,
-             .added_rows_count = 0,
-             .existing_rows_count = 0,
-             .deleted_rows_count = 6,
-             .partitions = {{.contains_null = false,
-                             .contains_nan = false,
-                             .lower_bound = lower_bounds[1],
-                             .upper_bound = upper_bounds[1]}}}};
+    return {
+        {.manifest_path = paths[0],
+         .manifest_length = file_size[0],
+         .partition_spec_id = 0,
+         .added_snapshot_id = snapshot_id[0],
+         .added_files_count = 4,
+         .existing_files_count = 0,
+         .deleted_files_count = 0,
+         .added_rows_count = 6,
+         .existing_rows_count = 0,
+         .deleted_rows_count = 0,
+         .partitions = {{.contains_null = false,
+                         .contains_nan = false,
+                         .lower_bound = 
Literal::String("2022-02-22").Serialize().value(),
+                         .upper_bound =
+                             
Literal::String("2022-2-23").Serialize().value()}}},
+
+        {.manifest_path = paths[1],
+         .manifest_length = file_size[1],
+         .partition_spec_id = 0,
+         .added_snapshot_id = snapshot_id[1],
+         .added_files_count = 0,
+         .existing_files_count = 0,
+         .deleted_files_count = 2,
+         .added_rows_count = 0,
+         .existing_rows_count = 0,
+         .deleted_rows_count = 6,
+         .partitions = {
+             {.contains_null = false,
+              .contains_nan = false,
+              .lower_bound = Literal::String("2022-2-22").Serialize().value(),
+              .upper_bound = 
Literal::String("2022-2-23").Serialize().value()}}}};
   }
 
   std::vector<ManifestFile> PrepareComplexTypeTestData() {
diff --git a/src/iceberg/test/manifest_reader_test.cc 
b/src/iceberg/test/manifest_reader_test.cc
index db703c1..7381b29 100644
--- a/src/iceberg/test/manifest_reader_test.cc
+++ b/src/iceberg/test/manifest_reader_test.cc
@@ -94,24 +94,33 @@ class ManifestReaderV1Test : public ManifestReaderTestBase {
         "order_ts_hour=2021-01-26-00/"
         "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
     std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
+
+    // TODO(Li Feiyang): The Decimal type and its serialization logic are not yet fully
+    // implemented to support variable-length encoding as required by the Iceberg
+    // specification. Using Literal::Binary as a temporary substitute to represent the raw
+    // bytes for the decimal values.
     std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
-        {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {3, {0x12, 0xe2}},
-         {4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}},
-        {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {3, {0x12, 0xe3}},
-         {4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}},
-        {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {3, {0x0e, '"'}},
-         {4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}},
-        {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {3, {0x0e, '!'}},
-         {4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}},
+        {{1, Literal::Long(1234).Serialize().value()},
+         {2, Literal::Long(5678).Serialize().value()},
+         {3, Literal::Binary({0x12, 0xe2}).Serialize().value()},
+
+         {4, Literal::Timestamp(1611706223000000LL).Serialize().value()}},
+        {{1, Literal::Long(1234).Serialize().value()},
+         {2, Literal::Long(5678).Serialize().value()},
+         {3, Literal::Binary({0x12, 0xe3}).Serialize().value()},
+
+         {4, Literal::Timestamp(1706314223000000LL).Serialize().value()}},
+        {{1, Literal::Long(123).Serialize().value()},
+         {2, Literal::Long(456).Serialize().value()},
+         {3, Literal::Binary({0x0e, 0x22}).Serialize().value()},
+
+         {4, Literal::Timestamp(1674691823000000LL).Serialize().value()}},
+        {{1, Literal::Long(123).Serialize().value()},
+         {2, Literal::Long(456).Serialize().value()},
+         {3, Literal::Binary({0x0e, 0x21}).Serialize().value()},
+         {4, Literal::Timestamp(1611619823000000LL).Serialize().value()}},
     };
+
     for (int i = 0; i < 4; ++i) {
       ManifestEntry entry;
       entry.status = ManifestStatus::kAdded;
@@ -159,16 +168,16 @@ class ManifestReaderV2Test : public 
ManifestReaderTestBase {
     std::vector<int64_t> record_counts = {4};
 
     std::vector<std::map<int32_t, std::vector<uint8_t>>> lower_bounds = {
-        {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}},
-         {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', 
'1'}},
-         {4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}};
+        {{1, Literal::Long(1).Serialize().value()},
+         {2, Literal::String("record_four").Serialize().value()},
+         {3, Literal::String("data_content_1").Serialize().value()},
+         {4, Literal::Double(123.45).Serialize().value()}}};
 
     std::vector<std::map<int32_t, std::vector<uint8_t>>> upper_bounds = {
-        {{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
-         {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}},
-         {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', 
'4'}},
-         {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}};
+        {{1, Literal::Long(4).Serialize().value()},
+         {2, Literal::String("record_two").Serialize().value()},
+         {3, Literal::String("data_content_4").Serialize().value()},
+         {4, Literal::Double(456.78).Serialize().value()}}};
 
     DataFile data_file{.file_path = test_dir_prefix + paths[0],
                        .file_format = FileFormatType::kParquet,
diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc
index 7b0f094..ddb3285 100644
--- a/src/iceberg/type.cc
+++ b/src/iceberg/type.cc
@@ -22,6 +22,7 @@
 #include <format>
 #include <iterator>
 #include <memory>
+#include <utility>
 
 #include "iceberg/exception.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
@@ -386,4 +387,45 @@ std::shared_ptr<StructType> 
struct_(std::vector<SchemaField> fields) {
   return std::make_shared<StructType>(std::move(fields));
 }
 
+// Returns the lowercase name of `id` as a string literal view, e.g.
+// TypeId::kTimestampTz -> "timestamptz".
+//
+// The switch has no `default` label; every enumerator handled here returns
+// directly from its case.
+std::string_view ToString(TypeId id) {
+  switch (id) {
+    case TypeId::kStruct:
+      return "struct";
+    case TypeId::kList:
+      return "list";
+    case TypeId::kMap:
+      return "map";
+    case TypeId::kBoolean:
+      return "boolean";
+    case TypeId::kInt:
+      return "int";
+    case TypeId::kLong:
+      return "long";
+    case TypeId::kFloat:
+      return "float";
+    case TypeId::kDouble:
+      return "double";
+    case TypeId::kDecimal:
+      return "decimal";
+    case TypeId::kDate:
+      return "date";
+    case TypeId::kTime:
+      return "time";
+    case TypeId::kTimestamp:
+      return "timestamp";
+    case TypeId::kTimestampTz:
+      return "timestamptz";
+    case TypeId::kString:
+      return "string";
+    case TypeId::kUuid:
+      return "uuid";
+    case TypeId::kFixed:
+      return "fixed";
+    case TypeId::kBinary:
+      return "binary";
+  }
+
+  // Reached only when `id` matches none of the cases above; std::unreachable()
+  // makes that situation undefined behavior rather than returning a value.
+  std::unreachable();
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/type.h b/src/iceberg/type.h
index 01c911d..2565268 100644
--- a/src/iceberg/type.h
+++ b/src/iceberg/type.h
@@ -531,4 +531,13 @@ ICEBERG_EXPORT std::shared_ptr<MapType> map(SchemaField 
key, SchemaField value);
 
 /// @}
 
+/// \brief Get the lowercase string representation of a TypeId.
+///
+/// This returns the same lowercase string as used by Type::ToString() methods.
+/// For example: TypeId::kBoolean -> "boolean", TypeId::kInt -> "int", etc.
+///
+/// \param id The TypeId to convert to string
+/// \return A string_view containing the lowercase type name
+ICEBERG_EXPORT std::string_view ToString(TypeId id);
+
 }  // namespace iceberg
diff --git a/src/iceberg/util/conversions.cc b/src/iceberg/util/conversions.cc
new file mode 100644
index 0000000..c5dbcf3
--- /dev/null
+++ b/src/iceberg/util/conversions.cc
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/conversions.h"
+
+#include <cstring>
+#include <span>
+#include <string>
+
+#include "iceberg/util/endian.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+/// \brief Serialize a value as sizeof(T) bytes in little-endian order.
+///
+/// \param value The value to serialize.
+/// \return A vector holding the object representation of the converted value.
+template <EndianConvertible T>
+std::vector<uint8_t> WriteLittleEndian(T value) {
+  const T little_endian = ToLittleEndian(value);
+  const auto* first = reinterpret_cast<const uint8_t*>(&little_endian);
+  return std::vector<uint8_t>(first, first + sizeof(T));
+}
+
+/// \brief Read a little-endian encoded value of type T from the data.
+///
+/// \param data The serialized bytes; must be exactly sizeof(T) bytes long.
+/// \return The decoded value, or InvalidArgument if the size does not match.
+template <EndianConvertible T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+  // The size must match exactly: truncated and oversized buffers are both rejected.
+  if (data.size() != sizeof(T)) [[unlikely]] {
+    return InvalidArgument("Invalid data size: expected {} bytes, got {}", sizeof(T),
+                           data.size());
+  }
+
+  // memcpy avoids alignment assumptions about the incoming buffer.
+  T value;
+  std::memcpy(&value, data.data(), sizeof(T));
+  return FromLittleEndian(value);
+}
+
+/// \brief Serialize a value stored as LiteralTraits<type_id>::ValueType.
+///
+/// The primary template writes the value with WriteLittleEndian; boolean, string,
+/// binary and fixed are handled by the explicit specializations that follow.
+///
+/// \param value The variant expected to hold LiteralTraits<type_id>::ValueType.
+/// \return The serialized bytes, or InvalidArgument if the variant holds another type.
+template <TypeId type_id>
+Result<std::vector<uint8_t>> ToBytesImpl(const Literal::Value& value) {
+  using CppType = typename LiteralTraits<type_id>::ValueType;
+  // std::get would throw std::bad_variant_access on a mismatch; report the problem
+  // through the Result so callers see a regular error.
+  const auto* typed = std::get_if<CppType>(&value);
+  if (typed == nullptr) [[unlikely]] {
+    return InvalidArgument("Literal value does not hold the expected C++ type");
+  }
+  return WriteLittleEndian(*typed);
+}
+
+/// \brief Serialize a boolean as a single byte: 0x01 for true, 0x00 for false.
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kBoolean>(const Literal::Value& value) {
+  const auto* flag = std::get_if<bool>(&value);
+  if (flag == nullptr) [[unlikely]] {
+    return InvalidArgument("Expected a bool value when serializing boolean");
+  }
+  return std::vector<uint8_t>{*flag ? static_cast<uint8_t>(0x01)
+                                    : static_cast<uint8_t>(0x00)};
+}
+
+/// \brief Serialize a string as its raw bytes, without terminator or length prefix.
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kString>(const Literal::Value& value) {
+  const auto* str = std::get_if<std::string>(&value);
+  if (str == nullptr) [[unlikely]] {
+    return InvalidArgument("Expected a string value when serializing string");
+  }
+  return std::vector<uint8_t>(str->begin(), str->end());
+}
+
+/// \brief Serialize a binary value by copying its bytes unchanged.
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kBinary>(const Literal::Value& value) {
+  const auto* bytes = std::get_if<std::vector<uint8_t>>(&value);
+  if (bytes == nullptr) [[unlikely]] {
+    return InvalidArgument("Expected a byte vector when serializing binary");
+  }
+  return *bytes;
+}
+
+/// \brief Serialize a fixed value by copying its bytes unchanged.
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kFixed>(const Literal::Value& value) {
+  const auto* bytes = std::get_if<std::vector<uint8_t>>(&value);
+  if (bytes == nullptr) [[unlikely]] {
+    return InvalidArgument("Expected a byte vector when serializing fixed");
+  }
+  return *bytes;
+}
+
+// Serializes `value` by forwarding to the ToBytesImpl instantiation that matches
+// the TypeId of `type`; any other TypeId yields NotSupported.
+Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type,
+                                                  const Literal::Value& value) {
+  switch (type.type_id()) {
+    case TypeId::kInt:
+      return ToBytesImpl<TypeId::kInt>(value);
+    case TypeId::kDate:
+      return ToBytesImpl<TypeId::kDate>(value);
+    case TypeId::kLong:
+      return ToBytesImpl<TypeId::kLong>(value);
+    case TypeId::kTime:
+      return ToBytesImpl<TypeId::kTime>(value);
+    case TypeId::kTimestamp:
+      return ToBytesImpl<TypeId::kTimestamp>(value);
+    case TypeId::kTimestampTz:
+      return ToBytesImpl<TypeId::kTimestampTz>(value);
+    case TypeId::kFloat:
+      return ToBytesImpl<TypeId::kFloat>(value);
+    case TypeId::kDouble:
+      return ToBytesImpl<TypeId::kDouble>(value);
+    case TypeId::kBoolean:
+      return ToBytesImpl<TypeId::kBoolean>(value);
+    case TypeId::kString:
+      return ToBytesImpl<TypeId::kString>(value);
+    case TypeId::kBinary:
+      return ToBytesImpl<TypeId::kBinary>(value);
+    case TypeId::kFixed:
+      return ToBytesImpl<TypeId::kFixed>(value);
+      // TODO(Li Feiyang): Add support for UUID and Decimal
+
+    default:
+      return NotSupported("Serialization for type {} is not supported", type.ToString());
+  }
+}
+
+// Serializes a Literal by delegating to the (type, value) overload once the
+// literal is known to carry a concrete value.
+Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) {
+  // AboveMax, BelowMin and null literals are rejected before any bytes are produced.
+  if (literal.IsAboveMax()) {
+    return NotSupported("Cannot serialize AboveMax");
+  }
+  if (literal.IsBelowMin()) {
+    return NotSupported("Cannot serialize BelowMin");
+  }
+  if (literal.IsNull()) {
+    return NotSupported("Cannot serialize null");
+  }
+
+  const Literal::Value& payload = literal.value();
+  return ToBytes(*literal.type(), payload);
+}
+
+// Decodes `data` into the variant alternative that corresponds to `type`.
+// Numeric types are read as little-endian; string, binary and fixed copy the bytes.
+Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type,
+                                              std::span<const uint8_t> data) {
+  switch (type.type_id()) {
+    case TypeId::kBoolean: {
+      ICEBERG_ASSIGN_OR_RAISE(auto byte, ReadLittleEndian<uint8_t>(data));
+      // Any non-zero byte decodes to true.
+      return Literal::Value{byte != 0x00};
+    }
+    // Int and date are both decoded from a 4-byte little-endian integer.
+    case TypeId::kInt:
+    case TypeId::kDate: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+      return Literal::Value{value};
+    }
+    case TypeId::kLong:
+    case TypeId::kTime:
+    case TypeId::kTimestamp:
+    case TypeId::kTimestampTz: {
+      if (data.size() < sizeof(int64_t)) {
+        // Type was promoted from int to long
+        ICEBERG_ASSIGN_OR_RAISE(auto narrow, ReadLittleEndian<int32_t>(data));
+        return Literal::Value{static_cast<int64_t>(narrow)};
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto wide, ReadLittleEndian<int64_t>(data));
+      return Literal::Value{wide};
+    }
+    case TypeId::kFloat: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data));
+      return Literal::Value{value};
+    }
+    case TypeId::kDouble: {
+      if (data.size() < sizeof(double)) {
+        // Type was promoted from float to double
+        ICEBERG_ASSIGN_OR_RAISE(auto narrow, ReadLittleEndian<float>(data));
+        return Literal::Value{static_cast<double>(narrow)};
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto wide, ReadLittleEndian<double>(data));
+      return Literal::Value{wide};
+    }
+    case TypeId::kString: {
+      const auto* chars = reinterpret_cast<const char*>(data.data());
+      return Literal::Value{std::string(chars, data.size())};
+    }
+    case TypeId::kBinary:
+      return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
+    case TypeId::kFixed: {
+      // The payload must be exactly as long as the fixed type declares.
+      const auto& fixed_type = static_cast<const FixedType&>(type);
+      if (data.size() != fixed_type.length()) {
+        return InvalidArgument("Invalid data size for Fixed literal, got size: {}",
+                               data.size());
+      }
+      return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
+    }
+      // TODO(Li Feiyang): Add support for UUID and Decimal
+    default:
+      return NotSupported("Deserialization for type {} is not supported",
+                          type.ToString());
+  }
+}
+
+// Builds a Literal of the given type from serialized bytes. A null type is
+// rejected up front; decoding errors from the (type, data) overload propagate.
+Result<Literal> Conversions::FromBytes(std::shared_ptr<PrimitiveType> type,
+                                       std::span<const uint8_t> data) {
+  if (type == nullptr) {
+    return InvalidArgument("Type cannot be null");
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto decoded, FromBytes(*type, data));
+  return Literal(std::move(decoded), std::move(type));
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/util/conversions.h b/src/iceberg/util/conversions.h
new file mode 100644
index 0000000..fe383bc
--- /dev/null
+++ b/src/iceberg/util/conversions.h
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <span>
+#include <vector>
+
+#include "iceberg/expression/literal.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+/// \file iceberg/util/conversions.h
+/// \brief Conversion utilities for primitive types
+
+namespace iceberg {
+
+/// \brief Conversion utilities for primitive types
+/// \brief Conversion utilities for primitive types
+///
+/// Converts between literal values and their binary representation. UUID and
+/// Decimal are not supported yet; unsupported types yield NotSupported.
+class ICEBERG_EXPORT Conversions {
+ public:
+  /// \brief Serializes a raw literal value into a byte vector according to its type.
+  /// \param type The primitive type of the value.
+  /// \param value The std::variant holding the raw literal value to serialize.
+  /// \return A Result containing the serialized value, or NotSupported if the type
+  /// cannot be serialized.
+  static Result<std::vector<uint8_t>> ToBytes(const PrimitiveType& type,
+                                              const Literal::Value& value);
+
+  /// \brief Serializes a complete Literal object into a byte vector.
+  /// \param literal The Literal object to serialize.
+  /// \return A Result containing the serialized value, or NotSupported if the
+  /// literal is AboveMax, BelowMin or null.
+  static Result<std::vector<uint8_t>> ToBytes(const Literal& literal);
+
+  /// \brief Deserializes a span of bytes into a raw literal value based on the given
+  /// type.
+  ///
+  /// A 4-byte input is also accepted for long, time, timestamp and timestamptz (read
+  /// as int) and for double (read as float), to handle promoted types.
+  ///
+  /// \param type The target primitive type to interpret the bytes as.
+  /// \param data A std::span of bytes representing the serialized value.
+  /// \return A Result containing the deserialized value, InvalidArgument if the size
+  /// of the data does not match a fixed-width or Fixed type, or NotSupported if the
+  /// type cannot be deserialized.
+  static Result<Literal::Value> FromBytes(const PrimitiveType& type,
+                                          std::span<const uint8_t> data);
+
+  /// \brief Deserializes a span of bytes into a complete Literal object.
+  /// \param type A shared pointer to the target primitive type.
+  /// \param data A std::span of bytes representing the serialized value.
+  /// \return A Result containing the deserialized value, or InvalidArgument if the
+  /// type is null.
+  static Result<Literal> FromBytes(std::shared_ptr<PrimitiveType> type,
+                                   std::span<const uint8_t> data);
+};
+
+}  // namespace iceberg


Reply via email to