mapleFU commented on code in PR #185:
URL: https://github.com/apache/iceberg-cpp/pull/185#discussion_r2296093444


##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal& 
literal,
                       target_type->ToString());
 }
 
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal& 
literal) {
+  // Cannot serialize special values
+  if (literal.IsAboveMax()) {
+    return InvalidArgument("Cannot serialize AboveMax literal");
+  }
+  if (literal.IsBelowMin()) {
+    return InvalidArgument("Cannot serialize BelowMin literal");
+  }
+
+  std::vector<uint8_t> result;
+
+  // Null values serialize to empty buffer
+  if (literal.IsNull()) {
+    return result;
+  }
+
+  const auto& value = literal.value();
+  const auto type_id = literal.type()->type_id();
+
+  switch (type_id) {
+    case TypeId::kBoolean: {
+      // 0x00 for false, 0x01 for true
+      result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+      return result;
+    }
+
+    case TypeId::kInt: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kDate: {
+      // Stores days from 1970-01-01 in a 4-byte little-endian int
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kLong: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTime: {
+      // Stores microseconds from midnight in an 8-byte little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestamp: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte 
little-endian
+      // long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestampTz: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+      // little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kFloat: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<float>(value));
+      return result;
+    }
+
+    case TypeId::kDouble: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<double>(value));
+      return result;
+    }
+
+    case TypeId::kString: {
+      // UTF-8 bytes (without length)
+      const auto& str = std::get<std::string>(value);
+      result.insert(result.end(), str.begin(), str.end());
+      return result;
+    }
+
+    case TypeId::kBinary: {
+      // Binary value (without length)
+      const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+      result = binary_data;
+      return result;
+    }
+
+    case TypeId::kFixed: {
+      // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+      // std::vector<uint8_t>
+      if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+        const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+        result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+      } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+        result = std::get<std::vector<uint8_t>>(value);
+      } else {
+        return InvalidArgument("Invalid value type for Fixed literal");
+      }
+      return result;
+    }
+
+    case TypeId::kUuid: {
+      // 16-byte big-endian value
+      const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+      WriteBigEndian16(result, uuid_bytes);
+      return result;
+    }
+
+    default:
+      return NotImplemented("Serialization for type {} is not supported",
+                            literal.type()->ToString());
+  }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+                                             std::shared_ptr<PrimitiveType> 
type) {

Review Comment:
   why this type is not passed by reference?



##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
 
 #include "iceberg/expression/literal.h"
 
+#include <algorithm>
+#include <bit>
+#include <chrono>
 #include <cmath>
 #include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
 
 #include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
 
 namespace iceberg {
 
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+  static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially 
copyable");
+
+  if constexpr (std::endian::native == std::endian::little) {
+    const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+    buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+  } else {
+    if constexpr (sizeof(T) > 1) {
+      T le_value = std::byteswap(value);
+      const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+      buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+    } else {
+      // For single byte types, no byteswap needed
+      buffer.push_back(static_cast<uint8_t>(value));
+    }
+  }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+  static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially 
copyable");
+
+  if (data.size() < sizeof(T)) {
+    return InvalidArgument("Insufficient data to read {} bytes, got {}", 
sizeof(T),
+                           data.size());
+  }
+
+  T value;
+  std::memcpy(&value, data.data(), sizeof(T));
+
+  if constexpr (std::endian::native != std::endian::little && sizeof(T) > 1) {
+    value = std::byteswap(value);
+  }

Review Comment:
   Maybe we can add a `FromLittleEndian` and `ToLittleEndian` function here



##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal& 
literal,
                       target_type->ToString());
 }
 
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal& 
literal) {
+  // Cannot serialize special values
+  if (literal.IsAboveMax()) {
+    return InvalidArgument("Cannot serialize AboveMax literal");
+  }
+  if (literal.IsBelowMin()) {
+    return InvalidArgument("Cannot serialize BelowMin literal");
+  }
+
+  std::vector<uint8_t> result;
+
+  // Null values serialize to empty buffer
+  if (literal.IsNull()) {
+    return result;
+  }
+
+  const auto& value = literal.value();
+  const auto type_id = literal.type()->type_id();
+
+  switch (type_id) {
+    case TypeId::kBoolean: {
+      // 0x00 for false, 0x01 for true
+      result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+      return result;
+    }
+
+    case TypeId::kInt: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kDate: {
+      // Stores days from 1970-01-01 in a 4-byte little-endian int
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kLong: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTime: {
+      // Stores microseconds from midnight in an 8-byte little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestamp: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte 
little-endian
+      // long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestampTz: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+      // little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kFloat: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<float>(value));
+      return result;
+    }
+
+    case TypeId::kDouble: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<double>(value));
+      return result;
+    }
+
+    case TypeId::kString: {
+      // UTF-8 bytes (without length)
+      const auto& str = std::get<std::string>(value);
+      result.insert(result.end(), str.begin(), str.end());
+      return result;
+    }
+
+    case TypeId::kBinary: {
+      // Binary value (without length)
+      const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+      result = binary_data;
+      return result;
+    }
+
+    case TypeId::kFixed: {
+      // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+      // std::vector<uint8_t>
+      if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+        const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+        result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+      } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+        result = std::get<std::vector<uint8_t>>(value);
+      } else {
+        return InvalidArgument("Invalid value type for Fixed literal");
+      }
+      return result;
+    }
+
+    case TypeId::kUuid: {
+      // 16-byte big-endian value
+      const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+      WriteBigEndian16(result, uuid_bytes);
+      return result;
+    }
+
+    default:
+      return NotImplemented("Serialization for type {} is not supported",
+                            literal.type()->ToString());
+  }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+                                             std::shared_ptr<PrimitiveType> 
type) {
+  if (!type) {
+    return InvalidArgument("Type cannot be null");
+  }
+
+  // Empty data represents null value
+  if (data.empty()) {
+    return Literal::Null(type);
+  }
+
+  const auto type_id = type->type_id();
+

Review Comment:
   I found a weird thing here. This part of the code only checks the buffer 
size in few types, it assumes the buffer-size equals to the size upper layer 
passed in. Should we do some check here?



##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal& 
literal,
                       target_type->ToString());
 }
 
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal& 
literal) {
+  // Cannot serialize special values
+  if (literal.IsAboveMax()) {
+    return InvalidArgument("Cannot serialize AboveMax literal");
+  }
+  if (literal.IsBelowMin()) {
+    return InvalidArgument("Cannot serialize BelowMin literal");
+  }
+
+  std::vector<uint8_t> result;
+
+  // Null values serialize to empty buffer
+  if (literal.IsNull()) {
+    return result;
+  }
+
+  const auto& value = literal.value();
+  const auto type_id = literal.type()->type_id();
+
+  switch (type_id) {
+    case TypeId::kBoolean: {
+      // 0x00 for false, 0x01 for true
+      result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+      return result;
+    }
+
+    case TypeId::kInt: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kDate: {
+      // Stores days from 1970-01-01 in a 4-byte little-endian int
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kLong: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTime: {
+      // Stores microseconds from midnight in an 8-byte little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestamp: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte 
little-endian
+      // long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestampTz: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+      // little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kFloat: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<float>(value));
+      return result;
+    }
+
+    case TypeId::kDouble: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<double>(value));
+      return result;
+    }
+
+    case TypeId::kString: {
+      // UTF-8 bytes (without length)
+      const auto& str = std::get<std::string>(value);
+      result.insert(result.end(), str.begin(), str.end());
+      return result;
+    }
+
+    case TypeId::kBinary: {
+      // Binary value (without length)
+      const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+      result = binary_data;

Review Comment:
   The style here mixes assign and insert back, the code is right but I think 
keep a same style might be better ( .i.e using `result.insert` )



##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal& 
literal,
                       target_type->ToString());
 }
 
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal& 
literal) {
+  // Cannot serialize special values
+  if (literal.IsAboveMax()) {
+    return InvalidArgument("Cannot serialize AboveMax literal");
+  }
+  if (literal.IsBelowMin()) {
+    return InvalidArgument("Cannot serialize BelowMin literal");
+  }
+
+  std::vector<uint8_t> result;
+
+  // Null values serialize to empty buffer
+  if (literal.IsNull()) {
+    return result;
+  }
+
+  const auto& value = literal.value();
+  const auto type_id = literal.type()->type_id();
+
+  switch (type_id) {
+    case TypeId::kBoolean: {
+      // 0x00 for false, 0x01 for true
+      result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+      return result;
+    }
+
+    case TypeId::kInt: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kDate: {
+      // Stores days from 1970-01-01 in a 4-byte little-endian int
+      WriteLittleEndian(result, std::get<int32_t>(value));
+      return result;
+    }
+
+    case TypeId::kLong: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTime: {
+      // Stores microseconds from midnight in an 8-byte little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestamp: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte 
little-endian
+      // long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kTimestampTz: {
+      // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+      // little-endian long
+      WriteLittleEndian(result, std::get<int64_t>(value));
+      return result;
+    }
+
+    case TypeId::kFloat: {
+      // Stored as 4-byte little-endian
+      WriteLittleEndian(result, std::get<float>(value));
+      return result;
+    }
+
+    case TypeId::kDouble: {
+      // Stored as 8-byte little-endian
+      WriteLittleEndian(result, std::get<double>(value));
+      return result;
+    }
+
+    case TypeId::kString: {
+      // UTF-8 bytes (without length)
+      const auto& str = std::get<std::string>(value);
+      result.insert(result.end(), str.begin(), str.end());
+      return result;
+    }
+
+    case TypeId::kBinary: {
+      // Binary value (without length)
+      const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+      result = binary_data;
+      return result;
+    }
+
+    case TypeId::kFixed: {
+      // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+      // std::vector<uint8_t>
+      if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+        const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+        result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+      } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+        result = std::get<std::vector<uint8_t>>(value);
+      } else {
+        return InvalidArgument("Invalid value type for Fixed literal");

Review Comment:
   Maybe can print the literal here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to