wgtmac commented on code in PR #156:
URL: https://github.com/apache/iceberg-cpp/pull/156#discussion_r2250886955


##########
src/iceberg/transform_function.cc:
##########
@@ -51,6 +61,45 @@ Result<ArrowArray> BucketTransform::Transform(const 
ArrowArray& input) {
   return NotImplemented("BucketTransform::Transform");
 }
 
+Result<std::optional<Literal>> BucketTransform::Transform(const Literal& 
literal) {
+  assert(literal.type() == source_type());

Review Comment:
   Not related to this PR: we should introduce something like `ARROW_DCHECK` to 
add some explicit debug-level checks.



##########
src/iceberg/transform_function.cc:
##########
@@ -91,6 +140,46 @@ Result<ArrowArray> TruncateTransform::Transform(const 
ArrowArray& input) {
   return NotImplemented("TruncateTransform::Transform");
 }
 
+Result<std::optional<Literal>> TruncateTransform::Transform(const Literal& 
literal) {
+  assert(literal.type() == source_type());
+  if (literal.IsBelowMin() || literal.IsAboveMax()) {
+    return InvalidArgument(
+        "Cannot apply truncate transform to literal with value {} of type {}",
+        literal.ToString(), source_type()->ToString());
+  }
+
+  switch (source_type()->type_id()) {
+    case TypeId::kInt: {
+      auto value = std::get<int32_t>(literal.value());
+      return Literal::Int(value % width_);

Review Comment:
   This looks incorrect because it is `v - (v % W)` per the spec. We should be 
consistent with the Java impl:
   
   ```java
     public static int truncateInt(int width, int value) {
       return value - (((value % width) + width) % width);
     }
   
     public static long truncateLong(int width, long value) {
       return value - (((value % width) + width) % width);
     }
   ```



##########
test/transform_test.cc:
##########
@@ -193,4 +193,297 @@ TEST(TransformResultTypeTest, NegativeCases) {
   }
 }
 
+TEST(TransformFunctionTransformTest, IdentityTransform) {

Review Comment:
   `TransformFunctionTransformTest` -> `TransformLiteralTest` ?
   
   Let's make it short...



##########
test/transform_test.cc:
##########
@@ -193,4 +193,297 @@ TEST(TransformResultTypeTest, NegativeCases) {
   }
 }
 
+TEST(TransformFunctionTransformTest, IdentityTransform) {
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::boolean(),
+       .source = Literal::Boolean(true),
+       .expected = Literal::Boolean(true)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(42)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Date(30000)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Long(1234567890)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Timestamp(1622547800000000)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::TimestampTz(1622547800000000)},
+      {.source_type = iceberg::float32(),
+       .source = Literal::Float(3.14),
+       .expected = Literal::Float(3.14)},
+      {.source_type = iceberg::float64(),
+       .source = Literal::Double(1.23e-5),
+       .expected = Literal::Double(1.23e-5)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("Hello, World!"),
+       .expected = Literal::String("Hello, World!")},
+      {.source_type = iceberg::binary(),
+       .source = Literal::Binary({0x01, 0x02, 0x03}),
+       .expected = Literal::Binary({0x01, 0x02, 0x03})},
+  };
+
+  for (const auto& c : cases) {
+    auto transform = Transform::Identity();
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity 
transform";
+
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformFunctionTransformTest, BucketTransform) {
+  constexpr int32_t num_buckets = 4;
+  auto transform = Transform::Bucket(num_buckets);
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::date(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Int(2)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("test"),
+       .expected = Literal::Int(3)},
+  };
+
+  for (const auto& c : cases) {
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform";
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformFunctionTransformTest, TruncateTransform) {
+  constexpr int32_t width = 5;
+  auto transform = Transform::Truncate(width);
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(123456),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("Hello, World!"),
+       .expected = Literal::String("Hello")},
+      {.source_type = iceberg::binary(),
+       .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}),
+       .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})},
+  };
+
+  for (const auto& c : cases) {
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate 
transform";
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformFunctionTransformTest, YearTransform) {
+  auto transform = Transform::Year();
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),

Review Comment:
   nit: add a comment to print human readable timestamp values.



##########
src/iceberg/transform_function.cc:
##########
@@ -91,6 +140,46 @@ Result<ArrowArray> TruncateTransform::Transform(const 
ArrowArray& input) {
   return NotImplemented("TruncateTransform::Transform");
 }
 
+Result<std::optional<Literal>> TruncateTransform::Transform(const Literal& 
literal) {
+  assert(literal.type() == source_type());
+  if (literal.IsBelowMin() || literal.IsAboveMax()) {
+    return InvalidArgument(
+        "Cannot apply truncate transform to literal with value {} of type {}",
+        literal.ToString(), source_type()->ToString());
+  }
+
+  switch (source_type()->type_id()) {
+    case TypeId::kInt: {
+      auto value = std::get<int32_t>(literal.value());
+      return Literal::Int(value % width_);
+    }
+    case TypeId::kLong: {
+      auto value = std::get<int64_t>(literal.value());
+      return Literal::Long(value % width_);
+    }
+    case TypeId::kDecimal: {
+      // TODO(zhjwpku): Handle decimal truncation logic here
+      return NotImplemented("Truncate for Decimal is not implemented yet");
+    }
+    case TypeId::kString: {
+      auto value = std::get<std::string>(literal.value());
+      if (value.size() > static_cast<size_t>(width_)) {
+        value.resize(width_);

Review Comment:
   This is not correct. We need to handle UTF8 code point.



##########
src/iceberg/transform.h:
##########
@@ -172,6 +174,8 @@ class ICEBERG_EXPORT TransformFunction {
   TransformFunction(TransformType transform_type, std::shared_ptr<Type> 
source_type);
   /// \brief Transform an input array to a new array
   virtual Result<ArrowArray> Transform(const ArrowArray& data) = 0;

Review Comment:
   I have to admit that `Result<ArrowArray> Transform(const ArrowArray& data)` 
is a bad design. It was added at the time when we don't have `Literal` yet. We 
can safely remove it since it is no longer needed.



##########
test/transform_test.cc:
##########
@@ -193,4 +193,297 @@ TEST(TransformResultTypeTest, NegativeCases) {
   }
 }
 
+TEST(TransformFunctionTransformTest, IdentityTransform) {
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::boolean(),
+       .source = Literal::Boolean(true),
+       .expected = Literal::Boolean(true)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(42)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Date(30000)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Long(1234567890)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Timestamp(1622547800000000)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::TimestampTz(1622547800000000)},
+      {.source_type = iceberg::float32(),
+       .source = Literal::Float(3.14),
+       .expected = Literal::Float(3.14)},
+      {.source_type = iceberg::float64(),
+       .source = Literal::Double(1.23e-5),
+       .expected = Literal::Double(1.23e-5)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("Hello, World!"),
+       .expected = Literal::String("Hello, World!")},
+      {.source_type = iceberg::binary(),
+       .source = Literal::Binary({0x01, 0x02, 0x03}),
+       .expected = Literal::Binary({0x01, 0x02, 0x03})},
+  };
+
+  for (const auto& c : cases) {
+    auto transform = Transform::Identity();
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity 
transform";
+
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformFunctionTransformTest, BucketTransform) {
+  constexpr int32_t num_buckets = 4;
+  auto transform = Transform::Bucket(num_buckets);
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::date(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Int(2)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("test"),
+       .expected = Literal::Int(3)},
+  };
+
+  for (const auto& c : cases) {
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform";
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformFunctionTransformTest, TruncateTransform) {
+  constexpr int32_t width = 5;
+  auto transform = Transform::Truncate(width);
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(123456),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("Hello, World!"),

Review Comment:
   Please add some non-ASCII UTF8 strings.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to