This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 6e5976c feat: support decimal literal and refactor transform utilities (#238)
6e5976c is described below
commit 6e5976c7fbc630507e4aa3706e55382aaa4d5267
Author: Junwang Zhao <[email protected]>
AuthorDate: Sun Oct 19 12:56:53 2025 +0800
feat: support decimal literal and refactor transform utilities (#238)
---
src/iceberg/CMakeLists.txt | 3 +
src/iceberg/expression/literal.cc | 26 +-
src/iceberg/expression/literal.h | 17 +-
src/iceberg/manifest_adapter.cc | 5 +-
src/iceberg/meson.build | 3 +
src/iceberg/test/CMakeLists.txt | 2 +
src/iceberg/test/bucket_util_test.cc | 81 ++++
src/iceberg/test/decimal_test.cc | 98 +++++
src/iceberg/test/literal_test.cc | 54 +++
src/iceberg/test/meson.build | 2 +
src/iceberg/test/transform_test.cc | 767 +++++++++++++++++++--------------
src/iceberg/test/truncate_util_test.cc | 53 +++
src/iceberg/transform_function.cc | 231 +---------
src/iceberg/transform_function.h | 3 +
src/iceberg/type_fwd.h | 3 +
src/iceberg/util/bucket_util.cc | 147 +++++++
src/iceberg/util/bucket_util.h | 60 +++
src/iceberg/util/conversions.cc | 17 +-
src/iceberg/util/decimal.cc | 110 ++++-
src/iceberg/util/decimal.h | 13 +-
src/iceberg/util/temporal_util.cc | 239 ++++++++++
src/iceberg/util/temporal_util.h | 43 ++
src/iceberg/util/truncate_util.cc | 107 +++++
src/iceberg/util/truncate_util.h | 21 +-
24 files changed, 1548 insertions(+), 557 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index e370950..a13f095 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -59,11 +59,14 @@ set(ICEBERG_SOURCES
transform.cc
transform_function.cc
type.cc
+ util/bucket_util.cc
util/conversions.cc
util/decimal.cc
util/gzip_internal.cc
util/murmurhash3_internal.cc
+ util/temporal_util.cc
util/timepoint.cc
+ util/truncate_util.cc
util/uuid.cc
v1_metadata.cc
v2_metadata.cc
diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc
index 18a46c6..aea719c 100644
--- a/src/iceberg/expression/literal.cc
+++ b/src/iceberg/expression/literal.cc
@@ -24,9 +24,9 @@
#include <cstdint>
#include <string>
-#include "iceberg/type_fwd.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/conversions.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
@@ -188,11 +188,14 @@ Result<Literal> LiteralCaster::CastFromString(
const auto& str_val = std::get<std::string>(literal.value_);
switch (target_type->type_id()) {
+ case TypeId::kUuid: {
+ ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val));
+ return Literal::UUID(uuid);
+ }
case TypeId::kDate:
case TypeId::kTime:
case TypeId::kTimestamp:
case TypeId::kTimestampTz:
- case TypeId::kUuid:
return NotImplemented("Cast from String to {} is not implemented yet",
target_type->ToString());
default:
@@ -296,6 +299,10 @@ Literal Literal::Fixed(std::vector<uint8_t> value) {
return {Value{std::move(value)}, fixed(size)};
}
+Literal Literal::Decimal(int128_t value, int32_t precision, int32_t scale) {
+ return {Value{::iceberg::Decimal(value)}, decimal(precision, scale)};
+}
+
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
std::shared_ptr<PrimitiveType> type) {
return Conversions::FromBytes(std::move(type), data);
@@ -385,6 +392,15 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
return CompareFloat(this_val, other_val);
}
+ case TypeId::kDecimal: {
+ auto& this_val = std::get<::iceberg::Decimal>(value_);
+ auto& other_val = std::get<::iceberg::Decimal>(other.value_);
+ const auto& this_decimal_type = internal::checked_cast<DecimalType&>(*type_);
+ const auto& other_decimal_type = internal::checked_cast<DecimalType&>(*other.type_);
+ return ::iceberg::Decimal::Compare(this_val, other_val, this_decimal_type.scale(),
+ other_decimal_type.scale());
+ }
+
case TypeId::kString: {
auto& this_val = std::get<std::string>(value_);
auto& other_val = std::get<std::string>(other.value_);
@@ -440,6 +456,12 @@ std::string Literal::ToString() const {
case TypeId::kDouble: {
return std::to_string(std::get<double>(value_));
}
+ case TypeId::kDecimal: {
+ const auto& decimal_type = internal::checked_cast<DecimalType&>(*type_);
+ const auto& decimal = std::get<::iceberg::Decimal>(value_);
+ return decimal.ToString(decimal_type.scale())
+ .value_or("invalid literal of type decimal");
+ }
case TypeId::kString: {
return "\"" + std::get<std::string>(value_) + "\"";
}
diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h
index 70ff2d8..13ffafe 100644
--- a/src/iceberg/expression/literal.h
+++ b/src/iceberg/expression/literal.h
@@ -27,7 +27,9 @@
#include "iceberg/result.h"
#include "iceberg/type.h"
+#include "iceberg/util/decimal.h"
#include "iceberg/util/formattable.h"
+#include "iceberg/util/int128.h"
#include "iceberg/util/uuid.h"
namespace iceberg {
@@ -57,9 +59,9 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
float, // for float
double, // for double
std::string, // for string
- Uuid, // for uuid
- std::vector<uint8_t>, // for binary, fixed
- std::array<uint8_t, 16>, // for decimal
+ std::vector<uint8_t>, // for binary, fixed
+ ::iceberg::Decimal, // for decimal
+ Uuid, // for uuid
BelowMin, AboveMax>;
/// \brief Factory methods for primitive types
@@ -77,6 +79,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
static Literal Binary(std::vector<uint8_t> value);
static Literal Fixed(std::vector<uint8_t> value);
+ /// \brief Create a decimal literal.
+ /// \param value The unscaled 128-bit integer value.
+ static Literal Decimal(int128_t value, int32_t precision, int32_t scale);
+
/// \brief Create a literal representing a null value.
static Literal Null(std::shared_ptr<PrimitiveType> type) {
return {Value{std::monostate{}}, std::move(type)};
@@ -205,6 +211,11 @@ struct LiteralTraits<TypeId::kDouble> {
using ValueType = double;
};
+template <>
+struct LiteralTraits<TypeId::kDecimal> {
+ using ValueType = Decimal;
+};
+
template <>
struct LiteralTraits<TypeId::kString> {
using ValueType = std::string;
diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc
index bc0f834..c2ac30e 100644
--- a/src/iceberg/manifest_adapter.cc
+++ b/src/iceberg/manifest_adapter.cc
@@ -220,9 +220,12 @@ Status ManifestEntryAdapter::AppendPartitionValues(
break;
case TypeId::kDecimal:
ICEBERG_RETURN_UNEXPECTED(AppendField(
- child_array, std::get<std::array<uint8_t, 16>>(partition_value.value())));
+ child_array, std::get<Decimal>(partition_value.value()).ToBytes()));
break;
case TypeId::kUuid:
+ ICEBERG_RETURN_UNEXPECTED(
+ AppendField(child_array, std::get<Uuid>(partition_value.value()).bytes()));
+ break;
case TypeId::kStruct:
case TypeId::kList:
case TypeId::kMap:
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 25bfdc6..1b24f85 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -81,11 +81,14 @@ iceberg_sources = files(
'transform.cc',
'transform_function.cc',
'type.cc',
+ 'util/bucket_util.cc',
'util/conversions.cc',
'util/decimal.cc',
'util/gzip_internal.cc',
'util/murmurhash3_internal.cc',
+ 'util/temporal_util.cc',
'util/timepoint.cc',
+ 'util/truncate_util.cc',
'util/uuid.cc',
'v1_metadata.cc',
'v2_metadata.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 7c62a2a..68af62b 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -99,11 +99,13 @@ add_iceberg_test(json_serde_test
add_iceberg_test(util_test
SOURCES
+ bucket_util_test.cc
config_test.cc
decimal_test.cc
endian_test.cc
formatter_test.cc
string_util_test.cc
+ truncate_util_test.cc
uuid_test.cc
visit_type_test.cc)
diff --git a/src/iceberg/test/bucket_util_test.cc b/src/iceberg/test/bucket_util_test.cc
new file mode 100644
index 0000000..69a04ef
--- /dev/null
+++ b/src/iceberg/test/bucket_util_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/bucket_util.h"
+
+#include <chrono>
+
+#include <gtest/gtest.h>
+
+#include "iceberg/util/decimal.h"
+#include "iceberg/util/uuid.h"
+
+namespace iceberg {
+
+// The following tests are from
+// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
+TEST(BucketUtilsTest, HashHelper) {
+ // int and long
+ EXPECT_EQ(BucketUtils::HashInt(34), 2017239379);
+ EXPECT_EQ(BucketUtils::HashLong(34L), 2017239379);
+
+ // decimal hash
+ auto decimal = Decimal::FromString("14.20");
+ ASSERT_TRUE(decimal.has_value());
+ EXPECT_EQ(BucketUtils::HashBytes(decimal->ToBigEndian()), -500754589);
+
+ // date hash
+ std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16;
+ std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1};
+ int32_t days = (sd - epoch).count();
+ EXPECT_EQ(BucketUtils::HashInt(days), -653330422);
+
+ // time
+ // 22:31:08 in microseconds
+ int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL;
+ EXPECT_EQ(BucketUtils::HashLong(time_micros), -662762989);
+
+ // timestamp
+ // 2017-11-16T22:31:08 in microseconds
+ std::chrono::system_clock::time_point tp =
+ std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} +
+ std::chrono::minutes{31} + std::chrono::seconds{8};
+ int64_t timestamp_micros =
+ std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch())
+ .count();
+ EXPECT_EQ(BucketUtils::HashLong(timestamp_micros), -2047944441);
+ // 2017-11-16T22:31:08.000001 in microseconds
+ EXPECT_EQ(BucketUtils::HashLong(timestamp_micros + 1), -1207196810);
+
+ // string
+ std::string str = "iceberg";
+ EXPECT_EQ(BucketUtils::HashBytes(std::span<const uint8_t>(
+ reinterpret_cast<const uint8_t*>(str.data()), str.size())),
+ 1210000089);
+
+ // uuid
+ auto uuid = Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7");
+ EXPECT_EQ(BucketUtils::HashBytes(uuid->bytes()), 1488055340);
+
+ // fixed & binary
+ std::vector<uint8_t> fixed = {0, 1, 2, 3};
+ EXPECT_EQ(BucketUtils::HashBytes(fixed), -188683207);
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/test/decimal_test.cc b/src/iceberg/test/decimal_test.cc
index 6850d7a..71ba674 100644
--- a/src/iceberg/test/decimal_test.cc
+++ b/src/iceberg/test/decimal_test.cc
@@ -490,6 +490,50 @@ TEST(DecimalTest, FromBigEndianInvalid) {
IsError(ErrorKind::kInvalidArgument));
}
+TEST(DecimalTest, ToBigEndian) {
+ std::vector<int64_t> high_values = {0,
+ 1,
+ -1,
+ INT32_MAX,
+ INT32_MIN,
+ static_cast<int64_t>(INT32_MAX) + 1,
+ static_cast<int64_t>(INT32_MIN) - 1,
+ INT64_MAX,
+ INT64_MIN};
+ std::vector<uint64_t> low_values = {0,
+ 1,
+ 255,
+ UINT32_MAX,
+ static_cast<uint64_t>(UINT32_MAX) + 1,
+ static_cast<uint64_t>(UINT32_MAX) + 2,
+ static_cast<uint64_t>(UINT32_MAX) + 3,
+ static_cast<uint64_t>(UINT32_MAX) + 4,
+ static_cast<uint64_t>(UINT32_MAX) + 5,
+ static_cast<uint64_t>(UINT32_MAX) + 6,
+ static_cast<uint64_t>(UINT32_MAX) + 7,
+ static_cast<uint64_t>(UINT32_MAX) + 8,
+ UINT64_MAX};
+
+ for (int64_t high : high_values) {
+ for (uint64_t low : low_values) {
+ Decimal decimal(high, low);
+ auto bytes = decimal.ToBigEndian();
+ auto result = Decimal::FromBigEndian(bytes.data(), bytes.size());
+ ASSERT_THAT(result, IsOk());
+ EXPECT_EQ(result.value(), decimal);
+ }
+ }
+
+ for (int128_t value : std::vector<int128_t>{-INT64_MAX, -INT32_MAX, -255, -1, 0, 1, 255,
+ 256, INT32_MAX, INT64_MAX}) {
+ Decimal decimal(value);
+ auto bytes = decimal.ToBigEndian();
+ auto result = Decimal::FromBigEndian(bytes.data(), bytes.size());
+ ASSERT_THAT(result, IsOk());
+ EXPECT_EQ(result.value(), decimal);
+ }
+}
+
TEST(DecimalTestFunctionality, Multiply) {
ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201));
ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201));
@@ -671,4 +715,58 @@ TEST(DecimalTest, Rescale) {
ASSERT_THAT(Decimal(5555555).Rescale(6, 1), IsError(ErrorKind::kInvalid));
}
+TEST(DecimalTest, Compare) {
+ // max positive unscaled value
+ // 10^38 - 1 scale cause overflow
+ ASSERT_EQ(Decimal::Compare(Decimal("99999999999999999999999999999999999999"),
+ Decimal("99999999999999999999999999999999999999"), 2, 3),
+ std::partial_ordering::greater);
+ // 10^37 - 1 scale no overflow
+ ASSERT_EQ(Decimal::Compare(Decimal("9999999999999999999999999999999999999"),
+ Decimal("99999999999999999999999999999999999999"), 2, 3),
+ std::partial_ordering::less);
+
+ // min negative unscaled value
+ // -10^38 + 1 scale cause overflow
+ ASSERT_EQ(Decimal::Compare(Decimal("-99999999999999999999999999999999999999"),
+ Decimal("-99999999999999999999999999999999999999"), 2, 3),
+ std::partial_ordering::less);
+ // -10^37 + 1 scale no overflow
+ ASSERT_EQ(Decimal::Compare(Decimal("-9999999999999999999999999999999999999"),
+ Decimal("-99999999999999999999999999999999999999"), 2, 3),
+ std::partial_ordering::greater);
+
+ // equal values with different scales
+ ASSERT_EQ(Decimal::Compare(Decimal("123456789"), Decimal("1234567890"), 2, 3),
+ std::partial_ordering::equivalent);
+ ASSERT_EQ(Decimal::Compare(Decimal("-1234567890"), Decimal("-123456789"), 3, 2),
+ std::partial_ordering::equivalent);
+
+ // different values with different scales
+ ASSERT_EQ(Decimal::Compare(Decimal("123456788"), Decimal("1234567890"), 2, 3),
+ std::partial_ordering::less);
+ ASSERT_EQ(Decimal::Compare(Decimal("-1234567890"), Decimal("-123456788"), 2, 3),
+ std::partial_ordering::less);
+
+ // different values with same scales
+ ASSERT_EQ(Decimal::Compare(Decimal("123456790"), Decimal("123456789"), 2, 2),
+ std::partial_ordering::greater);
+ ASSERT_EQ(Decimal::Compare(Decimal("-123456790"), Decimal("-123456789"), 2, 2),
+ std::partial_ordering::less);
+
+ // different signs
+ ASSERT_EQ(Decimal::Compare(Decimal("123456789"), Decimal("-123456789"), 2, 3),
+ std::partial_ordering::greater);
+ ASSERT_EQ(Decimal::Compare(Decimal("-123456789"), Decimal("123456789"), 2, 3),
+ std::partial_ordering::less);
+
+ // zero comparisons
+ ASSERT_EQ(Decimal::Compare(Decimal("0"), Decimal("0"), 2, 3),
+ std::partial_ordering::equivalent);
+ ASSERT_EQ(Decimal::Compare(Decimal("0"), Decimal("123456789"), 2, 3),
+ std::partial_ordering::less);
+ ASSERT_EQ(Decimal::Compare(Decimal("-123456789"), Decimal("0"), 2, 3),
+ std::partial_ordering::less);
+}
+
} // namespace iceberg
diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc
index 6e4b2aa..0dd291d 100644
--- a/src/iceberg/test/literal_test.cc
+++ b/src/iceberg/test/literal_test.cc
@@ -256,6 +256,20 @@ TEST(LiteralTest, DoubleZeroComparison) {
EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less);
}
+TEST(LiteralTest, UuidComparison) {
+ auto uuid1 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value();
+ auto uuid2 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174001").value();
+ auto uuid3 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value();
+
+ auto literal1 = Literal::UUID(uuid1);
+ auto literal2 = Literal::UUID(uuid2);
+ auto literal3 = Literal::UUID(uuid3);
+
+ EXPECT_EQ(literal1 <=> literal3, std::partial_ordering::equivalent);
+ EXPECT_EQ(literal1 <=> literal2, std::partial_ordering::unordered);
+ EXPECT_EQ(literal2 <=> literal1, std::partial_ordering::unordered);
+}
+
// Parameter struct for literal serialization and deserialization tests
struct LiteralParam {
std::string test_name;
@@ -346,6 +360,17 @@ INSTANTIATE_TEST_SUITE_P(
Literal::Double(std::numeric_limits<double>::lowest()),
float64()},
+ // Decimal type
+ LiteralParam{"DecimalPositive",
+ {1, 226, 64},
+ Literal::Decimal(123456, 6, 2),
+ decimal(6, 2)},
+ LiteralParam{"DecimalNegative",
+ {254, 29, 192},
+ Literal::Decimal(-123456, 6, 2),
+ decimal(6, 2)},
+ LiteralParam{"DecimalZero", {0}, Literal::Decimal(0, 3, 0), decimal(3, 0)},
+
LiteralParam{"String",
{105, 99, 101, 98, 101, 114, 103},
Literal::String("iceberg"),
@@ -506,10 +531,28 @@ INSTANTIATE_TEST_SUITE_P(
.literal = Literal::Double(std::numbers::pi),
.expected_type_id = TypeId::kDouble,
.expected_string = "3.141593"},
+ BasicLiteralTestParam{.test_name = "DecimalPositive",
+ .literal = Literal::Decimal(123456, 6, 2),
+ .expected_type_id = TypeId::kDecimal,
+ .expected_string = "1234.56"},
+ BasicLiteralTestParam{.test_name = "DecimalNegative",
+ .literal = Literal::Decimal(-123456, 6, 2),
+ .expected_type_id = TypeId::kDecimal,
+ .expected_string = "-1234.56"},
+ BasicLiteralTestParam{.test_name = "DecimalZero",
+ .literal = Literal::Decimal(0, 3, 0),
+ .expected_type_id = TypeId::kDecimal,
+ .expected_string = "0"},
BasicLiteralTestParam{.test_name = "String",
.literal = Literal::String("hello world"),
.expected_type_id = TypeId::kString,
.expected_string = "\"hello world\""},
+ BasicLiteralTestParam{
+ .test_name = "Uuid",
+ .literal = Literal::UUID(
+ Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()),
+ .expected_type_id = TypeId::kUuid,
+ .expected_string = "123e4567-e89b-12d3-a456-426614174000"},
BasicLiteralTestParam{
.test_name = "Binary",
.literal = Literal::Binary(std::vector<uint8_t>{0x01, 0x02, 0x03, 0xFF}),
@@ -563,6 +606,10 @@ INSTANTIATE_TEST_SUITE_P(
.small_literal = Literal::Double(1.5),
.large_literal = Literal::Double(2.5),
.equal_literal = Literal::Double(1.5)},
+ ComparisonLiteralTestParam{.test_name = "Decimal",
+ .small_literal = Literal::Decimal(123456, 6, 2),
+ .large_literal = Literal::Decimal(234567, 6, 2),
+ .equal_literal = Literal::Decimal(123456, 6, 2)},
ComparisonLiteralTestParam{.test_name = "String",
.small_literal = Literal::String("apple"),
.large_literal = Literal::String("banana"),
@@ -672,6 +719,13 @@ INSTANTIATE_TEST_SUITE_P(
.target_type = fixed(4),
.expected_literal =
Literal::Fixed(std::vector<uint8_t>{
0x01, 0x02, 0x03, 0x04})},
+ // String cast tests
+ CastLiteralTestParam{
+ .test_name = "StringToUuid",
+ .source_literal = Literal::String("123e4567-e89b-12d3-a456-426614174000"),
+ .target_type = uuid(),
+ .expected_literal = Literal::UUID(
+ Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())},
// Same type cast test
CastLiteralTestParam{.test_name = "IntToInt",
.source_literal = Literal::Int(42),
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index dd3bd05..88b1632 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -68,11 +68,13 @@ iceberg_tests = {
},
'util_test': {
'sources': files(
+ 'bucket_util_test.cc',
'config_test.cc',
'decimal_test.cc',
'endian_test.cc',
'formatter_test.cc',
'string_util_test.cc',
+ 'truncate_util_test.cc',
'uuid_test.cc',
'visit_type_test.cc',
),
diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc
index c1efcb5..1003b95 100644
--- a/src/iceberg/test/transform_test.cc
+++ b/src/iceberg/test/transform_test.cc
@@ -21,11 +21,13 @@
#include <format>
#include <memory>
+#include <string>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "iceberg/expression/literal.h"
+#include "iceberg/transform_function.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "matchers.h"
@@ -63,6 +65,7 @@ TEST(TransformFunctionTest, CreateTruncateTransform) {
auto transformPtr = transform->Bind(iceberg::string());
EXPECT_EQ(transformPtr.value()->transform_type(), TransformType::kTruncate);
}
+
TEST(TransformFromStringTest, PositiveCases) {
struct Case {
std::string str;
@@ -187,373 +190,485 @@ TEST(TransformResultTypeTest, NegativeCases) {
}
}
-TEST(TransformLiteralTest, IdentityTransform) {
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
+// Parameterized tests for transform functions
+struct TransformParam {
+ std::string str;
+ // The integer parameter associated with the transform.
+ int32_t param;
+ std::shared_ptr<Type> source_type;
+ Literal source;
+ Literal expected;
+};
- const std::vector<Case> cases = {
- {.source_type = iceberg::boolean(),
- .source = Literal::Boolean(true),
- .expected = Literal::Boolean(true)},
- {.source_type = iceberg::int32(),
- .source = Literal::Int(42),
- .expected = Literal::Int(42)},
- {.source_type = iceberg::int32(),
- .source = Literal::Date(30000),
- .expected = Literal::Date(30000)},
- {.source_type = iceberg::int64(),
- .source = Literal::Long(1234567890),
- .expected = Literal::Long(1234567890)},
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Timestamp(1622547800000000)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::TimestampTz(1622547800000000)},
- {.source_type = iceberg::float32(),
- .source = Literal::Float(3.14),
- .expected = Literal::Float(3.14)},
- {.source_type = iceberg::float64(),
- .source = Literal::Double(1.23e-5),
- .expected = Literal::Double(1.23e-5)},
- {.source_type = iceberg::string(),
- .source = Literal::String("Hello, World!"),
- .expected = Literal::String("Hello, World!")},
- {.source_type = iceberg::binary(),
- .source = Literal::Binary({0x01, 0x02, 0x03}),
- .expected = Literal::Binary({0x01, 0x02, 0x03})},
- };
+class TransformLiteralTest : public ::testing::TestWithParam<TransformParam> {};
- for (const auto& c : cases) {
- auto transform = Transform::Identity();
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform";
+TEST_P(TransformLiteralTest, IdentityTransform) {
+ const auto& param = GetParam();
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
+ auto transform = Transform::Identity();
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform";
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
+
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, BucketTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ IdentityTransformTests, TransformLiteralTest,
+ ::testing::Values(
+ TransformParam{.str = "BooleanTrue",
+ .source_type = iceberg::boolean(),
+ .source = Literal::Boolean(true),
+ .expected = Literal::Boolean(true)},
+ TransformParam{.str = "BooleanFalse",
+ .source_type = iceberg::boolean(),
+ .source = Literal::Boolean(false),
+ .expected = Literal::Boolean(false)},
+ TransformParam{.str = "Int32",
+ .source_type = iceberg::int32(),
+ .source = Literal::Int(42),
+ .expected = Literal::Int(42)},
+ TransformParam{.str = "Date",
+ .source_type = iceberg::int32(),
+ .source = Literal::Date(30000),
+ .expected = Literal::Date(30000)},
+ TransformParam{.str = "Int64",
+ .source_type = iceberg::int64(),
+ .source = Literal::Long(1234567890),
+ .expected = Literal::Long(1234567890)},
+ TransformParam{.str = "Timestamp",
+ .source_type = iceberg::timestamp(),
+ .source = Literal::Timestamp(1622547800000000),
+ .expected = Literal::Timestamp(1622547800000000)},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source = Literal::TimestampTz(1622547800000000),
+ .expected = Literal::TimestampTz(1622547800000000)},
+ TransformParam{.str = "Float",
+ .source_type = iceberg::float32(),
+ .source = Literal::Float(3.14),
+ .expected = Literal::Float(3.14)},
+ TransformParam{.str = "Double",
+ .source_type = iceberg::float64(),
+ .source = Literal::Double(1.23e-5),
+ .expected = Literal::Double(1.23e-5)},
+ TransformParam{.str = "Decimal",
+ .source_type = iceberg::decimal(10, 2),
+ .source = Literal::Decimal(123456, 10, 2),
+ .expected = Literal::Decimal(123456, 10, 2)},
+ TransformParam{.str = "String",
+ .source_type = iceberg::string(),
+ .source = Literal::String("Hello, World!"),
+ .expected = Literal::String("Hello, World!")},
+ TransformParam{
+ .str = "Uuid",
+ .source_type = iceberg::uuid(),
+ .source = Literal::UUID(
+ Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()),
+ .expected = Literal::UUID(
+ Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())},
+ TransformParam{.str = "Binary",
+ .source_type = iceberg::binary(),
+ .source = Literal::Binary({0x01, 0x02, 0x03}),
+ .expected = Literal::Binary({0x01, 0x02, 0x03})},
+ TransformParam{.str = "Fixed",
+ .source_type = iceberg::fixed(3),
+ .source = Literal::Fixed({0x01, 0x02, 0x03}),
+ .expected = Literal::Fixed({0x01, 0x02, 0x03})}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return info.param.str; });
+
+class BucketTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(BucketTransformTest, BucketTransform) {
constexpr int32_t num_buckets = 4;
auto transform = Transform::Bucket(num_buckets);
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
+ const auto& param = GetParam();
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform";
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
- const std::vector<Case> cases = {
- {.source_type = iceberg::int32(),
- .source = Literal::Int(42),
- .expected = Literal::Int(3)},
- {.source_type = iceberg::date(),
- .source = Literal::Date(30000),
- .expected = Literal::Int(2)},
- {.source_type = iceberg::int64(),
- .source = Literal::Long(1234567890),
- .expected = Literal::Int(3)},
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Int(1)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::Int(1)},
- {.source_type = iceberg::string(),
- .source = Literal::String("test"),
- .expected = Literal::Int(3)},
- };
-
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
-
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, TruncateTransform) {
- struct Case {
- std::shared_ptr<Type> source_type;
- int32_t width;
- Literal source;
- Literal expected;
- };
-
- const std::vector<Case> cases = {
- {.source_type = iceberg::int32(),
- .width = 5,
- .source = Literal::Int(123456),
- .expected = Literal::Int(123455)},
- {.source_type = iceberg::string(),
- .width = 5,
- .source = Literal::String("Hello, World!"),
- .expected = Literal::String("Hello")},
- {.source_type = iceberg::string(),
- .width = 5,
- .source = Literal::String("😜🧐🤔🤪🥳😵💫😂"),
- // Truncate to 5 utf-8 code points
- .expected = Literal::String("😜🧐🤔🤪🥳")},
- {.source_type = iceberg::string(),
- .width = 8,
- .source = Literal::String("a😜b🧐c🤔d🤪e🥳"),
- .expected = Literal::String("a😜b🧐c🤔d🤪")},
- {.source_type = iceberg::binary(),
- .width = 5,
- .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}),
- .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})},
- };
-
- for (const auto& c : cases) {
- auto transform = Transform::Truncate(c.width);
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
-
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+INSTANTIATE_TEST_SUITE_P(
+ BucketTransformTests, BucketTransformTest,
+ ::testing::Values(
+ TransformParam{.str = "Int32",
+ .source_type = iceberg::int32(),
+ .source = Literal::Int(34),
+ .expected = Literal::Int(3)},
+ TransformParam{.str = "Int64",
+ .source_type = iceberg::int64(),
+ .source = Literal::Long(34),
+ .expected = Literal::Int(3)},
+ TransformParam{.str = "Decimal",
+ // 14.20
+ .source_type = iceberg::decimal(4, 2),
+ .source = Literal::Decimal(1420, 4, 2),
+ .expected = Literal::Int(3)},
+ TransformParam{.str = "Date",
+ // 2017-11-16
+ .source_type = iceberg::date(),
+ .source = Literal::Date(17486),
+ .expected = Literal::Int(2)},
+ TransformParam{.str = "Time",
+ // 22:31:08 in microseconds
+ .source_type = iceberg::time(),
+ .source = Literal::Time(81068000000),
+ .expected = Literal::Int(3)},
+ TransformParam{.str = "Timestamp",
+ // 2017-11-16T22:31:08 in microseconds
+ .source_type = iceberg::timestamp(),
+ .source = Literal::Timestamp(1510871468000000),
+ .expected = Literal::Int(3)},
+ TransformParam{.str = "TimestampTz",
+ // 2017-11-16T22:31:08.000001 in microseconds
+ .source_type = iceberg::timestamp_tz(),
+ .source = Literal::TimestampTz(1510871468000001),
+ .expected = Literal::Int(2)},
+ TransformParam{.str = "String",
+ .source_type = iceberg::string(),
+ .source = Literal::String("iceberg"),
+ .expected = Literal::Int(1)},
+ TransformParam{
+ .str = "Uuid",
+ .source_type = iceberg::uuid(),
+ .source = Literal::UUID(
+ Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7").value()),
+ .expected = Literal::Int(0)},
+ TransformParam{.str = "Fixed",
+ .source_type = iceberg::fixed(4),
+ .source = Literal::Fixed({0, 1, 2, 3}),
+ .expected = Literal::Int(1)},
+ TransformParam{.str = "Binary",
+ .source_type = iceberg::binary(),
+ .source = Literal::Binary({0, 1, 2, 3}),
+ .expected = Literal::Int(1)}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return info.param.str; });
+
+class TruncateTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(TruncateTransformTest, TruncateTransform) {
+ const auto& param = GetParam();
+ auto transform = Transform::Truncate(param.param);
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform";
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
+
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, YearTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ TruncateTransformTests, TruncateTransformTest,
+ ::testing::Values(
+ TransformParam{.str = "Int32",
+ .param = 5,
+ .source_type = iceberg::int32(),
+ .source = Literal::Int(123456),
+ .expected = Literal::Int(123455)},
+ TransformParam{.str = "Int64",
+ .param = 10,
+ .source_type = iceberg::int64(),
+ .source = Literal::Long(-1),
+ .expected = Literal::Long(-10)},
+ TransformParam{.str = "Decimal",
+ .param = 50,
+ .source_type = iceberg::decimal(5, 2),
+ .source = Literal::Decimal(12345, 5, 2),
+ .expected = Literal::Decimal(12300, 5, 2)},
+ TransformParam{.str = "StringShort",
+ .param = 5,
+ .source_type = iceberg::string(),
+ .source = Literal::String("Hello, World!"),
+ .expected = Literal::String("Hello")},
+ TransformParam{.str = "StringEmoji",
+ .param = 5,
+ .source_type = iceberg::string(),
+ .source = Literal::String("😜🧐🤔🤪🥳😵💫😂"),
+ .expected = Literal::String("😜🧐🤔🤪🥳")},
+ TransformParam{.str = "StringMixed",
+ .param = 8,
+ .source_type = iceberg::string(),
+ .source = Literal::String("a😜b🧐c🤔d🤪e🥳"),
+ .expected = Literal::String("a😜b🧐c🤔d🤪")},
+ TransformParam{.str = "Binary",
+ .param = 5,
+ .source_type = iceberg::binary(),
+ .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}),
+ .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return info.param.str; });
+
+class YearTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(YearTransformTest, YearTransform) {
auto transform = Transform::Year();
+ const auto& param = GetParam();
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform";
- const std::vector<Case> cases = {
- {.source_type = iceberg::timestamp(),
- // 2021-06-01T11:43:20Z
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Int(2021)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::Int(2021)},
- {.source_type = iceberg::date(),
- .source = Literal::Date(30000),
- .expected = Literal::Int(2052)},
- };
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
-
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, MonthTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ YearTransformTests, YearTransformTest,
+ ::testing::Values(TransformParam{.str = "Timestamp",
+ // 2021-06-01T11:43:20Z
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Timestamp(1622547800000000),
+ .expected = Literal::Int(2021)},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source =
Literal::TimestampTz(1622547800000000),
+ .expected = Literal::Int(2021)},
+ TransformParam{.str = "Date",
+ .source_type = iceberg::date(),
+ .source = Literal::Date(30000),
+ .expected = Literal::Int(2052)}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return
info.param.str; });
+
+class MonthTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(MonthTransformTest, MonthTransform) {
auto transform = Transform::Month();
+ const auto& param = GetParam();
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
-
- const std::vector<Case> cases = {
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Int(617)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::Int(617)},
- {.source_type = iceberg::date(),
- .source = Literal::Date(30000),
- .expected = Literal::Int(985)},
- };
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform";
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformFunctionTransformTest, DayTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ MonthTransformTests, MonthTransformTest,
+ ::testing::Values(TransformParam{.str = "Timestamp",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Timestamp(1622547800000000),
+ .expected = Literal::Int(617)},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source =
Literal::TimestampTz(1622547800000000),
+ .expected = Literal::Int(617)},
+ TransformParam{.str = "Date",
+ .source_type = iceberg::date(),
+ .source = Literal::Date(30000),
+ .expected = Literal::Int(985)}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return
info.param.str; });
+
+class DayTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(DayTransformTest, DayTransform) {
auto transform = Transform::Day();
+ const auto& param = GetParam();
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform";
- const std::vector<Case> cases = {
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Int(18779)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::Int(18779)},
- {.source_type = iceberg::date(),
- .source = Literal::Date(30000),
- .expected = Literal::Int(30000)},
- };
-
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, HourTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ DayTransformTests, DayTransformTest,
+ ::testing::Values(TransformParam{.str = "Timestamp",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Timestamp(1622547800000000),
+ .expected = Literal::Int(18779)},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source =
Literal::TimestampTz(1622547800000000),
+ .expected = Literal::Int(18779)},
+ TransformParam{.str = "Date",
+ .source_type = iceberg::date(),
+ .source = Literal::Date(30000),
+ .expected = Literal::Int(30000)}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return
info.param.str; });
+
+class HourTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+TEST_P(HourTransformTest, HourTransform) {
auto transform = Transform::Hour();
+ const auto& param = GetParam();
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- Literal expected;
- };
-
- const std::vector<Case> cases = {
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000),
- .expected = Literal::Int(450707)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000),
- .expected = Literal::Int(450707)},
- };
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform";
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform";
- auto result = transformPtr.value()->Transform(c.source);
- ASSERT_TRUE(result.has_value())
- << "Failed to transform literal: " << c.source.ToString();
+ auto result = transformPtr.value()->Transform(param.source);
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
- EXPECT_EQ(result.value(), c.expected)
- << "Unexpected result for source: " << c.source.ToString();
- }
+ EXPECT_EQ(result.value(), param.expected)
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, VoidTransform) {
+INSTANTIATE_TEST_SUITE_P(
+ HourTransformTests, HourTransformTest,
+ ::testing::Values(TransformParam{.str = "Timestamp",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Timestamp(1622547800000000),
+ .expected = Literal::Int(450707)},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source =
Literal::TimestampTz(1622547800000000),
+ .expected = Literal::Int(450707)}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return
info.param.str; });
+
+class VoidTransformTest : public ::testing::TestWithParam<TransformParam> {};
+
+// Binds the void transform to param.source_type, applies it to param.source,
+// and expects a null literal that keeps the source type and matches
+// param.expected in string form.
+TEST_P(VoidTransformTest, VoidTransform) {
auto transform = Transform::Void();
-
- struct Case {
- std::shared_ptr<Type> source_type;
- Literal source;
- };
-
- const std::vector<Case> cases = {
- {.source_type = iceberg::boolean(), .source = Literal::Boolean(true)},
- {.source_type = iceberg::int32(), .source = Literal::Int(42)},
- {.source_type = iceberg::date(), .source = Literal::Date(30000)},
- {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)},
- {.source_type = iceberg::timestamp(),
- .source = Literal::Timestamp(1622547800000000)},
- {.source_type = iceberg::timestamp_tz(),
- .source = Literal::TimestampTz(1622547800000000)},
- {.source_type = iceberg::float32(), .source = Literal::Float(3.14)},
- {.source_type = iceberg::float64(), .source = Literal::Double(1.23e-5)},
- {.source_type = iceberg::string(), .source = Literal::String("Hello,
World!")},
- {.source_type = iceberg::binary(), .source = Literal::Binary({0x01,
0x02, 0x03})},
- };
-
- for (const auto& c : cases) {
- auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform";
- auto result = transformPtr.value()->Transform(c.source);
- EXPECT_TRUE(result->IsNull())
- << "Expected void transform to return null type for source: "
- << c.source.ToString();
- EXPECT_EQ(result->type()->type_id(), c.source_type->type_id())
- << "Expected void transform to return same type as source for: "
- << c.source.ToString();
- }
+ const auto& param = GetParam();
+
+ auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform";
+
+ auto result = transformPtr.value()->Transform(param.source);
+ // Abort this case on failure: every check below dereferences `result`,
+ // which is only valid when it actually holds a literal.
+ ASSERT_TRUE(result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
+ EXPECT_TRUE(result->IsNull())
+ << "Expected void transform to return null type for source: "
+ << param.source.ToString();
+ EXPECT_EQ(result->type()->type_id(), param.source_type->type_id())
+ << "Expected void transform to return same type as source for: "
+ << param.source.ToString();
+ EXPECT_EQ(result->ToString(), param.expected.ToString())
+ << "Unexpected result for source: " << param.source.ToString();
}
-TEST(TransformLiteralTest, NullLiteral) {
- struct Case {
- std::string str;
- std::shared_ptr<Type> source_type;
- Literal source;
- std::shared_ptr<Type> expected_result_type;
- };
-
- const std::vector<Case> cases = {
- {.str = "identity",
- .source_type = iceberg::string(),
- .source = Literal::Null(iceberg::string()),
- .expected_result_type = iceberg::string()},
- {.str = "year",
- .source_type = iceberg::timestamp(),
- .source = Literal::Null(iceberg::timestamp()),
- .expected_result_type = iceberg::int32()},
- {.str = "month",
- .source_type = iceberg::timestamp(),
- .source = Literal::Null(iceberg::timestamp()),
- .expected_result_type = iceberg::int32()},
- {.str = "day",
- .source_type = iceberg::timestamp(),
- .source = Literal::Null(iceberg::timestamp()),
- .expected_result_type = iceberg::int32()},
- {.str = "hour",
- .source_type = iceberg::timestamp(),
- .source = Literal::Null(iceberg::timestamp()),
- .expected_result_type = iceberg::int32()},
- {.str = "void",
- .source_type = iceberg::string(),
- .source = Literal::Null(iceberg::string()),
- .expected_result_type = iceberg::string()},
- {.str = "bucket[16]",
- .source_type = iceberg::string(),
- .source = Literal::Null(iceberg::string()),
- .expected_result_type = iceberg::int32()},
- {.str = "truncate[32]",
- .source_type = iceberg::string(),
- .source = Literal::Null(iceberg::string()),
- .expected_result_type = iceberg::string()},
- };
-
- for (const auto& c : cases) {
- auto result = TransformFromString(c.str);
- ASSERT_TRUE(result.has_value()) << "Failed to parse: " << c.str;
-
- const auto& transform = result.value();
- const auto transformPtr = transform->Bind(c.source_type);
- ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str;
-
- auto transform_result = transformPtr.value()->Transform(c.source);
- EXPECT_TRUE(transform_result->IsNull())
- << "Expected void transform to return null type for source: "
- << c.source.ToString();
- EXPECT_EQ(transform_result->type()->type_id(),
c.expected_result_type->type_id())
- << "Expected void transform to return same type as source for: "
- << c.source.ToString();
- }
+INSTANTIATE_TEST_SUITE_P(
+ VoidTransformTests, VoidTransformTest,
+ ::testing::Values(
+ TransformParam{.str = "Boolean",
+ .source_type = iceberg::boolean(),
+ .source = Literal::Boolean(true),
+ .expected = Literal::Null(iceberg::boolean())},
+ TransformParam{.str = "Int32",
+ .source_type = iceberg::int32(),
+ .source = Literal::Int(42),
+ .expected = Literal::Null(iceberg::int32())},
+ TransformParam{.str = "Date",
+ .source_type = iceberg::date(),
+ .source = Literal::Date(30000),
+ .expected = Literal::Null(iceberg::date())},
+ TransformParam{.str = "Int64",
+ .source_type = iceberg::int64(),
+ .source = Literal::Long(1234567890),
+ .expected = Literal::Null(iceberg::int64())},
+ TransformParam{.str = "Timestamp",
+ .source_type = iceberg::timestamp(),
+ .source = Literal::Timestamp(1622547800000000),
+ .expected = Literal::Null(iceberg::timestamp())},
+ TransformParam{.str = "TimestampTz",
+ .source_type = iceberg::timestamp_tz(),
+ .source = Literal::TimestampTz(1622547800000000),
+ .expected = Literal::Null(iceberg::timestamp_tz())},
+ TransformParam{.str = "Float",
+ .source_type = iceberg::float32(),
+ .source = Literal::Float(3.14),
+ .expected = Literal::Null(iceberg::float32())},
+ TransformParam{.str = "Double",
+ .source_type = iceberg::float64(),
+ .source = Literal::Double(1.23e-5),
+ .expected = Literal::Null(iceberg::float64())},
+ TransformParam{.str = "Decimal",
+ .source_type = iceberg::decimal(10, 2),
+ .source = Literal::Decimal(123456, 10, 2),
+ .expected = Literal::Null(iceberg::decimal(10, 2))},
+ TransformParam{.str = "String",
+ .source_type = iceberg::string(),
+ .source = Literal::String("Hello, World!"),
+ .expected = Literal::Null(iceberg::string())},
+ TransformParam{
+ .str = "Uuid",
+ .source_type = iceberg::uuid(),
+ .source = Literal::UUID(
+
Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()),
+ .expected = Literal::Null(iceberg::uuid())},
+ TransformParam{.str = "Binary",
+ .source_type = iceberg::binary(),
+ .source = Literal::Binary({0x01, 0x02, 0x03}),
+ .expected = Literal::Null(iceberg::binary())},
+ TransformParam{.str = "Fixed",
+ .source_type = iceberg::fixed(3),
+ .source = Literal::Fixed({0x01, 0x02, 0x03}),
+ .expected = Literal::Null(iceberg::fixed(3))}),
+ [](const ::testing::TestParamInfo<TransformParam>& info) { return
info.param.str; });
+
+class NullLiteralTransformTest : public
::testing::TestWithParam<TransformParam> {};
+
+// Parses the transform named by param.str, binds it to param.source_type,
+// applies it to param.source, and expects a null literal whose string form
+// matches param.expected.
+TEST_P(NullLiteralTransformTest, NullLiteralTransform) {
+ const auto& param = GetParam();
+
+ auto result = TransformFromString(param.str);
+ ASSERT_TRUE(result.has_value()) << "Failed to parse: " << param.str;
+
+ const auto& transform = result.value();
+ const auto transformPtr = transform->Bind(param.source_type);
+ ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << param.str;
+
+ auto transform_result = transformPtr.value()->Transform(param.source);
+ // Abort this case on failure: the checks below dereference
+ // `transform_result`, which is only valid when it holds a literal.
+ ASSERT_TRUE(transform_result.has_value())
+ << "Failed to transform literal: " << param.source.ToString();
+ EXPECT_TRUE(transform_result->IsNull())
+ << "Expected transform to return null type for source: "
+ << param.source.ToString();
+ EXPECT_EQ(transform_result->ToString(), param.expected.ToString())
+ << "Unexpected result for source: " << param.source.ToString();
}
+INSTANTIATE_TEST_SUITE_P(
+ NullLiteralTransformTests, NullLiteralTransformTest,
+ ::testing::Values(TransformParam{.str = "identity",
+ .source_type = iceberg::string(),
+ .source =
Literal::Null(iceberg::string()),
+ .expected =
Literal::Null(iceberg::string())},
+ TransformParam{.str = "year",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Null(iceberg::timestamp()),
+ .expected =
Literal::Null(iceberg::int32())},
+ TransformParam{.str = "month",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Null(iceberg::timestamp()),
+ .expected =
Literal::Null(iceberg::int32())},
+ TransformParam{.str = "day",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Null(iceberg::timestamp()),
+ .expected =
Literal::Null(iceberg::int32())},
+ TransformParam{.str = "hour",
+ .source_type = iceberg::timestamp(),
+ .source =
Literal::Null(iceberg::timestamp()),
+ .expected =
Literal::Null(iceberg::int32())},
+ TransformParam{.str = "void",
+ .source_type = iceberg::string(),
+ .source =
Literal::Null(iceberg::string()),
+ .expected =
Literal::Null(iceberg::string())},
+ TransformParam{.str = "bucket[16]",
+ .source_type = iceberg::string(),
+ .source =
Literal::Null(iceberg::string()),
+ .expected =
Literal::Null(iceberg::int32())},
+ TransformParam{.str = "truncate[32]",
+ .source_type = iceberg::string(),
+ .source =
Literal::Null(iceberg::string()),
+ .expected =
Literal::Null(iceberg::string())}));
+
} // namespace iceberg
diff --git a/src/iceberg/test/truncate_util_test.cc
b/src/iceberg/test/truncate_util_test.cc
new file mode 100644
index 0000000..61010fc
--- /dev/null
+++ b/src/iceberg/test/truncate_util_test.cc
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/truncate_util.h"
+
+#include <gtest/gtest.h>
+
+#include "iceberg/expression/literal.h"
+
+namespace iceberg {
+
+// The following tests are from
+// https://iceberg.apache.org/spec/#truncate-transform-details
+//
+// Each case passes a literal and a width to TruncateUtils::TruncateLiteral and
+// compares the outcome with the expected literal of the same type.
+TEST(TruncateUtilTest, TruncateLiteral) {
+ // Integer (int and long), width 10: 1 -> 0 and -1 -> -10, i.e. a negative
+ // input goes to the next lower multiple of the width, not toward zero.
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Int(1), 10),
Literal::Int(0));
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Int(-1), 10),
Literal::Int(-10));
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Long(1), 10),
Literal::Long(0));
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Long(-1), 10),
Literal::Long(-10));
+
+ // Decimal, width 50: 1065 -> 1050.
+ // NOTE(review): the Literal::Decimal arguments are presumably
+ // (unscaled value, precision, scale) - confirm against literal.h.
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Decimal(1065, 4, 2), 50),
+ Literal::Decimal(1050, 4, 2));
+
+ // String, width 3: "iceberg" -> "ice".
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::String("iceberg"), 3),
+ Literal::String("ice"));
+
+ // Binary, width 3: the first three of five bytes are kept. The byte vectors
+ // are built from std::string buffers that hold the raw byte values.
+ std::string data = "\x01\x02\x03\x04\x05";
+ std::string expected = "\x01\x02\x03";
+ EXPECT_EQ(TruncateUtils::TruncateLiteral(
+ Literal::Binary(std::vector<uint8_t>(data.begin(),
data.end())), 3),
+ Literal::Binary(std::vector<uint8_t>(expected.begin(),
expected.end())));
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/transform_function.cc
b/src/iceberg/transform_function.cc
index fd9a165..e2f5ece 100644
--- a/src/iceberg/transform_function.cc
+++ b/src/iceberg/transform_function.cc
@@ -20,16 +20,14 @@
#include "iceberg/transform_function.h"
#include <cassert>
-#include <chrono>
-#include <type_traits>
-#include <utility>
-#include <variant>
#include "iceberg/expression/literal.h"
#include "iceberg/type.h"
-#include "iceberg/util/murmurhash3_internal.h"
+#include "iceberg/type_fwd.h"
+#include "iceberg/util/bucket_util.h"
+#include "iceberg/util/macros.h"
+#include "iceberg/util/temporal_util.h"
#include "iceberg/util/truncate_util.h"
-#include "iceberg/util/uuid.h"
namespace iceberg {
@@ -54,48 +52,14 @@ BucketTransform::BucketTransform(std::shared_ptr<Type>
const& source_type,
: TransformFunction(TransformType::kBucket, source_type),
num_buckets_(num_buckets) {}
Result<Literal> BucketTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply bucket transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
if (literal.IsNull()) [[unlikely]] {
return Literal::Null(int32());
}
- int32_t hash_value = 0;
- std::visit(
- [&](auto&& value) {
- using T = std::decay_t<decltype(value)>;
- if constexpr (std::is_same_v<T, int32_t>) {
- MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value);
- } else if constexpr (std::is_same_v<T, int64_t>) {
- MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value);
- } else if constexpr (std::is_same_v<T, std::array<uint8_t, 16>>) {
- MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0,
&hash_value);
- } else if constexpr (std::is_same_v<T, std::string>) {
- MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
- } else if constexpr (std::is_same_v<T, Uuid>) {
- MurmurHash3_x86_32(std::get<Uuid>(literal.value()).bytes().data(),
- Uuid::kLength, 0, &hash_value);
- } else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
- MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
- } else if constexpr (std::is_same_v<T, std::monostate> ||
- std::is_same_v<T, bool> || std::is_same_v<T,
float> ||
- std::is_same_v<T, double> ||
- std::is_same_v<T, Literal::BelowMin> ||
- std::is_same_v<T, Literal::AboveMax>) {
- std::unreachable();
- } else {
- static_assert(false, "Unhandled type in BucketTransform::Transform");
- }
- },
- literal.value());
-
- // Calculate the bucket index
- int32_t bucket_index =
- (hash_value & std::numeric_limits<int32_t>::max()) % num_buckets_;
+ ICEBERG_ASSIGN_OR_RAISE(auto bucket_index,
+ BucketUtils::BucketIndex(literal, num_buckets_))
return Literal::Int(bucket_index);
}
@@ -135,47 +99,9 @@ TruncateTransform::TruncateTransform(std::shared_ptr<Type>
const& source_type,
: TransformFunction(TransformType::kTruncate, source_type), width_(width)
{}
Result<Literal> TruncateTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply truncate transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
- if (literal.IsNull()) [[unlikely]] {
- // Return null as is
- return literal;
- }
-
- switch (source_type()->type_id()) {
- case TypeId::kInt: {
- auto value = std::get<int32_t>(literal.value());
- return Literal::Int(TruncateUtils::TruncateInteger(value, width_));
- }
- case TypeId::kLong: {
- auto value = std::get<int64_t>(literal.value());
- return Literal::Long(TruncateUtils::TruncateInteger(value, width_));
- }
- case TypeId::kDecimal: {
- // TODO(zhjwpku): Handle decimal truncation logic here
- return NotImplemented("Truncate for Decimal is not implemented yet");
- }
- case TypeId::kString: {
- // Strings are truncated to a valid UTF-8 string with no more than L
code points.
- auto value = std::get<std::string>(literal.value());
- return Literal::String(TruncateUtils::TruncateUTF8(std::move(value),
width_));
- }
- case TypeId::kBinary: {
- /// In contrast to strings, binary values do not have an assumed
encoding and are
- /// truncated to L bytes.
- auto value = std::get<std::vector<uint8_t>>(literal.value());
- if (value.size() > static_cast<size_t>(width_)) {
- value.resize(width_);
- }
- return Literal::Binary(std::move(value));
- }
- default:
- std::unreachable();
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
+ return TruncateUtils::TruncateLiteral(literal, width_);
}
std::shared_ptr<Type> TruncateTransform::ResultType() const { return
source_type(); }
@@ -206,34 +132,9 @@ YearTransform::YearTransform(std::shared_ptr<Type> const&
source_type)
// Register this function under the year transform kind, in line with the
// Month/Day/Hour constructors, which each pass their own TransformType.
: TransformFunction(TransformType::kYear, source_type) {}
Result<Literal> YearTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply year transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
- if (literal.IsNull()) [[unlikely]] {
- return Literal::Null(int32());
- }
-
- using namespace std::chrono; // NOLINT
- switch (source_type()->type_id()) {
- case TypeId::kDate: {
- auto value = std::get<int32_t>(literal.value());
- auto epoch = sys_days(year{1970} / January / 1);
- auto ymd = year_month_day(epoch + days{value});
- return Literal::Int(static_cast<int32_t>(ymd.year()));
- }
- case TypeId::kTimestamp:
- case TypeId::kTimestampTz: {
- auto value = std::get<int64_t>(literal.value());
- // Convert microseconds-since-epoch into a `year_month_day` object
- auto ymd =
year_month_day(floor<days>(sys_time<microseconds>(microseconds{value})));
- return Literal::Int(static_cast<int32_t>(ymd.year()));
- }
- default:
- std::unreachable();
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
+ return TemporalUtils::ExtractYear(literal);
}
std::shared_ptr<Type> YearTransform::ResultType() const { return int32(); }
@@ -259,46 +160,9 @@ MonthTransform::MonthTransform(std::shared_ptr<Type>
const& source_type)
: TransformFunction(TransformType::kMonth, source_type) {}
Result<Literal> MonthTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply month transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
- if (literal.IsNull()) [[unlikely]] {
- return Literal::Null(int32());
- }
-
- using namespace std::chrono; // NOLINT
- switch (source_type()->type_id()) {
- case TypeId::kDate: {
- auto value = std::get<int32_t>(literal.value());
- auto epoch = sys_days(year{1970} / January / 1);
- auto ymd = year_month_day(epoch + days{value});
- auto epoch_ymd = year_month_day(epoch);
- auto delta = ymd.year() - epoch_ymd.year();
- // Calculate the month as months from 1970-01
- // Note: January is month 1, so we subtract 1 to get zero-based
- // month count.
- return Literal::Int(static_cast<int32_t>(delta.count() * 12 +
-
static_cast<unsigned>(ymd.month()) - 1));
- }
- case TypeId::kTimestamp:
- case TypeId::kTimestampTz: {
- auto value = std::get<int64_t>(literal.value());
- // Convert microseconds-since-epoch into a `year_month_day` object
- auto ymd =
year_month_day(floor<days>(sys_time<microseconds>(microseconds{value})));
- auto epoch_ymd = year_month_day(year{1970} / January / 1);
- auto delta = ymd.year() - epoch_ymd.year();
- // Calculate the month as months from 1970-01
- // Note: January is month 1, so we subtract 1 to get zero-based
- // month count.
- return Literal::Int(static_cast<int32_t>(delta.count() * 12 +
-
static_cast<unsigned>(ymd.month()) - 1));
- }
- default:
- std::unreachable();
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
+ return TemporalUtils::ExtractMonth(literal);
}
std::shared_ptr<Type> MonthTransform::ResultType() const { return int32(); }
@@ -324,34 +188,9 @@ DayTransform::DayTransform(std::shared_ptr<Type> const&
source_type)
: TransformFunction(TransformType::kDay, source_type) {}
Result<Literal> DayTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply day transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
- if (literal.IsNull()) [[unlikely]] {
- return Literal::Null(int32());
- }
-
- using namespace std::chrono; // NOLINT
- switch (source_type()->type_id()) {
- case TypeId::kDate: {
- return Literal::Int(std::get<int32_t>(literal.value()));
- }
- case TypeId::kTimestamp:
- case TypeId::kTimestampTz: {
- auto value = std::get<int64_t>(literal.value());
- // Convert microseconds to `sys_days` (chronological days since epoch)
- auto timestamp = sys_time<microseconds>(microseconds{value});
- auto days_since_epoch = floor<days>(timestamp);
-
- return Literal::Int(
- static_cast<int32_t>(days_since_epoch.time_since_epoch().count()));
- }
- default:
- std::unreachable();
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
+ return TemporalUtils::ExtractDay(literal);
}
std::shared_ptr<Type> DayTransform::ResultType() const { return int32(); }
@@ -377,33 +216,9 @@ HourTransform::HourTransform(std::shared_ptr<Type> const&
source_type)
: TransformFunction(TransformType::kHour, source_type) {}
Result<Literal> HourTransform::Transform(const Literal& literal) {
- assert(literal.type() == source_type());
- if (literal.IsBelowMin() || literal.IsAboveMax()) {
- return InvalidArgument(
- "Cannot apply hour transform to literal with value {} of type {}",
- literal.ToString(), source_type()->ToString());
- }
-
- if (literal.IsNull()) [[unlikely]] {
- return Literal::Null(int32());
- }
-
- using namespace std::chrono; // NOLINT
- switch (source_type()->type_id()) {
- case TypeId::kTimestamp:
- case TypeId::kTimestampTz: {
- auto value = std::get<int64_t>(literal.value());
- // Create a `sys_time` object from the microseconds value
- auto timestamp = sys_time<microseconds>(microseconds{value});
-
- // Convert the time since epoch directly into hours
- auto hours_since_epoch =
duration_cast<hours>(timestamp.time_since_epoch()).count();
-
- return Literal::Int(static_cast<int32_t>(hours_since_epoch));
- }
- default:
- std::unreachable();
- }
+ ICEBERG_DCHECK(*literal.type() == *source_type(),
+ "Literal type must match source type");
+ return TemporalUtils::ExtractHour(literal);
}
std::shared_ptr<Type> HourTransform::ResultType() const { return int32(); }
diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h
index 165390b..fc0dd72 100644
--- a/src/iceberg/transform_function.h
+++ b/src/iceberg/transform_function.h
@@ -51,6 +51,9 @@ class ICEBERG_EXPORT BucketTransform : public
TransformFunction {
BucketTransform(std::shared_ptr<Type> const& source_type, int32_t
num_buckets);
/// \brief Applies the bucket hash function to the input Literal.
+ ///
+ /// Reference:
+ /// - https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
Result<Literal> Transform(const Literal& literal) override;
/// \brief Returns INT32 as the output type.
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h
index 3bd067d..4367448 100644
--- a/src/iceberg/type_fwd.h
+++ b/src/iceberg/type_fwd.h
@@ -115,6 +115,9 @@ class NameMapping;
enum class SnapshotRefType;
enum class TransformType;
+class Decimal;
+class Uuid;
+
class Expression;
class Literal;
diff --git a/src/iceberg/util/bucket_util.cc b/src/iceberg/util/bucket_util.cc
new file mode 100644
index 0000000..88b240d
--- /dev/null
+++ b/src/iceberg/util/bucket_util.cc
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/bucket_util.h"
+
+#include <utility>
+
+#include "iceberg/expression/literal.h"
+#include "iceberg/util/endian.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+template <TypeId type_id>
+int32_t HashLiteral(const Literal& literal) {
+ std::unreachable();
+}
+
+template <>
+int32_t HashLiteral<TypeId::kInt>(const Literal& literal) {
+ return BucketUtils::HashInt(std::get<int32_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kDate>(const Literal& literal) {
+ return BucketUtils::HashInt(std::get<int32_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kLong>(const Literal& literal) {
+ return BucketUtils::HashLong(std::get<int64_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kTime>(const Literal& literal) {
+ return BucketUtils::HashLong(std::get<int64_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kTimestamp>(const Literal& literal) {
+ return BucketUtils::HashLong(std::get<int64_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kTimestampTz>(const Literal& literal) {
+ return BucketUtils::HashLong(std::get<int64_t>(literal.value()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kDecimal>(const Literal& literal) {
+ const auto& decimal = std::get<Decimal>(literal.value());
+ return BucketUtils::HashBytes(decimal.ToBigEndian());
+}
+
+template <>
+int32_t HashLiteral<TypeId::kString>(const Literal& literal) {
+ const auto& str = std::get<std::string>(literal.value());
+ return BucketUtils::HashBytes(
+ std::span<const uint8_t>(reinterpret_cast<const uint8_t*>(str.data()),
str.size()));
+}
+
+template <>
+int32_t HashLiteral<TypeId::kUuid>(const Literal& literal) {
+ const auto& uuid = std::get<Uuid>(literal.value());
+ return BucketUtils::HashBytes(uuid.bytes());
+}
+
+template <>
+int32_t HashLiteral<TypeId::kBinary>(const Literal& literal) {
+ const auto& binary = std::get<std::vector<uint8_t>>(literal.value());
+ return BucketUtils::HashBytes(binary);
+}
+
+template <>
+int32_t HashLiteral<TypeId::kFixed>(const Literal& literal) {
+ const auto& fixed = std::get<std::vector<uint8_t>>(literal.value());
+ return BucketUtils::HashBytes(fixed);
+}
+
+} // namespace
+
+int32_t BucketUtils::HashBytes(std::span<const uint8_t> bytes) {
+ int32_t hash_value = 0;
+ MurmurHash3_x86_32(bytes.data(), bytes.size(), 0, &hash_value);
+ return hash_value;
+}
+
+int32_t BucketUtils::HashLong(int64_t value) {
+ int32_t hash_value = 0;
+ value = ToLittleEndian(value);
+ MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value);
+ return hash_value;
+}
+
+#define DISPATCH_HASH_LITERAL(TYPE_ID) \
+ case TYPE_ID: \
+ hash_value = HashLiteral<TYPE_ID>(literal); \
+ break;
+
+Result<int32_t> BucketUtils::BucketIndex(const Literal& literal, int32_t
num_buckets) {
+ if (num_buckets <= 0) [[unlikely]] {
+ return InvalidArgument("Number of buckets must be positive, got {}",
num_buckets);
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot compute bucket index for {}",
literal.ToString());
+ }
+
+ int32_t hash_value = 0;
+ switch (literal.type()->type_id()) {
+ DISPATCH_HASH_LITERAL(TypeId::kInt)
+ DISPATCH_HASH_LITERAL(TypeId::kDate)
+ DISPATCH_HASH_LITERAL(TypeId::kLong)
+ DISPATCH_HASH_LITERAL(TypeId::kTime)
+ DISPATCH_HASH_LITERAL(TypeId::kTimestamp)
+ DISPATCH_HASH_LITERAL(TypeId::kTimestampTz)
+ DISPATCH_HASH_LITERAL(TypeId::kDecimal)
+ DISPATCH_HASH_LITERAL(TypeId::kString)
+ DISPATCH_HASH_LITERAL(TypeId::kUuid)
+ DISPATCH_HASH_LITERAL(TypeId::kBinary)
+ DISPATCH_HASH_LITERAL(TypeId::kFixed)
+ default:
+ return NotSupported("Hashing not supported for type {}",
+ literal.type()->ToString());
+ }
+
+ return (hash_value & std::numeric_limits<int32_t>::max()) % num_buckets;
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/util/bucket_util.h b/src/iceberg/util/bucket_util.h
new file mode 100644
index 0000000..31a574b
--- /dev/null
+++ b/src/iceberg/util/bucket_util.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <span>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+class ICEBERG_EXPORT BucketUtils {
+ public:
+ /// \brief Hash a 32-bit integer using MurmurHash3 and return a 32-bit hash value.
+ /// \param value The input integer to hash.
+ /// \note Integer and long hash results must be identical for all integer values. This
+ /// ensures that schema evolution does not change bucket partition values if integer
+ /// types are promoted.
+ /// \return A 32-bit hash value.
+ static inline int32_t HashInt(int32_t value) {
+ return HashLong(static_cast<int64_t>(value));
+ }
+
+ /// \brief Hash a 64-bit integer using MurmurHash3 and return a 32-bit hash value.
+ /// \param value The input long to hash.
+ /// \return A 32-bit hash value.
+ static int32_t HashLong(int64_t value);
+
+ /// \brief Hash a byte array using MurmurHash3 and return a 32-bit hash value.
+ /// \param bytes The input byte array to hash.
+ /// \return A 32-bit hash value.
+ static int32_t HashBytes(std::span<const uint8_t> bytes);
+
+ /// \brief Compute the bucket index for a given literal and number of buckets.
+ /// \param literal The input literal to hash.
+ /// \param num_buckets The number of buckets to hash into.
+ /// \return (murmur3_x86_32_hash(literal) & Integer.MAX_VALUE) % num_buckets
+ static Result<int32_t> BucketIndex(const Literal& literal, int32_t
num_buckets);
+};
+
+} // namespace iceberg
diff --git a/src/iceberg/util/conversions.cc b/src/iceberg/util/conversions.cc
index e12e481..0cc7c55 100644
--- a/src/iceberg/util/conversions.cc
+++ b/src/iceberg/util/conversions.cc
@@ -23,6 +23,7 @@
#include <span>
#include <string>
+#include "iceberg/util/decimal.h"
#include "iceberg/util/endian.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/uuid.h"
@@ -64,6 +65,12 @@ Result<std::vector<uint8_t>>
ToBytesImpl<TypeId::kBoolean>(const Literal::Value&
:
static_cast<uint8_t>(0x00)};
}
+template <>
+Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kDecimal>(const
Literal::Value& value) {
+ const auto& decimal = std::get<Decimal>(value);
+ return decimal.ToBigEndian();
+}
+
template <>
Result<std::vector<uint8_t>> ToBytesImpl<TypeId::kString>(const
Literal::Value& value) {
const auto& str = std::get<std::string>(value);
@@ -95,6 +102,7 @@ Result<std::vector<uint8_t>> Conversions::ToBytes(const
PrimitiveType& type,
const auto type_id = type.type_id();
switch (type_id) {
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean)
DISPATCH_LITERAL_TO_BYTES(TypeId::kInt)
DISPATCH_LITERAL_TO_BYTES(TypeId::kDate)
DISPATCH_LITERAL_TO_BYTES(TypeId::kLong)
@@ -103,12 +111,11 @@ Result<std::vector<uint8_t>> Conversions::ToBytes(const
PrimitiveType& type,
DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestampTz)
DISPATCH_LITERAL_TO_BYTES(TypeId::kFloat)
DISPATCH_LITERAL_TO_BYTES(TypeId::kDouble)
- DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean)
+ DISPATCH_LITERAL_TO_BYTES(TypeId::kDecimal)
DISPATCH_LITERAL_TO_BYTES(TypeId::kString)
DISPATCH_LITERAL_TO_BYTES(TypeId::kUuid)
DISPATCH_LITERAL_TO_BYTES(TypeId::kBinary)
DISPATCH_LITERAL_TO_BYTES(TypeId::kFixed)
- // TODO(Li Feiyang): Add support for Decimal
default:
return NotSupported("Serialization for type {} is not supported",
type.ToString());
@@ -177,6 +184,11 @@ Result<Literal::Value> Conversions::FromBytes(const
PrimitiveType& type,
return Literal::Value{double_value};
}
}
+ case TypeId::kDecimal: {
+ ICEBERG_ASSIGN_OR_RAISE(auto decimal,
+ Decimal::FromBigEndian(data.data(),
data.size()));
+ return Literal::Value{decimal};
+ }
case TypeId::kString:
return Literal::Value{
std::string(reinterpret_cast<const char*>(data.data()),
data.size())};
@@ -194,7 +206,6 @@ Result<Literal::Value> Conversions::FromBytes(const
PrimitiveType& type,
}
return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())};
}
- // TODO(Li Feiyang): Add support for Decimal
default:
return NotSupported("Deserialization for type {} is not supported",
type.ToString());
diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc
index 5018574..f33d932 100644
--- a/src/iceberg/util/decimal.cc
+++ b/src/iceberg/util/decimal.cc
@@ -24,12 +24,12 @@
#include "iceberg/util/decimal.h"
+#include <algorithm>
#include <bit>
#include <charconv>
#include <climits>
#include <cmath>
#include <cstring>
-#include <format>
#include <iomanip>
#include <limits>
#include <sstream>
@@ -44,6 +44,16 @@ namespace iceberg {
namespace {
+constexpr int32_t kMinDecimalBytes = 1;
+constexpr int32_t kMaxDecimalBytes = 16;
+
+// The maximum decimal value that can be represented with kMaxPrecision digits.
+// 10^38 - 1
+constexpr Decimal kMaxDecimalValue(5421010862427522170LL,
687399551400673279ULL);
+// The minimum decimal value that can be represented with kMaxPrecision digits.
+// - (10^38 - 1)
+constexpr Decimal kMinDecimalValue(-5421010862427522171LL,
17759344522308878337ULL);
+
struct DecimalComponents {
std::string_view while_digits;
std::string_view fractional_digits;
@@ -275,8 +285,15 @@ bool RescaleWouldCauseDataLoss(const Decimal& value,
int32_t delta_scale,
return res->second != 0;
}
+ auto max_safe_value = kMaxDecimalValue / multiplier;
+ auto min_safe_value = kMinDecimalValue / multiplier;
+ if (value > max_safe_value || value < min_safe_value) {
+ // Overflow would happen — treat as data loss
+ return true;
+ }
+
*result = value * multiplier;
- return (value < 0) ? *result > value : *result < value;
+ return false;
}
} // namespace
@@ -470,11 +487,6 @@ Result<Decimal> Decimal::FromString(std::string_view str,
int32_t* precision,
}
Result<Decimal> Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) {
- static constexpr int32_t kMinDecimalBytes = 1;
- static constexpr int32_t kMaxDecimalBytes = 16;
-
- int64_t high, low;
-
if (length < kMinDecimalBytes || length > kMaxDecimalBytes) {
return InvalidArgument(
"Decimal::FromBigEndian: length must be in the range [{}, {}], was {}",
@@ -486,7 +498,8 @@ Result<Decimal> Decimal::FromBigEndian(const uint8_t*
bytes, int32_t length) {
const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
uint128_t result = 0;
- std::memcpy(reinterpret_cast<uint8_t*>(&result) + 16 - length, bytes,
length);
+ std::memcpy(reinterpret_cast<uint8_t*>(&result) + kMaxDecimalBytes - length,
bytes,
+ length);
if constexpr (std::endian::native == std::endian::little) {
auto high = static_cast<uint64_t>(result >> 64);
@@ -505,6 +518,36 @@ Result<Decimal> Decimal::FromBigEndian(const uint8_t*
bytes, int32_t length) {
return Decimal(static_cast<int128_t>(result));
}
+std::vector<uint8_t> Decimal::ToBigEndian() const {
+ std::vector<uint8_t> bytes(kMaxDecimalBytes);
+
+ auto uvalue = static_cast<uint128_t>(data_);
+ std::memcpy(bytes.data(), &uvalue, kMaxDecimalBytes);
+
+ if constexpr (std::endian::native == std::endian::little) {
+ std::ranges::reverse(bytes);
+ }
+
+ auto is_negative = data_ < 0;
+ int keep = kMaxDecimalBytes;
+ for (int32_t i = 0; i < kMaxDecimalBytes - 1; ++i) {
+ uint8_t byte = bytes[i];
+ uint8_t next = bytes[i + 1];
+ // For negative numbers, keep the leading 0xff byte if the next byte has its sign bit
+ // unset. For positive numbers, keep the leading 0x00 byte if the next byte has its
+ // sign bit set.
+ if ((is_negative && byte == 0xff && (next & 0x80)) ||
+ (!is_negative && byte == 0x00 && !(next & 0x80))) {
+ --keep;
+ } else {
+ break;
+ }
+ }
+
+ bytes.erase(bytes.begin(), bytes.begin() + (kMaxDecimalBytes - keep));
+ return bytes;
+}
+
Result<Decimal> Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const {
if (orig_scale == new_scale) {
return *this;
@@ -518,10 +561,7 @@ Result<Decimal> Decimal::Rescale(int32_t orig_scale,
int32_t new_scale) const {
auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale];
- const bool rescale_would_cause_data_loss =
- RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out);
-
- if (rescale_would_cause_data_loss) {
+ if (RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out))
[[unlikely]] {
return Invalid("Rescale {} from {} to {} would cause data loss",
ToIntegerString(),
orig_scale, new_scale);
}
@@ -534,6 +574,52 @@ bool Decimal::FitsInPrecision(int32_t precision) const {
return Decimal::Abs(*this) < kDecimal128PowersOfTen[precision];
}
+std::partial_ordering Decimal::Compare(const Decimal& lhs, const Decimal& rhs,
+ int32_t lhs_scale, int32_t rhs_scale) {
+ if (lhs_scale == rhs_scale || lhs.data_ == 0 || rhs.data_ == 0) {
+ return lhs <=> rhs;
+ }
+
+ // If one is negative and the other is positive, the positive is greater.
+ if (lhs.data_ < 0 && rhs.data_ > 0) {
+ return std::partial_ordering::less;
+ }
+ if (lhs.data_ > 0 && rhs.data_ < 0) {
+ return std::partial_ordering::greater;
+ }
+
+ // Signs are known to match here; true when both operands are negative.
+ bool negative = lhs.data_ < 0 && rhs.data_ < 0;
+
+ const int32_t delta_scale = lhs_scale - rhs_scale;
+ const int32_t abs_delta_scale = std::abs(delta_scale);
+
+ ICEBERG_DCHECK(abs_delta_scale <= kMaxScale, "");
+
+ const auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale];
+
+ Decimal adjusted_lhs;
+ Decimal adjusted_rhs;
+
+ if (delta_scale < 0) {
+ // lhs_scale < rhs_scale
+ if (RescaleWouldCauseDataLoss(lhs, -delta_scale, multiplier,
&adjusted_lhs))
+ [[unlikely]] {
+ return negative ? std::partial_ordering::less :
std::partial_ordering::greater;
+ }
+ adjusted_rhs = rhs;
+ } else {
+ // lhs_scale > rhs_scale
+ if (RescaleWouldCauseDataLoss(rhs, delta_scale, multiplier, &adjusted_rhs))
+ [[unlikely]] {
+ return negative ? std::partial_ordering::greater :
std::partial_ordering::less;
+ }
+ adjusted_lhs = lhs;
+ }
+
+ return adjusted_lhs <=> adjusted_rhs;
+}
+
std::array<uint8_t, Decimal::kByteWidth> Decimal::ToBytes() const {
std::array<uint8_t, kByteWidth> out{{0}};
std::memcpy(out.data(), &data_, kByteWidth);
diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h
index 7e9cd7c..b7f57f4 100644
--- a/src/iceberg/util/decimal.h
+++ b/src/iceberg/util/decimal.h
@@ -25,11 +25,13 @@
/// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h
#include <array>
+#include <compare>
#include <cstdint>
#include <iosfwd>
#include <string>
#include <string_view>
#include <type_traits>
+#include <vector>
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
@@ -142,7 +144,7 @@ class ICEBERG_EXPORT Decimal : public util::Formattable {
/// \brief Convert the Decimal value to a base 10 decimal string with the
given scale.
/// \param scale The scale to use for the string representation.
/// \return The string representation of the Decimal value.
- Result<std::string> ToString(int32_t scale = 0) const;
+ Result<std::string> ToString(int32_t scale) const;
/// \brief Convert the Decimal value to an integer string.
std::string ToIntegerString() const;
@@ -164,6 +166,11 @@ class ICEBERG_EXPORT Decimal : public util::Formattable {
/// \return error status if the length is an invalid value
static Result<Decimal> FromBigEndian(const uint8_t* data, int32_t length);
+ /// \brief Convert Decimal's unscaled value to two’s-complement big-endian binary, using
+ /// the minimum number of bytes for the value.
+ /// \return A vector containing the big-endian bytes.
+ std::vector<uint8_t> ToBigEndian() const;
+
/// \brief Convert Decimal from one scale to another.
Result<Decimal> Rescale(int32_t orig_scale, int32_t new_scale) const;
@@ -180,6 +187,10 @@ class ICEBERG_EXPORT Decimal : public util::Formattable {
return low() <=> other.low();
}
+ /// \brief Compare two Decimals with different scales.
+ static std::partial_ordering Compare(const Decimal& lhs, const Decimal& rhs,
+ int32_t lhs_scale, int32_t rhs_scale);
+
const uint8_t* native_endian_bytes() const {
return reinterpret_cast<const uint8_t*>(&data_);
}
diff --git a/src/iceberg/util/temporal_util.cc
b/src/iceberg/util/temporal_util.cc
new file mode 100644
index 0000000..41748c9
--- /dev/null
+++ b/src/iceberg/util/temporal_util.cc
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/temporal_util.h"
+
+#include <chrono>
+#include <utility>
+
+#include "iceberg/expression/literal.h"
+
+namespace iceberg {
+
+namespace {
+
+using namespace std::chrono; // NOLINT
+
+constexpr auto kEpochYmd = year{1970} / January / 1;
+constexpr auto kEpochDays = sys_days(kEpochYmd);
+
+inline constexpr year_month_day DateToYmd(int32_t days_since_epoch) {
+ return {kEpochDays + days{days_since_epoch}};
+}
+
+inline constexpr year_month_day TimestampToYmd(int64_t micros_since_epoch) {
+ return
{floor<days>(sys_time<microseconds>(microseconds{micros_since_epoch}))};
+}
+
+template <typename Duration>
+ requires std::is_same_v<Duration, days> || std::is_same_v<Duration, hours>
+inline constexpr int32_t TimestampToDuration(int64_t micros_since_epoch) {
+ return static_cast<int32_t>(
+ floor<Duration>(
+
sys_time<microseconds>(microseconds{micros_since_epoch}).time_since_epoch())
+ .count());
+}
+
+inline constexpr int32_t MonthsSinceEpoch(const year_month_day& ymd) {
+ auto delta = ymd.year() - kEpochYmd.year();
+ // Calculate the month as months from 1970-01
+ // Note: January is month 1, so we subtract 1 to get zero-based month count.
+ return static_cast<int32_t>(delta.count() * 12 +
static_cast<unsigned>(ymd.month()) -
+ 1);
+}
+
+template <TypeId type_id>
+Result<Literal> ExtractYearImpl(const Literal& literal) {
+ std::unreachable();
+}
+
+template <>
+Result<Literal> ExtractYearImpl<TypeId::kDate>(const Literal& literal) {
+ auto value = std::get<int32_t>(literal.value());
+ auto ymd = DateToYmd(value);
+ return Literal::Int(static_cast<int32_t>(ymd.year()));
+}
+
+template <>
+Result<Literal> ExtractYearImpl<TypeId::kTimestamp>(const Literal& literal) {
+ auto value = std::get<int64_t>(literal.value());
+ auto ymd = TimestampToYmd(value);
+ return Literal::Int(static_cast<int32_t>(ymd.year()));
+}
+
+template <>
+Result<Literal> ExtractYearImpl<TypeId::kTimestampTz>(const Literal& literal) {
+ return ExtractYearImpl<TypeId::kTimestamp>(literal);
+}
+
+template <TypeId type_id>
+Result<Literal> ExtractMonthImpl(const Literal& literal) {
+ std::unreachable();
+}
+
+template <>
+Result<Literal> ExtractMonthImpl<TypeId::kDate>(const Literal& literal) {
+ auto value = std::get<int32_t>(literal.value());
+ auto ymd = DateToYmd(value);
+ return Literal::Int(MonthsSinceEpoch(ymd));
+}
+
+template <>
+Result<Literal> ExtractMonthImpl<TypeId::kTimestamp>(const Literal& literal) {
+ auto value = std::get<int64_t>(literal.value());
+ auto ymd = TimestampToYmd(value);
+ return Literal::Int(MonthsSinceEpoch(ymd));
+}
+
+template <>
+Result<Literal> ExtractMonthImpl<TypeId::kTimestampTz>(const Literal& literal)
{
+ return ExtractMonthImpl<TypeId::kTimestamp>(literal);
+}
+
+template <TypeId type_id>
+Result<Literal> ExtractDayImpl(const Literal& literal) {
+ std::unreachable();
+}
+
+template <>
+Result<Literal> ExtractDayImpl<TypeId::kDate>(const Literal& literal) {
+ return Literal::Int(std::get<int32_t>(literal.value()));
+}
+
+template <>
+Result<Literal> ExtractDayImpl<TypeId::kTimestamp>(const Literal& literal) {
+ auto value = std::get<int64_t>(literal.value());
+ return Literal::Int(TimestampToDuration<days>(value));
+}
+
+template <>
+Result<Literal> ExtractDayImpl<TypeId::kTimestampTz>(const Literal& literal) {
+ return ExtractDayImpl<TypeId::kTimestamp>(literal);
+}
+
+template <TypeId type_id>
+Result<Literal> ExtractHourImpl(const Literal& literal) {
+ std::unreachable();
+}
+
+template <>
+Result<Literal> ExtractHourImpl<TypeId::kTimestamp>(const Literal& literal) {
+ auto value = std::get<int64_t>(literal.value());
+ return Literal::Int(TimestampToDuration<hours>(value));
+}
+
+template <>
+Result<Literal> ExtractHourImpl<TypeId::kTimestampTz>(const Literal& literal) {
+ return ExtractHourImpl<TypeId::kTimestamp>(literal);
+}
+
+} // namespace
+
+#define DISPATCH_EXTRACT_YEAR(type_id) \
+ case type_id: \
+ return ExtractYearImpl<type_id>(literal);
+
+Result<Literal> TemporalUtils::ExtractYear(const Literal& literal) {
+ if (literal.IsNull()) [[unlikely]] {
+ return Literal::Null(int32());
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot extract year from {}", literal.ToString());
+ }
+
+ switch (literal.type()->type_id()) {
+ DISPATCH_EXTRACT_YEAR(TypeId::kDate)
+ DISPATCH_EXTRACT_YEAR(TypeId::kTimestamp)
+ DISPATCH_EXTRACT_YEAR(TypeId::kTimestampTz)
+ default:
+ return NotSupported("Extract year from type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+#define DISPATCH_EXTRACT_MONTH(type_id) \
+ case type_id: \
+ return ExtractMonthImpl<type_id>(literal);
+
+Result<Literal> TemporalUtils::ExtractMonth(const Literal& literal) {
+ if (literal.IsNull()) [[unlikely]] {
+ return Literal::Null(int32());
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot extract month from {}", literal.ToString());
+ }
+
+ switch (literal.type()->type_id()) {
+ DISPATCH_EXTRACT_MONTH(TypeId::kDate)
+ DISPATCH_EXTRACT_MONTH(TypeId::kTimestamp)
+ DISPATCH_EXTRACT_MONTH(TypeId::kTimestampTz)
+ default:
+ return NotSupported("Extract month from type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+#define DISPATCH_EXTRACT_DAY(type_id) \
+ case type_id: \
+ return ExtractDayImpl<type_id>(literal);
+
+Result<Literal> TemporalUtils::ExtractDay(const Literal& literal) {
+ if (literal.IsNull()) [[unlikely]] {
+ return Literal::Null(int32());
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot extract day from {}", literal.ToString());
+ }
+
+ switch (literal.type()->type_id()) {
+ DISPATCH_EXTRACT_DAY(TypeId::kDate)
+ DISPATCH_EXTRACT_DAY(TypeId::kTimestamp)
+ DISPATCH_EXTRACT_DAY(TypeId::kTimestampTz)
+ default:
+ return NotSupported("Extract day from type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+#define DISPATCH_EXTRACT_HOUR(type_id) \
+ case type_id: \
+ return ExtractHourImpl<type_id>(literal);
+
+Result<Literal> TemporalUtils::ExtractHour(const Literal& literal) {
+ if (literal.IsNull()) [[unlikely]] {
+ return Literal::Null(int32());
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot extract hour from {}", literal.ToString());
+ }
+
+ switch (literal.type()->type_id()) {
+ DISPATCH_EXTRACT_HOUR(TypeId::kTimestamp)
+ DISPATCH_EXTRACT_HOUR(TypeId::kTimestampTz)
+ default:
+ return NotSupported("Extract hour from type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h
new file mode 100644
index 0000000..750c3d8
--- /dev/null
+++ b/src/iceberg/util/temporal_util.h
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+class ICEBERG_EXPORT TemporalUtils {
+ public:
+ /// \brief Extract a date or timestamp year, as years from 1970
+ static Result<Literal> ExtractYear(const Literal& literal);
+
+ /// \brief Extract a date or timestamp month, as months from 1970-01-01
+ static Result<Literal> ExtractMonth(const Literal& literal);
+
+ /// \brief Extract a date or timestamp day, as days from 1970-01-01
+ static Result<Literal> ExtractDay(const Literal& literal);
+
+ /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00
+ static Result<Literal> ExtractHour(const Literal& literal);
+};
+
+} // namespace iceberg
diff --git a/src/iceberg/util/truncate_util.cc
b/src/iceberg/util/truncate_util.cc
new file mode 100644
index 0000000..9d0c6e7
--- /dev/null
+++ b/src/iceberg/util/truncate_util.cc
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/truncate_util.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "iceberg/expression/literal.h"
+#include "iceberg/util/checked_cast.h"
+
+namespace iceberg {
+
+namespace {
+template <TypeId type_id>
+Literal TruncateLiteralImpl(const Literal& literal, int32_t width) {
+ std::unreachable();
+}
+
+template <>
+Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t
width) {
+ int32_t v = std::get<int32_t>(literal.value());
+ return Literal::Int(TruncateUtils::TruncateInteger(v, width));
+}
+
+template <>
+Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t
width) {
+ int64_t v = std::get<int64_t>(literal.value());
+ return Literal::Long(TruncateUtils::TruncateInteger(v, width));
+}
+
+template <>
+Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t
width) {
+ const auto& decimal = std::get<Decimal>(literal.value());
+ auto type = internal::checked_pointer_cast<DecimalType>(literal.type());
+ return Literal::Decimal(TruncateUtils::TruncateDecimal(decimal,
width).value(),
+ type->precision(), type->scale());
+}
+
+template <>
+Literal TruncateLiteralImpl<TypeId::kString>(const Literal& literal, int32_t
width) {
+ // Strings are truncated to a valid UTF-8 string with no more than `width` code points.
+ const auto& str = std::get<std::string>(literal.value());
+ return Literal::String(TruncateUtils::TruncateUTF8(str, width));
+}
+
+template <>
+Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t
width) {
+ // In contrast to strings, binary values do not have an assumed encoding and are
+ // truncated to `width` bytes.
+ const auto& data = std::get<std::vector<uint8_t>>(literal.value());
+ if (data.size() <= width) {
+ return literal;
+ }
+ return Literal::Binary(std::vector<uint8_t>(data.begin(), data.begin() +
width));
+}
+
+} // namespace
+
+Decimal TruncateUtils::TruncateDecimal(const Decimal& decimal, int32_t width) {
+ return decimal - (((decimal % width) + width) % width);
+}
+
+#define DISPATCH_TRUNCATE_LITERAL(TYPE_ID) \
+ case TYPE_ID: \
+ return TruncateLiteralImpl<TYPE_ID>(literal, width);
+
+Result<Literal> TruncateUtils::TruncateLiteral(const Literal& literal, int32_t
width) {
+ if (literal.IsNull()) [[unlikely]] {
+ // Return null as is
+ return literal;
+ }
+
+ if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
+ return NotSupported("Cannot truncate {}", literal.ToString());
+ }
+
+ switch (literal.type()->type_id()) {
+ DISPATCH_TRUNCATE_LITERAL(TypeId::kInt)
+ DISPATCH_TRUNCATE_LITERAL(TypeId::kLong)
+ DISPATCH_TRUNCATE_LITERAL(TypeId::kDecimal)
+ DISPATCH_TRUNCATE_LITERAL(TypeId::kString)
+ DISPATCH_TRUNCATE_LITERAL(TypeId::kBinary)
+ default:
+ return NotSupported("Truncate is not supported for type: {}",
+ literal.type()->ToString());
+ }
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/util/truncate_util.h b/src/iceberg/util/truncate_util.h
index 5e76135..881c1d7 100644
--- a/src/iceberg/util/truncate_util.h
+++ b/src/iceberg/util/truncate_util.h
@@ -19,10 +19,13 @@
#pragma once
+#include <cstdint>
#include <string>
#include <utility>
#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
namespace iceberg {
@@ -64,9 +67,25 @@ class ICEBERG_EXPORT TruncateUtils {
/// values, the correct truncate function is: v - (((v % W) + W) % W)
template <typename T>
requires std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t>
- static inline T TruncateInteger(T v, size_t W) {
+ static inline T TruncateInteger(T v, int32_t W) {
return v - (((v % W) + W) % W);
}
+
+ /// \brief Truncate a Decimal to a specified width.
+ /// \param decimal The input Decimal to truncate.
+ /// \param width The width to truncate to.
+ /// \return A Decimal truncated to the specified width.
+ static Decimal TruncateDecimal(const Decimal& decimal, int32_t width);
+
+ /// \brief Truncate a Literal to a specified width.
+ /// \param literal The input Literal to truncate.
+ /// \param width The width to truncate to.
+ /// \return A Result containing the truncated Literal or an error.
+ /// Supported types are: INT, LONG, DECIMAL, STRING, BINARY.
+ /// Reference:
+ /// - [Truncate Transform
+ /// Details](https://iceberg.apache.org/spec/#truncate-transform-details)
+ static Result<Literal> TruncateLiteral(const Literal& literal, int32_t
width);
};
} // namespace iceberg