mapleFU commented on code in PR #156: URL: https://github.com/apache/iceberg-cpp/pull/156#discussion_r2262747242
########## src/iceberg/util/truncate_utils.h: ########## @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <string> +#include <utility> + +#include "iceberg/iceberg_export.h" + +namespace iceberg { + +ICEBERG_EXPORT class TruncateUtils { + public: + /// \brief Truncate a UTF-8 string to a specified number of code points. + /// + /// \param source The input string to truncate. + /// \param L The maximum number of code points allowed in the output string. + /// \return A valid UTF-8 string truncated to L code points. + /// If the input string is already valid and has fewer than L code points, it is + /// returned unchanged. + static std::string TruncateUTF8(std::string&& source, size_t L) { Review Comment: Why not ``` static std::string TruncateUTF8(std::string source, size_t L) { ``` ########## src/iceberg/expression/literal.h: ########## @@ -48,27 +48,35 @@ class ICEBERG_EXPORT Literal { bool operator==(const AboveMax&) const = default; std::strong_ordering operator<=>(const AboveMax&) const = default; }; - - using Value = std::variant<bool, // for boolean - int32_t, // for int, date - int64_t, // for long, timestamp, timestamp_tz, time - float, // for float - double, // for double - std::string, // for string + using Value = std::variant<std::monostate, // for null Review Comment: Previously I may think this only present the literal value, but this is also ok to me ########## src/iceberg/transform_function.cc: ########## @@ -87,14 +129,52 @@ TruncateTransform::TruncateTransform(std::shared_ptr<Type> const& source_type, int32_t width) : TransformFunction(TransformType::kTruncate, source_type), width_(width) {} -Result<ArrowArray> TruncateTransform::Transform(const ArrowArray& input) { - return NotImplemented("TruncateTransform::Transform"); -} +Result<Literal> TruncateTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply truncate transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + // Return null as is + return literal; + } -Result<std::shared_ptr<Type>> TruncateTransform::ResultType() const { - return source_type(); + switch (source_type()->type_id()) { + case TypeId::kInt: { + auto value = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(value, width_)); + } + case TypeId::kLong: { + auto value = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(value, width_)); + } + case TypeId::kDecimal: { + // TODO(zhjwpku): Handle decimal truncation logic here + return NotImplemented("Truncate for Decimal is not implemented yet"); + } + case TypeId::kString: { + // Strings are truncated to a valid UTF-8 string with no more than L code points. + auto value = std::get<std::string>(literal.value()); + return Literal::String(TruncateUtils::TruncateUTF8(std::move(value), width_)); + } + case TypeId::kBinary: { + /// In contrast to strings, binary values do not have an assumed encoding and are + /// truncated to L bytes. + auto value = std::get<std::vector<uint8_t>>(literal.value()); + if (value.size() > static_cast<size_t>(width_)) { + value.resize(width_); + } + return Literal::Binary(value); Review Comment: ```suggestion return Literal::Binary(std::move(value)); ``` ########## src/iceberg/transform_function.cc: ########## @@ -176,11 +318,39 @@ Result<std::unique_ptr<TransformFunction>> MonthTransform::Make( DayTransform::DayTransform(std::shared_ptr<Type> const& source_type) : TransformFunction(TransformType::kDay, source_type) {} -Result<ArrowArray> DayTransform::Transform(const ArrowArray& input) { - return NotImplemented("DayTransform::Transform"); +Result<Literal> DayTransform::Transform(const Literal& literal) { + assert(literal.type() == source_type()); + if (literal.IsBelowMin() || literal.IsAboveMax()) { + return InvalidArgument( + "Cannot apply day transform to literal with value {} of type {}", + literal.ToString(), source_type()->ToString()); + } + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(iceberg::int32()); Review Comment: why not date here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
