wgtmac commented on code in PR #47190: URL: https://github.com/apache/arrow/pull/47190#discussion_r2241435145
########## cpp/src/parquet/types.cc: ########## @@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) { } } -std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { +namespace { + +template <typename T> +std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue( + ::std::string_view val) { + std::stringstream result; + T value{}; + std::memcpy(&value, val.data(), sizeof(T)); + result << value; + return result.str(); +} + +std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { + ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal()); + + const auto& decimal_type = + ::arrow::internal::checked_cast<const DecimalLogicalType&>(*logical_type); + const int32_t scale = decimal_type.scale(); + + std::stringstream result; + switch (parquet_type) { + case Type::INT32: { + int32_t int_value{}; + std::memcpy(&int_value, val.data(), sizeof(int32_t)); + ::arrow::Decimal128 decimal_value(int_value); + result << decimal_value.ToString(scale); + break; + } + case Type::INT64: { + int64_t long_value{}; + std::memcpy(&long_value, val.data(), sizeof(int64_t)); + ::arrow::Decimal128 decimal_value(long_value); + result << decimal_value.ToString(scale); + break; + } + case Type::FIXED_LEN_BYTE_ARRAY: + case Type::BYTE_ARRAY: { + auto decimal_result = ::arrow::Decimal128::FromBigEndian( + reinterpret_cast<const uint8_t*>(val.data()), static_cast<int32_t>(val.size())); + if (!decimal_result.ok()) { + throw ParquetException("Failed to parse decimal value: ", + decimal_result.status().message()); + } + result << decimal_result.ValueUnsafe().ToString(scale); + break; + } + default: + throw ParquetException("Unsupported decimal type: ", TypeToString(parquet_type)); + } + + return result.str(); +} + +std::string FormatNonUTF8Value(::std::string_view val) { + if (val.empty()) { + return ""; + } + + std::stringstream result; + result << "0x" << 
std::hex; + for (const auto& c : val) { + result << std::setw(2) << std::setfill('0') + << static_cast<int>(static_cast<unsigned char>(c)); + } + return result.str(); +} + +} // namespace + +std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { std::stringstream result; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } + + // Default handling for non-decimal types or when decimal processing fails const char* bytes = val.data(); switch (parquet_type) { case Type::BOOLEAN: { - bool value{}; - std::memcpy(&value, bytes, sizeof(bool)); - result << value; - break; + return FormatNumericValue<bool>(val); } case Type::INT32: { - int32_t value{}; - std::memcpy(&value, bytes, sizeof(int32_t)); - result << value; - break; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } Review Comment: Good catch! I forgot to clean up these duplicate lines. 
########## cpp/src/parquet/types.cc: ########## @@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) { } } -std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { +namespace { + +template <typename T> +std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue( + ::std::string_view val) { + std::stringstream result; + T value{}; + std::memcpy(&value, val.data(), sizeof(T)); + result << value; + return result.str(); +} + +std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { + ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal()); + + const auto& decimal_type = + ::arrow::internal::checked_cast<const DecimalLogicalType&>(*logical_type); + const int32_t scale = decimal_type.scale(); + + std::stringstream result; + switch (parquet_type) { + case Type::INT32: { + int32_t int_value{}; + std::memcpy(&int_value, val.data(), sizeof(int32_t)); + ::arrow::Decimal128 decimal_value(int_value); + result << decimal_value.ToString(scale); + break; + } + case Type::INT64: { + int64_t long_value{}; + std::memcpy(&long_value, val.data(), sizeof(int64_t)); + ::arrow::Decimal128 decimal_value(long_value); + result << decimal_value.ToString(scale); + break; + } + case Type::FIXED_LEN_BYTE_ARRAY: + case Type::BYTE_ARRAY: { + auto decimal_result = ::arrow::Decimal128::FromBigEndian( + reinterpret_cast<const uint8_t*>(val.data()), static_cast<int32_t>(val.size())); + if (!decimal_result.ok()) { + throw ParquetException("Failed to parse decimal value: ", + decimal_result.status().message()); + } + result << decimal_result.ValueUnsafe().ToString(scale); + break; + } + default: + throw ParquetException("Unsupported decimal type: ", TypeToString(parquet_type)); + } + + return result.str(); +} + +std::string FormatNonUTF8Value(::std::string_view val) { + if (val.empty()) { + return ""; + } + + std::stringstream result; + result << "0x" << 
std::hex; + for (const auto& c : val) { + result << std::setw(2) << std::setfill('0') + << static_cast<int>(static_cast<unsigned char>(c)); + } + return result.str(); +} + +} // namespace + +std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { std::stringstream result; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } + + // Default handling for non-decimal types or when decimal processing fails const char* bytes = val.data(); switch (parquet_type) { case Type::BOOLEAN: { - bool value{}; - std::memcpy(&value, bytes, sizeof(bool)); - result << value; - break; + return FormatNumericValue<bool>(val); } case Type::INT32: { - int32_t value{}; - std::memcpy(&value, bytes, sizeof(int32_t)); - result << value; - break; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } Review Comment: Good catch! I forgot to clean up these duplicate lines. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org