wgtmac commented on code in PR #47190: URL: https://github.com/apache/arrow/pull/47190#discussion_r2241524724
########## cpp/src/parquet/types.cc: ########## @@ -96,57 +98,125 @@ bool PageCanUseChecksum(PageType::type pageType) { } } -std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { +namespace { + +template <typename T> +std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue( + ::std::string_view val) { + std::stringstream result; + T value{}; + std::memcpy(&value, val.data(), sizeof(T)); + result << value; + return result.str(); +} + +std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { + ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal()); + + const auto& decimal_type = + ::arrow::internal::checked_cast<const DecimalLogicalType&>(*logical_type); + const int32_t scale = decimal_type.scale(); + + std::stringstream result; + switch (parquet_type) { + case Type::INT32: { + int32_t int_value{}; + std::memcpy(&int_value, val.data(), sizeof(int32_t)); + ::arrow::Decimal128 decimal_value(int_value); + result << decimal_value.ToString(scale); + break; + } + case Type::INT64: { + int64_t long_value{}; + std::memcpy(&long_value, val.data(), sizeof(int64_t)); + ::arrow::Decimal128 decimal_value(long_value); + result << decimal_value.ToString(scale); + break; + } + case Type::FIXED_LEN_BYTE_ARRAY: + case Type::BYTE_ARRAY: { + auto decimal_result = ::arrow::Decimal128::FromBigEndian( + reinterpret_cast<const uint8_t*>(val.data()), static_cast<int32_t>(val.size())); + if (!decimal_result.ok()) { + throw ParquetException("Failed to parse decimal value: ", + decimal_result.status().message()); + } + result << decimal_result.ValueUnsafe().ToString(scale); + break; + } + default: + throw ParquetException("Unsupported decimal type: ", TypeToString(parquet_type)); + } + + return result.str(); +} + +std::string FormatNonUTF8Value(::std::string_view val) { + if (val.empty()) { + return ""; + } + std::stringstream result; + result << "0x" << 
std::hex; + for (const auto& c : val) { + result << std::setw(2) << std::setfill('0') + << static_cast<int>(static_cast<unsigned char>(c)); + } + return result.str(); +} + +} // namespace +std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, + const std::shared_ptr<const LogicalType>& logical_type) { const char* bytes = val.data(); switch (parquet_type) { case Type::BOOLEAN: { - bool value{}; - std::memcpy(&value, bytes, sizeof(bool)); - result << value; - break; + return FormatNumericValue<bool>(val); } case Type::INT32: { - int32_t value{}; - std::memcpy(&value, bytes, sizeof(int32_t)); - result << value; - break; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } + return FormatNumericValue<int32_t>(val); } case Type::INT64: { - int64_t value{}; - std::memcpy(&value, bytes, sizeof(int64_t)); - result << value; - break; + if (logical_type != nullptr && logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } + return FormatNumericValue<int64_t>(val); } case Type::DOUBLE: { - double value{}; - std::memcpy(&value, bytes, sizeof(double)); - result << value; - break; + return FormatNumericValue<double>(val); } case Type::FLOAT: { - float value{}; - std::memcpy(&value, bytes, sizeof(float)); - result << value; - break; + return FormatNumericValue<float>(val); } case Type::INT96: { + std::stringstream result; std::array<int32_t, 3> values{}; std::memcpy(values.data(), bytes, 3 * sizeof(int32_t)); result << values[0] << " " << values[1] << " " << values[2]; - break; + return result.str(); } case Type::BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: { - result << val; - break; + if (logical_type != nullptr) { + if (logical_type->is_decimal()) { + return FormatDecimalValue(parquet_type, val, logical_type); + } + if (logical_type->is_string()) { + return std::string(val); + } Review Comment: Added. 
I think we are still missing format functions for other logical types, such as timestamp and geometry. We can add them as needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org