mapleFU commented on code in PR #47190:
URL: https://github.com/apache/arrow/pull/47190#discussion_r2240223177
##########
cpp/src/parquet/types.cc:
##########
@@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) {
}
}
-std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
+namespace {
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
+ ::std::string_view val) {
+ std::stringstream result;
+ T value{};
+ std::memcpy(&value, val.data(), sizeof(T));
+ result << value;
+ return result.str();
+}
+
+std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
+ ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
+
+ const auto& decimal_type =
+ ::arrow::internal::checked_cast<const
DecimalLogicalType&>(*logical_type);
+ const int32_t scale = decimal_type.scale();
+
+ std::stringstream result;
+ switch (parquet_type) {
+ case Type::INT32: {
+ int32_t int_value{};
+ std::memcpy(&int_value, val.data(), sizeof(int32_t));
+ ::arrow::Decimal128 decimal_value(int_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::INT64: {
+ int64_t long_value{};
+ std::memcpy(&long_value, val.data(), sizeof(int64_t));
+ ::arrow::Decimal128 decimal_value(long_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ case Type::BYTE_ARRAY: {
+ auto decimal_result = ::arrow::Decimal128::FromBigEndian(
+ reinterpret_cast<const uint8_t*>(val.data()),
static_cast<int32_t>(val.size()));
+ if (!decimal_result.ok()) {
+ throw ParquetException("Failed to parse decimal value: ",
+ decimal_result.status().message());
+ }
+ result << decimal_result.ValueUnsafe().ToString(scale);
+ break;
+ }
+ default:
+ throw ParquetException("Unsupported decimal type: ",
TypeToString(parquet_type));
+ }
+
+ return result.str();
+}
+
+std::string FormatNonUTF8Value(::std::string_view val) {
+ if (val.empty()) {
+ return "";
+ }
+
+ std::stringstream result;
+ result << "0x" << std::hex;
+ for (const auto& c : val) {
+ result << std::setw(2) << std::setfill('0')
+ << static_cast<int>(static_cast<unsigned char>(c));
+ }
+ return result.str();
+}
+
+} // namespace
+
+std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
std::stringstream result;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
+ }
+
+ // Default handling for non-decimal types or when decimal processing fails
const char* bytes = val.data();
switch (parquet_type) {
case Type::BOOLEAN: {
- bool value{};
- std::memcpy(&value, bytes, sizeof(bool));
- result << value;
- break;
+ return FormatNumericValue<bool>(val);
Review Comment:
Is `bool` a numeric value?
##########
cpp/src/parquet/types.h:
##########
@@ -853,8 +853,9 @@ PARQUET_EXPORT std::string TypeToString(Type::type t);
PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);
-PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
- ::std::string_view val);
+PARQUET_EXPORT std::string FormatStatValue(
+ Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>& logical_type = NULLPTR);
Review Comment:
Is this `= NULLPTR` default there for backward compatibility?
##########
cpp/src/parquet/types.cc:
##########
@@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) {
}
}
-std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
+namespace {
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
+ ::std::string_view val) {
+ std::stringstream result;
+ T value{};
+ std::memcpy(&value, val.data(), sizeof(T));
+ result << value;
+ return result.str();
+}
+
+std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
+ ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
Review Comment:
Could the passed-in argument just be `const DecimalLogicalType&`? (The current
form also LGTM.)
##########
cpp/src/parquet/types.cc:
##########
@@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) {
}
}
-std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
+namespace {
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
+ ::std::string_view val) {
+ std::stringstream result;
+ T value{};
+ std::memcpy(&value, val.data(), sizeof(T));
+ result << value;
+ return result.str();
+}
+
+std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
+ ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
+
+ const auto& decimal_type =
+ ::arrow::internal::checked_cast<const
DecimalLogicalType&>(*logical_type);
+ const int32_t scale = decimal_type.scale();
+
+ std::stringstream result;
+ switch (parquet_type) {
+ case Type::INT32: {
+ int32_t int_value{};
+ std::memcpy(&int_value, val.data(), sizeof(int32_t));
+ ::arrow::Decimal128 decimal_value(int_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::INT64: {
+ int64_t long_value{};
+ std::memcpy(&long_value, val.data(), sizeof(int64_t));
+ ::arrow::Decimal128 decimal_value(long_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ case Type::BYTE_ARRAY: {
+ auto decimal_result = ::arrow::Decimal128::FromBigEndian(
+ reinterpret_cast<const uint8_t*>(val.data()),
static_cast<int32_t>(val.size()));
+ if (!decimal_result.ok()) {
+ throw ParquetException("Failed to parse decimal value: ",
+ decimal_result.status().message());
+ }
+ result << decimal_result.ValueUnsafe().ToString(scale);
+ break;
+ }
+ default:
+ throw ParquetException("Unsupported decimal type: ",
TypeToString(parquet_type));
+ }
+
+ return result.str();
+}
+
+std::string FormatNonUTF8Value(::std::string_view val) {
+ if (val.empty()) {
+ return "";
+ }
+
+ std::stringstream result;
+ result << "0x" << std::hex;
+ for (const auto& c : val) {
+ result << std::setw(2) << std::setfill('0')
+ << static_cast<int>(static_cast<unsigned char>(c));
+ }
+ return result.str();
+}
+
+} // namespace
+
+std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
std::stringstream result;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
+ }
+
+ // Default handling for non-decimal types or when decimal processing fails
const char* bytes = val.data();
switch (parquet_type) {
case Type::BOOLEAN: {
- bool value{};
- std::memcpy(&value, bytes, sizeof(bool));
- result << value;
- break;
+ return FormatNumericValue<bool>(val);
}
case Type::INT32: {
- int32_t value{};
- std::memcpy(&value, bytes, sizeof(int32_t));
- result << value;
- break;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
+ }
Review Comment:
Is this a duplicate of L175?
##########
cpp/src/parquet/types.cc:
##########
@@ -96,40 +98,107 @@ bool PageCanUseChecksum(PageType::type pageType) {
}
}
-std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
+namespace {
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
+ ::std::string_view val) {
+ std::stringstream result;
+ T value{};
+ std::memcpy(&value, val.data(), sizeof(T));
+ result << value;
+ return result.str();
+}
+
+std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
+ ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
+
+ const auto& decimal_type =
+ ::arrow::internal::checked_cast<const
DecimalLogicalType&>(*logical_type);
+ const int32_t scale = decimal_type.scale();
+
+ std::stringstream result;
+ switch (parquet_type) {
+ case Type::INT32: {
+ int32_t int_value{};
+ std::memcpy(&int_value, val.data(), sizeof(int32_t));
+ ::arrow::Decimal128 decimal_value(int_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::INT64: {
+ int64_t long_value{};
+ std::memcpy(&long_value, val.data(), sizeof(int64_t));
+ ::arrow::Decimal128 decimal_value(long_value);
+ result << decimal_value.ToString(scale);
+ break;
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ case Type::BYTE_ARRAY: {
+ auto decimal_result = ::arrow::Decimal128::FromBigEndian(
+ reinterpret_cast<const uint8_t*>(val.data()),
static_cast<int32_t>(val.size()));
+ if (!decimal_result.ok()) {
+ throw ParquetException("Failed to parse decimal value: ",
+ decimal_result.status().message());
+ }
+ result << decimal_result.ValueUnsafe().ToString(scale);
+ break;
+ }
+ default:
+ throw ParquetException("Unsupported decimal type: ",
TypeToString(parquet_type));
+ }
+
+ return result.str();
+}
+
+std::string FormatNonUTF8Value(::std::string_view val) {
+ if (val.empty()) {
+ return "";
+ }
+
+ std::stringstream result;
+ result << "0x" << std::hex;
+ for (const auto& c : val) {
+ result << std::setw(2) << std::setfill('0')
+ << static_cast<int>(static_cast<unsigned char>(c));
+ }
+ return result.str();
+}
+
+} // namespace
+
+std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
+ const std::shared_ptr<const LogicalType>&
logical_type) {
std::stringstream result;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
+ }
+
+ // Default handling for non-decimal types or when decimal processing fails
const char* bytes = val.data();
switch (parquet_type) {
case Type::BOOLEAN: {
- bool value{};
- std::memcpy(&value, bytes, sizeof(bool));
- result << value;
- break;
+ return FormatNumericValue<bool>(val);
}
case Type::INT32: {
- int32_t value{};
- std::memcpy(&value, bytes, sizeof(int32_t));
- result << value;
- break;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
+ }
+ return FormatNumericValue<int32_t>(val);
}
case Type::INT64: {
- int64_t value{};
- std::memcpy(&value, bytes, sizeof(int64_t));
- result << value;
- break;
+ if (logical_type != nullptr && logical_type->is_decimal()) {
+ return FormatDecimalValue(parquet_type, val, logical_type);
Review Comment:
ditto
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]