wgtmac commented on code in PR #47190:
URL: https://github.com/apache/arrow/pull/47190#discussion_r2241524724


##########
cpp/src/parquet/types.cc:
##########
@@ -96,57 +98,125 @@ bool PageCanUseChecksum(PageType::type pageType) {
   }
 }
 
-std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
+namespace {
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
+    ::std::string_view val) {
+  std::stringstream result;
+  T value{};
+  std::memcpy(&value, val.data(), sizeof(T));
+  result << value;
+  return result.str();
+}
+
+std::string FormatDecimalValue(Type::type parquet_type, ::std::string_view val,
+                               const std::shared_ptr<const LogicalType>& logical_type) {
+  ARROW_DCHECK(logical_type != nullptr && logical_type->is_decimal());
+
+  const auto& decimal_type =
+      ::arrow::internal::checked_cast<const DecimalLogicalType&>(*logical_type);
+  const int32_t scale = decimal_type.scale();
+
+  std::stringstream result;
+  switch (parquet_type) {
+    case Type::INT32: {
+      int32_t int_value{};
+      std::memcpy(&int_value, val.data(), sizeof(int32_t));
+      ::arrow::Decimal128 decimal_value(int_value);
+      result << decimal_value.ToString(scale);
+      break;
+    }
+    case Type::INT64: {
+      int64_t long_value{};
+      std::memcpy(&long_value, val.data(), sizeof(int64_t));
+      ::arrow::Decimal128 decimal_value(long_value);
+      result << decimal_value.ToString(scale);
+      break;
+    }
+    case Type::FIXED_LEN_BYTE_ARRAY:
+    case Type::BYTE_ARRAY: {
+      auto decimal_result = ::arrow::Decimal128::FromBigEndian(
+          reinterpret_cast<const uint8_t*>(val.data()), static_cast<int32_t>(val.size()));
+      if (!decimal_result.ok()) {
+        throw ParquetException("Failed to parse decimal value: ",
+                               decimal_result.status().message());
+      }
+      result << decimal_result.ValueUnsafe().ToString(scale);
+      break;
+    }
+    default:
+      throw ParquetException("Unsupported decimal type: ", TypeToString(parquet_type));
+  }
+
+  return result.str();
+}
+
+std::string FormatNonUTF8Value(::std::string_view val) {
+  if (val.empty()) {
+    return "";
+  }
+
   std::stringstream result;
+  result << "0x" << std::hex;
+  for (const auto& c : val) {
+    result << std::setw(2) << std::setfill('0')
+           << static_cast<int>(static_cast<unsigned char>(c));
+  }
+  return result.str();
+}
+
+}  // namespace
 
+std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
+                            const std::shared_ptr<const LogicalType>& logical_type) {
   const char* bytes = val.data();
   switch (parquet_type) {
     case Type::BOOLEAN: {
-      bool value{};
-      std::memcpy(&value, bytes, sizeof(bool));
-      result << value;
-      break;
+      return FormatNumericValue<bool>(val);
     }
     case Type::INT32: {
-      int32_t value{};
-      std::memcpy(&value, bytes, sizeof(int32_t));
-      result << value;
-      break;
+      if (logical_type != nullptr && logical_type->is_decimal()) {
+        return FormatDecimalValue(parquet_type, val, logical_type);
+      }
+      return FormatNumericValue<int32_t>(val);
     }
     case Type::INT64: {
-      int64_t value{};
-      std::memcpy(&value, bytes, sizeof(int64_t));
-      result << value;
-      break;
+      if (logical_type != nullptr && logical_type->is_decimal()) {
+        return FormatDecimalValue(parquet_type, val, logical_type);
+      }
+      return FormatNumericValue<int64_t>(val);
     }
     case Type::DOUBLE: {
-      double value{};
-      std::memcpy(&value, bytes, sizeof(double));
-      result << value;
-      break;
+      return FormatNumericValue<double>(val);
     }
     case Type::FLOAT: {
-      float value{};
-      std::memcpy(&value, bytes, sizeof(float));
-      result << value;
-      break;
+      return FormatNumericValue<float>(val);
     }
     case Type::INT96: {
+      std::stringstream result;
       std::array<int32_t, 3> values{};
       std::memcpy(values.data(), bytes, 3 * sizeof(int32_t));
       result << values[0] << " " << values[1] << " " << values[2];
-      break;
+      return result.str();
     }
     case Type::BYTE_ARRAY:
     case Type::FIXED_LEN_BYTE_ARRAY: {
-      result << val;
-      break;
+      if (logical_type != nullptr) {
+        if (logical_type->is_decimal()) {
+          return FormatDecimalValue(parquet_type, val, logical_type);
+        }
+        if (logical_type->is_string()) {
+          return std::string(val);
+        }

Review Comment:
   Added. I think we are still missing format functions for other logical types 
like timestamp, geometry, etc. We can add them as needed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to