wgtmac commented on code in PR #47427:
URL: https://github.com/apache/arrow/pull/47427#discussion_r2301269510


##########
cpp/src/parquet/properties.h:
##########
@@ -1126,6 +1127,15 @@ class PARQUET_EXPORT ArrowReaderProperties {
   /// Return whether loading statistics as much as possible.
   bool should_load_statistics() const { return should_load_statistics_; }
 
+  /// \brief Set whether infer Decimal32/64 from parquet.
+  ///
+  /// Default is false.
+  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
+    smallest_decimal_enabled_ = smallest_decimal_enable;
+  }
+  /// Return whether infer Decimal32/64 from parquet.

Review Comment:
   ```suggestion
     /// \brief Return whether to infer Decimal32/64 from parquet.
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -1126,6 +1127,15 @@ class PARQUET_EXPORT ArrowReaderProperties {
   /// Return whether loading statistics as much as possible.
   bool should_load_statistics() const { return should_load_statistics_; }
 
+  /// \brief Set whether infer Decimal32/64 from parquet.

Review Comment:
   I think we need to explain two things:
   
   1. If Arrow schema is serialized, this flag does nothing because we will 
always use the serialized decimal type.
   2. If Arrow schema is not serialized, we will choose the smallest width that 
can hold the precision if the flag is enabled.



##########
cpp/src/parquet/arrow/schema_internal.h:
##########
@@ -19,16 +19,19 @@
 
 #include "arrow/result.h"
 #include "arrow/type_fwd.h"
+#include "parquet/properties.h"
 #include "parquet/schema.h"
 
 namespace parquet::arrow {
 
 using ::arrow::Result;
 
-Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& 
logical_type,

Review Comment:
   Why this is gone?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5468,6 +5532,86 @@ TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, 
SingleNullableDecimalColumn)
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
 }
 
+template <typename TestType>
+class TestIntegerAnnotateSmallestDecimalTypeParquetIO
+    : public TestIntegerAnnotateDecimalTypeParquetIO<TestType> {
+ public:
+  void ReadAndCheckSingleDecimalColumnFile(const Array& values) {
+    ArrowReaderProperties properties = default_arrow_reader_properties();
+    properties.set_smallest_decimal_enabled(true);
+
+    std::shared_ptr<Array> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader, properties);
+    this->ReadSingleColumnFile(std::move(reader), &out);
+
+    if (values.type()->id() == out->type()->id()) {
+      AssertArraysEqual(values, *out);
+    } else {
+      auto& expected_values = values;
+      auto& read_values = *out;
+      ASSERT_EQ(expected_values.length(), read_values.length());
+      ASSERT_EQ(expected_values.null_count(), read_values.null_count());
+
+      auto format_decimal = [](const Array& values, int64_t index) {
+        switch (values.type()->id()) {
+          case ::arrow::Type::DECIMAL32:
+            return static_cast<const 
::arrow::Decimal32Array&>(values).FormatValue(index);
+          case ::arrow::Type::DECIMAL64:
+            return static_cast<const 
::arrow::Decimal64Array&>(values).FormatValue(index);
+          case ::arrow::Type::DECIMAL128:
+            return static_cast<const 
::arrow::Decimal128Array&>(values).FormatValue(
+                index);
+          case ::arrow::Type::DECIMAL256:
+            return static_cast<const 
::arrow::Decimal256Array&>(values).FormatValue(
+                index);
+          default:
+            std::string err("Unexpected decimal type: " + 
values.type()->ToString());
+            ADD_FAILURE() << err;
+            return err;
+        }
+      };
+
+      for (int64_t i = 0; i < expected_values.length(); ++i) {
+        ASSERT_EQ(expected_values.IsNull(i), read_values.IsNull(i));
+        if (!expected_values.IsNull(i)) {
+          std::string expected_str = format_decimal(expected_values, i);
+          std::string read_str = format_decimal(read_values, i);
+          ASSERT_EQ(expected_str, read_str);
+        }
+      }
+    }
+  }
+};
+
+using SmallestDecimalTestTypes = ::testing::Types<
+    Decimal32WithPrecisionAndScale<1>, Decimal32WithPrecisionAndScale<5>,
+    Decimal32WithPrecisionAndScale<9>, Decimal64WithPrecisionAndScale<1>,
+    Decimal64WithPrecisionAndScale<5>, Decimal64WithPrecisionAndScale<10>,
+    Decimal64WithPrecisionAndScale<18>, Decimal128WithPrecisionAndScale<1>,
+    Decimal128WithPrecisionAndScale<5>, Decimal128WithPrecisionAndScale<10>,
+    Decimal128WithPrecisionAndScale<18>, Decimal256WithPrecisionAndScale<1>,
+    Decimal256WithPrecisionAndScale<5>, Decimal256WithPrecisionAndScale<10>,
+    Decimal256WithPrecisionAndScale<18>>;
+
+TYPED_TEST_SUITE(TestIntegerAnnotateSmallestDecimalTypeParquetIO,
+                 SmallestDecimalTestTypes);
+
+TYPED_TEST(TestIntegerAnnotateSmallestDecimalTypeParquetIO,
+           SingleNonNullableDecimalColumn) {
+  std::shared_ptr<Array> values;
+  ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
+  ASSERT_NO_FATAL_FAILURE(this->WriteColumn(values));

Review Comment:
   Just to confirm, is `store_schema()` not set in this case?



##########
cpp/src/parquet/arrow/schema_internal.h:
##########
@@ -19,16 +19,19 @@
 
 #include "arrow/result.h"
 #include "arrow/type_fwd.h"
+#include "parquet/properties.h"

Review Comment:
   Can we avoid default parameter here and use forward declaration of 
`ArrowReaderProperties`?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2519,29 +2520,29 @@ struct SerializeFunctor<
     int64_t non_null_count = array.length() - array.null_count();
     int64_t size = non_null_count * ArrowType::kByteWidth;
     scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
-    scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
+    scratch = scratch_buffer->mutable_data();
   }
 
-  template <int byte_width>
   FixedLenByteArray FixDecimalEndianness(const uint8_t* in, int64_t offset) {
-    const auto* u64_in = reinterpret_cast<const int64_t*>(in);
     auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
-    static_assert(byte_width == 16 || byte_width == 32,
-                  "only 16 and 32 byte Decimals supported");
-    if (byte_width == 32) {
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[3]);
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[2]);
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+    if constexpr (std::is_same_v<ArrowType, ::arrow::Decimal32Type>) {
+      const auto* u32_in = reinterpret_cast<const uint32_t*>(in);
+      auto p = reinterpret_cast<uint32_t*>(scratch);
+      *p++ = ::arrow::bit_util::ToBigEndian(u32_in[0]);
+      scratch = reinterpret_cast<uint8_t*>(p);
     } else {
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]);
-      *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]);
+      const auto* u64_in = reinterpret_cast<const uint64_t*>(in);
+      auto p = reinterpret_cast<uint64_t*>(scratch);
+      for (int i = ArrowType::kByteWidth / 8 - 1; i >= 0; i--) {

Review Comment:
   I'm not sure if compilers will be smart enough to unroll the loop or do 
similar optimization. Perhaps we can add some benchmark for Decimal128 just in 
case of regression?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to