[ 
https://issues.apache.org/jira/browse/PARQUET-1160?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16580409#comment-16580409
 ] 

Ted Haining commented on PARQUET-1160:
--------------------------------------

The change needed to support BYTE_ARRAY-backed decimals (shown as a diff 
against apache-parquet-cpp-1.4.0) looks roughly like this:
{noformat}
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index bd68ec3..7ae905a 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -1055,6 +1055,68 @@ struct TransferFunctor<::arrow::Decimal128Type, 
FLBAType> {
}
};

+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal128Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two 
integers
+/// representing the high and low bits of each decimal value.
+template <>
+struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> {
+ Status operator()(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<::arrow::DataType>& type,
+ std::shared_ptr<Array>* out) {
+ DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL);
+
+ // Finish the built data into a temporary array
+ std::shared_ptr<Array> array;
+ RETURN_NOT_OK(reader->builder()->Finish(&array));
+ const auto& binary_array =
+ static_cast<const ::arrow::BinaryArray&>(*array);
+
+ const int64_t length = binary_array.length();
+
+ const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+
+    // convert each BinaryArray value to valid decimal bytes
+    for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+
+        int32_t record_len = 0;
+        const uint8_t *record_loc = binary_array.GetValue(i, &record_len);
+
+        if ((record_len < 0) || (record_len > type_length)) {
+            return Status::Invalid("Invalid BYTE_ARRAY size");
+        }
+
+        auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+        out_ptr_view[0] = 0;
+        out_ptr_view[1] = 0;
+
+
+        if ((null_count > 0) && !binary_array.IsNull(i)) {
+            RawBytesToDecimalBytes(record_loc, record_len, out_ptr);
+        } else if (null_count <= 0) {
+            RawBytesToDecimalBytes(record_loc, record_len, out_ptr);
+        }
+    }
+
+    *out = std::make_shared<::arrow::Decimal128Array>(
+        type, length, data, binary_array.null_bitmap(), null_count);
+
+    return Status::OK();
+  }
+};
+
+
 /// \brief Convert an Int32 or Int64 array into a Decimal128Array
 /// The parquet spec allows systems to write decimals in int32, int64 if the 
values are
 /// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
@@ -1193,12 +1255,15 @@ Status PrimitiveImpl::NextBatch(int64_t 
records_to_read, std::shared_ptr<Array>*
         case ::parquet::Type::INT64: {
           TRANSFER_DATA(::arrow::Decimal128Type, Int64Type);
         } break;
+        case ::parquet::Type::BYTE_ARRAY: {
+          TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType);
+        } break;
         case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
           TRANSFER_DATA(::arrow::Decimal128Type, FLBAType);
         } break;
         default:
           return Status::Invalid(
-              "Physical type for decimal must be int32, int64, or fixed length 
binary");
+              "Physical type for decimal must be int32, int64, byte array, or 
fixed length binary");
       }
     } break;
     case ::arrow::Type::TIMESTAMP: {
{noformat}

> [C++] Implement BYTE_ARRAY-backed Decimal reads
> -----------------------------------------------
>
>                 Key: PARQUET-1160
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1160
>             Project: Parquet
>          Issue Type: Task
>          Components: parquet-cpp
>    Affects Versions: cpp-1.3.0
>            Reporter: Phillip Cloud
>            Assignee: Phillip Cloud
>            Priority: Major
>         Attachments: 20180726193815980.parquet
>
>
> These are valid in the parquet spec, but it seems like no system in use today 
> implements a writer for this type.
> What systems support writing Decimals with this underlying type?



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to