This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 58b3109  ARROW-13104: [C++] Fix unsafe cast in ByteStreamSplit 
implementation
58b3109 is described below

commit 58b310971665af3b80e83631db0a9874c19e8c1e
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jun 30 11:12:22 2021 +0200

    ARROW-13104: [C++] Fix unsafe cast in ByteStreamSplit implementation
    
    Closes #10596 from pitrou/ARROW-13104-unsafe-cast
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/encoding.cc | 49 ++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index cc1e262..6e8f7ee 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -861,20 +861,31 @@ class ByteStreamSplitEncoder : public EncoderImpl, 
virtual public TypedEncoder<D
                  int64_t valid_bits_offset) override;
 
  protected:
-  ::arrow::TypedBufferBuilder<T> values_;
+  template <typename ArrowType>
+  void PutImpl(const ::arrow::Array& values) {
+    if (values.type_id() != ArrowType::type_id) {
+      throw ParquetException(std::string() + "direct put to " + 
ArrowType::type_name() +
+                             " from " + values.type()->ToString() + " not 
supported");
+    }
+    const auto& data = *values.data();
+    PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
+              static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), 
data.offset);
+  }
 
- private:
-  void PutArrowArray(const ::arrow::Array& values);
+  ::arrow::BufferBuilder sink_;
+  int64_t num_values_in_buffer_;
 };
 
 template <typename DType>
 ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* 
descr,
                                                       ::arrow::MemoryPool* 
pool)
-    : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool), values_{pool} {}
+    : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
+      sink_{pool},
+      num_values_in_buffer_{0} {}
 
 template <typename DType>
 int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
-  return values_.length() * sizeof(T);
+  return sink_.length();
 }
 
 template <typename DType>
@@ -882,34 +893,30 @@ std::shared_ptr<Buffer> 
ByteStreamSplitEncoder<DType>::FlushValues() {
   std::shared_ptr<ResizableBuffer> output_buffer =
       AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
   uint8_t* output_buffer_raw = output_buffer->mutable_data();
-  const size_t num_values = values_.length();
-  const uint8_t* raw_values = reinterpret_cast<const uint8_t*>(values_.data());
-  ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values,
+  const uint8_t* raw_values = sink_.data();
+  ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, 
num_values_in_buffer_,
                                                     output_buffer_raw);
-  values_.Reset();
+  sink_.Reset();
+  num_values_in_buffer_ = 0;
   return std::move(output_buffer);
 }
 
 template <typename DType>
 void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
-  if (num_values > 0) PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values));
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::Put(const ::arrow::Array& values) {
-  PutArrowArray(values);
+  if (num_values > 0) {
+    PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+    num_values_in_buffer_ += num_values;
+  }
 }
 
 template <>
-void ByteStreamSplitEncoder<FloatType>::PutArrowArray(const ::arrow::Array& 
values) {
-  DirectPutImpl<::arrow::FloatArray>(values,
-                                     
reinterpret_cast<::arrow::BufferBuilder*>(&values_));
+void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
+  PutImpl<::arrow::FloatType>(values);
 }
 
 template <>
-void ByteStreamSplitEncoder<DoubleType>::PutArrowArray(const ::arrow::Array& 
values) {
-  DirectPutImpl<::arrow::DoubleArray>(
-      values, reinterpret_cast<::arrow::BufferBuilder*>(&values_));
+void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+  PutImpl<::arrow::DoubleType>(values);
 }
 
 template <typename DType>

Reply via email to