[ 
https://issues.apache.org/jira/browse/ARROW-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16269106#comment-16269106
 ] 

ASF GitHub Bot commented on ARROW-1735:
---------------------------------------

wesm closed pull request #1369: ARROW-1735: [C++] Test CastKernel writing into 
output array with non-zero offset
URL: https://github.com/apache/arrow/pull/1369
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 4b1fabfdc..0b235cc19 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -103,7 +103,7 @@ static inline std::shared_ptr<ArrayData> SliceData(const 
ArrayData& data, int64_
   length = std::min(data.length - offset, length);
   offset += data.offset;
 
-  auto new_data = data.ShallowCopy();
+  auto new_data = data.Copy();
   new_data->length = length;
   new_data->offset = offset;
   new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0;
@@ -482,14 +482,14 @@ DictionaryArray::DictionaryArray(const 
std::shared_ptr<DataType>& type,
     : dict_type_(static_cast<const DictionaryType*>(type.get())) {
   DCHECK_EQ(type->id(), Type::DICTIONARY);
   DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id());
-  auto data = indices->data()->ShallowCopy();
+  auto data = indices->data()->Copy();
   data->type = type;
   SetData(data);
 }
 
 void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
-  auto indices_data = data_->ShallowCopy();
+  auto indices_data = data_->Copy();
   indices_data->type = dict_type_->index_type();
   std::shared_ptr<Array> result;
   indices_ = MakeArray(indices_data);
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ec5381d6e..ebe54adcb 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -143,9 +143,14 @@ struct ARROW_EXPORT ArrayData {
     return *this;
   }
 
-  std::shared_ptr<ArrayData> ShallowCopy() const {
-    return std::make_shared<ArrayData>(*this);
-  }
+  std::shared_ptr<ArrayData> Copy() const { return 
std::make_shared<ArrayData>(*this); }
+
+#ifndef ARROW_NO_DEPRECATED_API
+
+  // Deprecated since 0.8.0
+  std::shared_ptr<ArrayData> ShallowCopy() const { return Copy(); }
+
+#endif
 
   std::shared_ptr<DataType> type;
   int64_t length;
diff --git a/cpp/src/arrow/compute/compute-test.cc 
b/cpp/src/arrow/compute/compute-test.cc
index d5158978c..c73bfa309 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -709,6 +709,66 @@ TEST_F(TestCast, PreallocatedMemory) {
   ASSERT_ARRAYS_EQUAL(*expected, *result);
 }
 
+template <typename InType, typename InT, typename OutType, typename OutT>
+void CheckOffsetOutputCase(FunctionContext* ctx, const 
std::shared_ptr<DataType>& in_type,
+                           const vector<InT>& in_values,
+                           const std::shared_ptr<DataType>& out_type,
+                           const vector<OutT>& out_values) {
+  using OutTraits = TypeTraits<OutType>;
+
+  CastOptions options;
+
+  const int64_t length = static_cast<int64_t>(in_values.size());
+
+  shared_ptr<Array> arr, expected;
+  ArrayFromVector<InType, InT>(in_type, in_values, &arr);
+  ArrayFromVector<OutType, OutT>(out_type, out_values, &expected);
+
+  shared_ptr<Buffer> out_buffer;
+  ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer));
+
+  std::unique_ptr<UnaryKernel> kernel;
+  ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel));
+
+  const int64_t first_half = length / 2;
+
+  auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer});
+  auto out_second_data = out_data->Copy();
+  out_second_data->offset = first_half;
+
+  Datum out_first(out_data);
+  Datum out_second(out_second_data);
+
+  // Cast each bit
+  ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first));
+  ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second));
+
+  shared_ptr<Array> result = MakeArray(out_data);
+
+  ASSERT_ARRAYS_EQUAL(*expected, *result);
+}
+
+TEST_F(TestCast, OffsetOutputBuffer) {
+  // ARROW-1735
+  vector<int32_t> v1 = {0, 10000, 2000, 1000, 0};
+  vector<int64_t> e1 = {0, 10000, 2000, 1000, 0};
+
+  auto in_type = int32();
+  auto out_type = int64();
+  CheckOffsetOutputCase<Int32Type, int32_t, Int64Type, int64_t>(&this->ctx_, 
in_type, v1,
+                                                                out_type, e1);
+
+  vector<bool> e2 = {false, true, true, true, false};
+
+  out_type = boolean();
+  CheckOffsetOutputCase<Int32Type, int32_t, BooleanType, bool>(&this->ctx_, 
in_type, v1,
+                                                               boolean(), e2);
+
+  vector<int16_t> e3 = {0, 10000, 2000, 1000, 0};
+  CheckOffsetOutputCase<Int32Type, int32_t, Int16Type, int16_t>(&this->ctx_, 
in_type, v1,
+                                                                int16(), e3);
+}
+
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
 
diff --git a/cpp/src/arrow/compute/kernels/cast.cc 
b/cpp/src/arrow/compute/kernels/cast.cc
index d48d66992..465be958c 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -124,12 +124,7 @@ template <typename T>
 struct CastFunctor<T, NullType, typename std::enable_if<
                                     std::is_base_of<FixedWidthType, 
T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
-                  const ArrayData& input, ArrayData* output) {
-    // Simply initialize data to 0
-    auto buf = output->buffers[1];
-    DCHECK_EQ(output->offset, 0);
-    memset(buf->mutable_data(), 0, buf->size());
-  }
+                  const ArrayData& input, ArrayData* output) {}
 };
 
 template <>
@@ -199,14 +194,19 @@ struct CastFunctor<O, I, typename 
std::enable_if<std::is_same<BooleanType, O>::v
                                                  !std::is_same<O, 
I>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    using in_type = typename I::c_type;
-    DCHECK_EQ(output->offset, 0);
+    auto in_data = GetValues<typename I::c_type>(input, 1);
+    internal::BitmapWriter writer(output->buffers[1]->mutable_data(), 
output->offset,
+                                  input.length);
 
-    const in_type* in_data = GetValues<in_type>(input, 1);
-    uint8_t* out_data = GetMutableValues<uint8_t>(output, 1);
     for (int64_t i = 0; i < input.length; ++i) {
-      BitUtil::SetBitTo(out_data, i, (*in_data++) != 0);
+      if (*in_data++ != 0) {
+        writer.Set();
+      } else {
+        writer.Clear();
+      }
+      writer.Next();
     }
+    writer.Finish();
   }
 };
 
@@ -217,7 +217,6 @@ struct CastFunctor<O, I,
                   const ArrayData& input, ArrayData* output) {
     using in_type = typename I::c_type;
     using out_type = typename O::c_type;
-    DCHECK_EQ(output->offset, 0);
 
     auto in_offset = input.offset;
 
@@ -475,9 +474,10 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, 
const Array& indices,
 
   const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
 
-  uint8_t* out = output->buffers[1]->mutable_data();
   int32_t byte_width =
       static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
+
+  uint8_t* out = output->buffers[1]->mutable_data() + byte_width * 
output->offset;
   for (int64_t i = 0; i < indices.length(); ++i) {
     if (valid_bits_reader.IsSet()) {
       const uint8_t* value = dictionary.Value(in[i]);
@@ -493,7 +493,7 @@ struct CastFunctor<
     typename std::enable_if<std::is_base_of<FixedSizeBinaryType, 
T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const 
DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -565,7 +565,7 @@ struct CastFunctor<T, DictionaryType,
                    typename std::enable_if<std::is_base_of<BinaryType, 
T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const 
DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -605,12 +605,10 @@ struct CastFunctor<T, DictionaryType,
 template <typename IndexType, typename c_type>
 void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
                                c_type* out) {
-  using index_c_type = typename IndexType::c_type;
-
   internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), 
indices.offset(),
                                            indices.length());
 
-  const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
+  auto in = GetValues<typename IndexType::c_type>(*indices.data(), 1);
   for (int64_t i = 0; i < indices.length(); ++i) {
     if (valid_bits_reader.IsSet()) {
       out[i] = dictionary[in[i]];
@@ -627,7 +625,7 @@ struct CastFunctor<T, DictionaryType,
                   const ArrayData& input, ArrayData* output) {
     using c_type = typename T::c_type;
 
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const 
DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -638,7 +636,7 @@ struct CastFunctor<T, DictionaryType,
 
     const c_type* dictionary = GetValues<c_type>(*type.dictionary()->data(), 
1);
 
-    auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+    auto out = GetMutableValues<c_type>(output, 1);
     const Array& indices = *dict_array.indices();
     switch (indices.type()->id()) {
       case Type::INT8:
diff --git a/cpp/src/arrow/python/arrow_to_python.h 
b/cpp/src/arrow/python/arrow_to_python.h
index 9440ffb32..02a22f07d 100644
--- a/cpp/src/arrow/python/arrow_to_python.h
+++ b/cpp/src/arrow/python/arrow_to_python.h
@@ -51,8 +51,8 @@ Status ReadSerializedObject(io::RandomAccessFile* src, 
SerializedPyObject* out);
 /// \brief Reconstruct SerializedPyObject from representation produced by
 /// SerializedPyObject::GetComponents.
 ///
-/// \param[in] num_tensors
-/// \param[in] num_buffers
+/// \param[in] num_tensors number of tensors in the object
+/// \param[in] num_buffers number of buffers in the object
 /// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 +
 /// num_tensors * 2 + num_buffers in length
 /// \param[out] out the reconstructed object
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 0c0d1a9b3..0b1124d30 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -765,7 +765,7 @@ Status NumPyConverter::ConvertObjectStrings() {
   // If we saw PyBytes, convert everything to BinaryArray
   if (global_have_bytes) {
     for (size_t i = 0; i < out_arrays_.size(); ++i) {
-      auto binary_data = out_arrays_[i]->data()->ShallowCopy();
+      auto binary_data = out_arrays_[i]->data()->Copy();
       binary_data->type = ::arrow::binary();
       out_arrays_[i] = std::make_shared<BinaryArray>(binary_data);
     }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Cast kernels cannot write into sliced output array
> --------------------------------------------------------
>
>                 Key: ARROW-1735
>                 URL: https://issues.apache.org/jira/browse/ARROW-1735
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>            Reporter: Wes McKinney
>            Assignee: Wes McKinney
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> If the offset is non-zero, the results will be written into the wrong 
> location:
> https://github.com/apache/arrow/blob/master/cpp/src/arrow/compute/cast.cc#L218



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to