[ 
https://issues.apache.org/jira/browse/ARROW-2172?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16370558#comment-16370558
 ] 

ASF GitHub Bot commented on ARROW-2172:
---------------------------------------

xhochy closed pull request #1628: ARROW-2172: [C++/Python] Fix converting from Numpy array with non-natural stride
URL: https://github.com/apache/arrow/pull/1628
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 522bf5174..23418ad92 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -554,12 +554,22 @@ Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* p
   return Status::OK();
 }
 
-template <typename T, typename T2>
-void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) {
+template <typename T>
+void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
+                         T* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  for (int64_t i = 0; i < length; ++i) {
+    memcpy(output_data + i, input_data, sizeof(T));
+    input_data += stride;
+  }
+}
+
+template <typename T>
+void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
   // Passing input_data as non-const is a concession to PyObject*
   int64_t j = 0;
   for (int64_t i = 0; i < length; ++i) {
-    output_data[i] = static_cast<T2>(input_data[j]);
+    output_data[i] = input_data[j];
     j += stride;
   }
 }
@@ -571,13 +581,19 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po
   using T = typename traits::T;
 
   // Strided, must copy into new contiguous memory
-  const int64_t stride = PyArray_STRIDES(arr)[0];
-  const int64_t stride_elements = stride / sizeof(T);
-
   auto new_buffer = std::make_shared<PoolBuffer>(pool);
   RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length));
-  CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr)), length, stride_elements,
-              reinterpret_cast<T*>(new_buffer->mutable_data()));
+
+  const int64_t stride = PyArray_STRIDES(arr)[0];
+  if (stride % sizeof(T) == 0) {
+    const int64_t stride_elements = stride / sizeof(T);
+    CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length, stride_elements,
+                       reinterpret_cast<T*>(new_buffer->mutable_data()));
+  } else {
+    CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length, stride,
+                        reinterpret_cast<T*>(new_buffer->mutable_data()));
+  }
+
   *out = new_buffer;
   return Status::OK();
 }
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 95137ffb2..9bd3c77d2 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1582,6 +1582,19 @@ def test_convert_empty_table(self):
         arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
         tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
 
+    def test_non_natural_stride(self):
+        """
+        ARROW-2172: converting from a Numpy array with a stride that's
+        not a multiple of itemsize.
+        """
+        dtype = np.dtype([('x', np.int32), ('y', np.int16)])
+        data = np.array([(42, -1), (-43, 2)], dtype=dtype)
+        assert data.strides == (6,)
+        arr = pa.array(data['x'], type=pa.int32())
+        assert arr.to_pylist() == [42, -43]
+        arr = pa.array(data['y'], type=pa.int16())
+        assert arr.to_pylist() == [-1, 2]
+
 
 def _fully_loaded_dataframe_example():
     from distutils.version import LooseVersion


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Incorrect conversion from Numpy array when stride % itemsize != 0
> --------------------------------------------------------------------------
>
>                 Key: ARROW-2172
>                 URL: https://issues.apache.org/jira/browse/ARROW-2172
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> In the example below, the input array has a stride that's not a multiple of 
> the itemsize:
> {code:python}
> >>> data = np.array([(42, True), (43, False)],
> ...:                dtype=[('x', np.int32), ('y', np.bool_)])
> >>> data['x']
> array([42, 43], dtype=int32)
> >>> pa.array(data['x'], type=pa.int32())
> <pyarrow.lib.Int32Array object at 0x7ff60a8415e8>
> [
>   42,
>   11009
> ]
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to