[ https://issues.apache.org/jira/browse/ARROW-2099?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16394772#comment-16394772 ]

ASF GitHub Bot commented on ARROW-2099:
---------------------------------------

wesm closed pull request #1734: ARROW-2099: [Python] Add safe option to 
DictionaryArray.from_arrays to do boundschecking of indices by default
URL: https://github.com/apache/arrow/pull/1734
 
 
   

This PR was merged from a forked repository (a "foreign" pull request).
Because GitHub hides the original diff of such a pull request once it is
merged, the diff is reproduced below for the sake of provenance:

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 321809fa2d..2ea131bca6 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -831,7 +831,8 @@ cdef class DictionaryArray(Array):
 
     @staticmethod
     def from_arrays(indices, dictionary, mask=None, ordered=False,
-                    from_pandas=False, MemoryPool memory_pool=None):
+                    from_pandas=False, safe=True,
+                    MemoryPool memory_pool=None):
         """
         Construct Arrow DictionaryArray from array of indices (must be
         non-negative integers) and corresponding array of dictionary values
@@ -847,6 +848,8 @@ cdef class DictionaryArray(Array):
             a pandas.Categorical (null encoded as -1)
         ordered : boolean, default False
             Set to True if the category values are ordered
+        safe : boolean, default True
+            If True, check that the dictionary indices are in range
         memory_pool : MemoryPool, default None
             For memory allocations, if required, otherwise uses default pool
 
@@ -885,7 +888,14 @@ cdef class DictionaryArray(Array):
 
         c_type.reset(new CDictionaryType(_indices.type.sp_type,
                                          _dictionary.sp_array, c_ordered))
-        c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
+
+        if safe:
+            with nogil:
+                check_status(
+                    CDictionaryArray.FromArrays(c_type, _indices.sp_array,
+                                                &c_result))
+        else:
+            c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
 
         result = DictionaryArray()
         result.init(c_result)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 2622300134..503ee88887 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -127,6 +127,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CDictionaryArray(const shared_ptr[CDataType]& type,
                          const shared_ptr[CArray]& indices)
 
+        @staticmethod
+        CStatus FromArrays(const shared_ptr[CDataType]& type,
+                           const shared_ptr[CArray]& indices,
+                           shared_ptr[CArray]* out)
+
         shared_ptr[CArray] indices()
         shared_ptr[CArray] dictionary()
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 45b3f9ec5e..69d6a93fe0 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -199,6 +199,27 @@ def test_dictionary_from_boxed_arrays():
         assert d1[i].as_py() == dictionary[indices[i]]
 
 
+def test_dictionary_from_arrays_boundscheck():
+    indices1 = pa.array([0, 1, 2, 0, 1, 2])
+    indices2 = pa.array([0, -1, 2])
+    indices3 = pa.array([0, 1, 2, 3])
+
+    dictionary = pa.array(['foo', 'bar', 'baz'])
+
+    # Works fine
+    pa.DictionaryArray.from_arrays(indices1, dictionary)
+
+    with pytest.raises(pa.ArrowException):
+        pa.DictionaryArray.from_arrays(indices2, dictionary)
+
+    with pytest.raises(pa.ArrowException):
+        pa.DictionaryArray.from_arrays(indices3, dictionary)
+
+    # If we are confident that the indices are "safe" we can pass safe=False to
+    # disable the boundschecking
+    pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False)
+
+
 def test_dictionary_with_pandas():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 333199ab28..7929135959 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -281,7 +281,7 @@ def test_dictionary_indices_boundscheck(self):
         indices = [[0, 1], [0, -1]]
 
         for inds in indices:
-            arr = pa.DictionaryArray.from_arrays(inds, ['a'])
+            arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False)
             batch = pa.RecordBatch.from_arrays([arr], ['foo'])
             table = pa.Table.from_batches([batch, batch, batch])
 
diff --git a/python/setup.py b/python/setup.py
index 6f0b0fa4d4..453626020f 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -111,8 +111,7 @@ def initialize_options(self):
         _build_ext.initialize_options(self)
         self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '')
         self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower()
-        self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE',
-                                              'boost')
+        self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE')
 
         self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '')
 
@@ -208,8 +207,10 @@ def _run_cmake(self):
 
             cmake_options.append('-DCMAKE_BUILD_TYPE={0}'
                                  .format(self.build_type.lower()))
-            cmake_options.append('-DBoost_NAMESPACE={}'.format(
-                self.boost_namespace))
+
+            if self.boost_namespace is not None:
+                cmake_options.append('-DBoost_NAMESPACE={}'
+                                     .format(self.boost_namespace))
 
             extra_cmake_args = shlex.split(self.extra_cmake_args)
             if sys.platform != 'win32':


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Support DictionaryArray::FromArrays in Python bindings
> ---------------------------------------------------------------
>
>                 Key: ARROW-2099
>                 URL: https://issues.apache.org/jira/browse/ARROW-2099
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Wes McKinney
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Follow up work from ARROW-1757.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to