This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new dc45a1a  ARROW-2099: [Python] Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default
dc45a1a is described below

commit dc45a1a7bcfc916d5b5f98cf40c03fad68d06b9b
Author: Wes McKinney <wes.mckin...@twosigma.com>
AuthorDate: Sun Mar 11 23:41:38 2018 -0400

    ARROW-2099: [Python] Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default
    
    Author: Wes McKinney <wes.mckin...@twosigma.com>
    
    Closes #1734 from wesm/ARROW-2099 and squashes the following commits:
    
    eabc5d19 <Wes McKinney> Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default
---
 python/pyarrow/array.pxi                    | 14 ++++++++++++--
 python/pyarrow/includes/libarrow.pxd        |  5 +++++
 python/pyarrow/tests/test_array.py          | 21 +++++++++++++++++++++
 python/pyarrow/tests/test_convert_pandas.py |  2 +-
 python/setup.py                             |  9 +++++----
 5 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 321809f..2ea131b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -831,7 +831,8 @@ cdef class DictionaryArray(Array):
 
     @staticmethod
     def from_arrays(indices, dictionary, mask=None, ordered=False,
-                    from_pandas=False, MemoryPool memory_pool=None):
+                    from_pandas=False, safe=True,
+                    MemoryPool memory_pool=None):
         """
         Construct Arrow DictionaryArray from array of indices (must be
         non-negative integers) and corresponding array of dictionary values
@@ -847,6 +848,8 @@ cdef class DictionaryArray(Array):
             a pandas.Categorical (null encoded as -1)
         ordered : boolean, default False
             Set to True if the category values are ordered
+        safe : boolean, default True
+            If True, check that the dictionary indices are in range
         memory_pool : MemoryPool, default None
             For memory allocations, if required, otherwise uses default pool
 
@@ -885,7 +888,14 @@ cdef class DictionaryArray(Array):
 
         c_type.reset(new CDictionaryType(_indices.type.sp_type,
                                          _dictionary.sp_array, c_ordered))
-        c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
+
+        if safe:
+            with nogil:
+                check_status(
+                    CDictionaryArray.FromArrays(c_type, _indices.sp_array,
+                                                &c_result))
+        else:
+            c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
 
         result = DictionaryArray()
         result.init(c_result)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 2622300..503ee88 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -127,6 +127,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CDictionaryArray(const shared_ptr[CDataType]& type,
                          const shared_ptr[CArray]& indices)
 
+        @staticmethod
+        CStatus FromArrays(const shared_ptr[CDataType]& type,
+                           const shared_ptr[CArray]& indices,
+                           shared_ptr[CArray]* out)
+
         shared_ptr[CArray] indices()
         shared_ptr[CArray] dictionary()
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 45b3f9e..69d6a93 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -199,6 +199,27 @@ def test_dictionary_from_boxed_arrays():
         assert d1[i].as_py() == dictionary[indices[i]]
 
 
+def test_dictionary_from_arrays_boundscheck():
+    indices1 = pa.array([0, 1, 2, 0, 1, 2])
+    indices2 = pa.array([0, -1, 2])
+    indices3 = pa.array([0, 1, 2, 3])
+
+    dictionary = pa.array(['foo', 'bar', 'baz'])
+
+    # Works fine
+    pa.DictionaryArray.from_arrays(indices1, dictionary)
+
+    with pytest.raises(pa.ArrowException):
+        pa.DictionaryArray.from_arrays(indices2, dictionary)
+
+    with pytest.raises(pa.ArrowException):
+        pa.DictionaryArray.from_arrays(indices3, dictionary)
+
+    # If we are confident that the indices are "safe" we can pass safe=False to
+    # disable the boundschecking
+    pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False)
+
+
 def test_dictionary_with_pandas():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 333199a..7929135 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -281,7 +281,7 @@ class TestConvertMetadata(object):
         indices = [[0, 1], [0, -1]]
 
         for inds in indices:
-            arr = pa.DictionaryArray.from_arrays(inds, ['a'])
+            arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False)
             batch = pa.RecordBatch.from_arrays([arr], ['foo'])
             table = pa.Table.from_batches([batch, batch, batch])
 
diff --git a/python/setup.py b/python/setup.py
index 6f0b0fa..4536260 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -111,8 +111,7 @@ class build_ext(_build_ext):
         _build_ext.initialize_options(self)
         self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '')
         self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower()
-        self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE',
-                                              'boost')
+        self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE')
 
         self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '')
 
@@ -208,8 +207,10 @@ class build_ext(_build_ext):
 
             cmake_options.append('-DCMAKE_BUILD_TYPE={0}'
                                  .format(self.build_type.lower()))
-            cmake_options.append('-DBoost_NAMESPACE={}'.format(
-                self.boost_namespace))
+
+            if self.boost_namespace is not None:
+                cmake_options.append('-DBoost_NAMESPACE={}'
+                                     .format(self.boost_namespace))
 
             extra_cmake_args = shlex.split(self.extra_cmake_args)
             if sys.platform != 'win32':

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

Reply via email to