This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new dc45a1a ARROW-2099: [Python] Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default dc45a1a is described below commit dc45a1a7bcfc916d5b5f98cf40c03fad68d06b9b Author: Wes McKinney <wes.mckin...@twosigma.com> AuthorDate: Sun Mar 11 23:41:38 2018 -0400 ARROW-2099: [Python] Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1734 from wesm/ARROW-2099 and squashes the following commits: eabc5d19 <Wes McKinney> Add safe option to DictionaryArray.from_arrays to do boundschecking of indices by default --- python/pyarrow/array.pxi | 14 ++++++++++++-- python/pyarrow/includes/libarrow.pxd | 5 +++++ python/pyarrow/tests/test_array.py | 21 +++++++++++++++++++++ python/pyarrow/tests/test_convert_pandas.py | 2 +- python/setup.py | 9 +++++---- 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 321809f..2ea131b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -831,7 +831,8 @@ cdef class DictionaryArray(Array): @staticmethod def from_arrays(indices, dictionary, mask=None, ordered=False, - from_pandas=False, MemoryPool memory_pool=None): + from_pandas=False, safe=True, + MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be non-negative integers) and corresponding array of dictionary values @@ -847,6 +848,8 @@ cdef class DictionaryArray(Array): a pandas.Categorical (null encoded as -1) ordered : boolean, default False Set to True if the category values are ordered + safe : boolean, default True + If True, check that the dictionary indices are in range memory_pool : MemoryPool, default None For memory allocations, if required, otherwise uses default pool @@ -885,7 +888,14 @@ cdef class DictionaryArray(Array): c_type.reset(new CDictionaryType(_indices.type.sp_type, _dictionary.sp_array, c_ordered)) - c_result.reset(new CDictionaryArray(c_type, _indices.sp_array)) + + if safe: + with nogil: + check_status( + CDictionaryArray.FromArrays(c_type, _indices.sp_array, + &c_result)) + else: + c_result.reset(new CDictionaryArray(c_type, _indices.sp_array)) result = DictionaryArray() result.init(c_result) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2622300..503ee88 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -127,6 +127,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CDictionaryArray(const shared_ptr[CDataType]& type, const shared_ptr[CArray]& indices) + @staticmethod + CStatus FromArrays(const shared_ptr[CDataType]& type, + const shared_ptr[CArray]& indices, + shared_ptr[CArray]* out) + shared_ptr[CArray] indices() shared_ptr[CArray] dictionary() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 45b3f9e..69d6a93 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -199,6 +199,27 @@ def test_dictionary_from_boxed_arrays(): assert d1[i].as_py() == dictionary[indices[i]] +def test_dictionary_from_arrays_boundscheck(): + indices1 = pa.array([0, 1, 2, 0, 1, 2]) + indices2 = pa.array([0, -1, 2]) + indices3 = pa.array([0, 1, 2, 3]) + + dictionary = pa.array(['foo', 'bar', 'baz']) + + # Works fine + pa.DictionaryArray.from_arrays(indices1, dictionary) + + with pytest.raises(pa.ArrowException): + pa.DictionaryArray.from_arrays(indices2, dictionary) + + with pytest.raises(pa.ArrowException): + pa.DictionaryArray.from_arrays(indices3, dictionary) + + # If we are confident that the indices are "safe" we can pass safe=False to + # disable the boundschecking + pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False) + + def test_dictionary_with_pandas(): indices = np.repeat([0, 1, 2], 2) dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 333199a..7929135 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -281,7 +281,7 @@ class TestConvertMetadata(object): indices = [[0, 1], [0, -1]] for inds in indices: - arr = pa.DictionaryArray.from_arrays(inds, ['a']) + arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False) batch = pa.RecordBatch.from_arrays([arr], ['foo']) table = pa.Table.from_batches([batch, batch, batch]) diff --git a/python/setup.py b/python/setup.py index 6f0b0fa..4536260 100644 --- a/python/setup.py +++ b/python/setup.py @@ -111,8 +111,7 @@ class build_ext(_build_ext): _build_ext.initialize_options(self) self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower() - self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE', - 'boost') + self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE') self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '') @@ -208,8 +207,10 @@ class build_ext(_build_ext): cmake_options.append('-DCMAKE_BUILD_TYPE={0}' .format(self.build_type.lower())) - cmake_options.append('-DBoost_NAMESPACE={}'.format( - self.boost_namespace)) + + if self.boost_namespace is not None: + cmake_options.append('-DBoost_NAMESPACE={}' + .format(self.boost_namespace)) extra_cmake_args = shlex.split(self.extra_cmake_args) if sys.platform != 'win32': -- To stop receiving notification emails like this one, please contact w...@apache.org.