This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new dc45a1a ARROW-2099: [Python] Add safe option to
DictionaryArray.from_arrays to do boundschecking of indices by default
dc45a1a is described below
commit dc45a1a7bcfc916d5b5f98cf40c03fad68d06b9b
Author: Wes McKinney <[email protected]>
AuthorDate: Sun Mar 11 23:41:38 2018 -0400
ARROW-2099: [Python] Add safe option to DictionaryArray.from_arrays to do
boundschecking of indices by default
Author: Wes McKinney <[email protected]>
Closes #1734 from wesm/ARROW-2099 and squashes the following commits:
eabc5d19 <Wes McKinney> Add safe option to DictionaryArray.from_arrays to
do boundschecking of indices by default
---
python/pyarrow/array.pxi | 14 ++++++++++++--
python/pyarrow/includes/libarrow.pxd | 5 +++++
python/pyarrow/tests/test_array.py | 21 +++++++++++++++++++++
python/pyarrow/tests/test_convert_pandas.py | 2 +-
python/setup.py | 9 +++++----
5 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 321809f..2ea131b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -831,7 +831,8 @@ cdef class DictionaryArray(Array):
@staticmethod
def from_arrays(indices, dictionary, mask=None, ordered=False,
- from_pandas=False, MemoryPool memory_pool=None):
+ from_pandas=False, safe=True,
+ MemoryPool memory_pool=None):
"""
Construct Arrow DictionaryArray from array of indices (must be
non-negative integers) and corresponding array of dictionary values
@@ -847,6 +848,8 @@ cdef class DictionaryArray(Array):
a pandas.Categorical (null encoded as -1)
ordered : boolean, default False
Set to True if the category values are ordered
+ safe : boolean, default True
+ If True, check that the dictionary indices are in range
memory_pool : MemoryPool, default None
For memory allocations, if required, otherwise uses default pool
@@ -885,7 +888,14 @@ cdef class DictionaryArray(Array):
c_type.reset(new CDictionaryType(_indices.type.sp_type,
_dictionary.sp_array, c_ordered))
- c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
+
+ if safe:
+ with nogil:
+ check_status(
+ CDictionaryArray.FromArrays(c_type, _indices.sp_array,
+ &c_result))
+ else:
+ c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
result = DictionaryArray()
result.init(c_result)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 2622300..503ee88 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -127,6 +127,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CDictionaryArray(const shared_ptr[CDataType]& type,
const shared_ptr[CArray]& indices)
+ @staticmethod
+ CStatus FromArrays(const shared_ptr[CDataType]& type,
+ const shared_ptr[CArray]& indices,
+ shared_ptr[CArray]* out)
+
shared_ptr[CArray] indices()
shared_ptr[CArray] dictionary()
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 45b3f9e..69d6a93 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -199,6 +199,27 @@ def test_dictionary_from_boxed_arrays():
assert d1[i].as_py() == dictionary[indices[i]]
+def test_dictionary_from_arrays_boundscheck():
+ indices1 = pa.array([0, 1, 2, 0, 1, 2])
+ indices2 = pa.array([0, -1, 2])
+ indices3 = pa.array([0, 1, 2, 3])
+
+ dictionary = pa.array(['foo', 'bar', 'baz'])
+
+ # Works fine
+ pa.DictionaryArray.from_arrays(indices1, dictionary)
+
+ with pytest.raises(pa.ArrowException):
+ pa.DictionaryArray.from_arrays(indices2, dictionary)
+
+ with pytest.raises(pa.ArrowException):
+ pa.DictionaryArray.from_arrays(indices3, dictionary)
+
+ # If we are confident that the indices are "safe" we can pass safe=False to
+ # disable the boundschecking
+ pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False)
+
+
def test_dictionary_with_pandas():
indices = np.repeat([0, 1, 2], 2)
dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 333199a..7929135 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -281,7 +281,7 @@ class TestConvertMetadata(object):
indices = [[0, 1], [0, -1]]
for inds in indices:
- arr = pa.DictionaryArray.from_arrays(inds, ['a'])
+ arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False)
batch = pa.RecordBatch.from_arrays([arr], ['foo'])
table = pa.Table.from_batches([batch, batch, batch])
diff --git a/python/setup.py b/python/setup.py
index 6f0b0fa..4536260 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -111,8 +111,7 @@ class build_ext(_build_ext):
_build_ext.initialize_options(self)
self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '')
self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower()
- self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE',
- 'boost')
+ self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE')
self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '')
@@ -208,8 +207,10 @@ class build_ext(_build_ext):
cmake_options.append('-DCMAKE_BUILD_TYPE={0}'
.format(self.build_type.lower()))
- cmake_options.append('-DBoost_NAMESPACE={}'.format(
- self.boost_namespace))
+
+ if self.boost_namespace is not None:
+ cmake_options.append('-DBoost_NAMESPACE={}'
+ .format(self.boost_namespace))
extra_cmake_args = shlex.split(self.extra_cmake_args)
if sys.platform != 'win32':
--
To stop receiving notification emails like this one, please contact
[email protected].