[
https://issues.apache.org/jira/browse/ARROW-1658?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16227103#comment-16227103
]
ASF GitHub Bot commented on ARROW-1658:
---------------------------------------
wesm closed pull request #1270: ARROW-1658: [Python] Add boundschecking of
dictionary indices when creating CategoricalBlock
URL: https://github.com/apache/arrow/pull/1270
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign (forked) pull request
once it is merged, the diff is reproduced below for the sake of provenance:
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc
b/cpp/src/arrow/ipc/metadata-internal.cc
index f04e9b05a..f0f0f6758 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -72,7 +72,7 @@ MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion
version) {
case flatbuf::MetadataVersion_V4:
// Arrow >= 0.8
return MetadataVersion::V4;
- // Add cases as other versions become available
+ // Add cases as other versions become available
default:
return MetadataVersion::V4;
}
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 7f1591213..c92faede1 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -966,9 +966,10 @@ class CategoricalBlock : public PandasBlock {
"CategoricalBlock allocation happens when calling Write");
}
- template <int ARROW_INDEX_TYPE>
+ template <typename ArrowType>
Status WriteIndices(const std::shared_ptr<Column>& col) {
- using TRAITS = internal::arrow_traits<ARROW_INDEX_TYPE>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using TRAITS = internal::arrow_traits<ArrowType::type_id>;
using T = typename TRAITS::T;
constexpr int npy_type = TRAITS::npy_type;
@@ -977,10 +978,22 @@ class CategoricalBlock : public PandasBlock {
// Sniff the first chunk
const std::shared_ptr<Array> arr_first = data.chunk(0);
const auto& dict_arr_first = static_cast<const
DictionaryArray&>(*arr_first);
- const auto& indices_first =
- static_cast<const PrimitiveArray&>(*dict_arr_first.indices());
+ const auto& indices_first = static_cast<const
ArrayType&>(*dict_arr_first.indices());
+
+ auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) {
+ const T* values = arr.raw_values();
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) {
+ std::stringstream ss;
+ ss << "Out of bounds dictionary index: " <<
static_cast<int64_t>(values[i]);
+ return Status::Invalid(ss.str());
+ }
+ }
+ return Status::OK();
+ };
if (data.num_chunks() == 1 && indices_first.null_count() == 0) {
+ RETURN_NOT_OK(CheckIndices(indices_first,
dict_arr_first.dictionary()->length()));
RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
} else {
if (options_.zero_copy_only) {
@@ -998,9 +1011,10 @@ class CategoricalBlock : public PandasBlock {
const std::shared_ptr<Array> arr = data.chunk(c);
const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
- const auto& indices = static_cast<const
PrimitiveArray&>(*dict_arr.indices());
+ const auto& indices = static_cast<const
ArrayType&>(*dict_arr.indices());
auto in_values = reinterpret_cast<const T*>(indices.raw_values());
+ RETURN_NOT_OK(CheckIndices(indices, dict_arr.dictionary()->length()));
// Null is -1 in CategoricalBlock
for (int i = 0; i < arr->length(); ++i) {
*out_values++ = indices.IsNull(i) ? -1 : in_values[i];
@@ -1026,16 +1040,16 @@ class CategoricalBlock : public PandasBlock {
switch (dict_type.index_type()->id()) {
case Type::INT8:
- RETURN_NOT_OK(WriteIndices<Type::INT8>(converted_col));
+ RETURN_NOT_OK(WriteIndices<Int8Type>(converted_col));
break;
case Type::INT16:
- RETURN_NOT_OK(WriteIndices<Type::INT16>(converted_col));
+ RETURN_NOT_OK(WriteIndices<Int16Type>(converted_col));
break;
case Type::INT32:
- RETURN_NOT_OK(WriteIndices<Type::INT32>(converted_col));
+ RETURN_NOT_OK(WriteIndices<Int32Type>(converted_col));
break;
case Type::INT64:
- RETURN_NOT_OK(WriteIndices<Type::INT64>(converted_col));
+ RETURN_NOT_OK(WriteIndices<Int64Type>(converted_col));
break;
default: {
std::stringstream ss;
@@ -1091,13 +1105,11 @@ class CategoricalBlock : public PandasBlock {
PyObject* block_arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1,
block_dims,
nullptr, data,
NPY_ARRAY_CARRAY, nullptr);
+ RETURN_IF_PYERROR();
npy_intp placement_dims[1] = {num_columns_};
PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
- if (placement_arr == NULL) {
- // TODO(wesm): propagating Python exception
- return Status::OK();
- }
+ RETURN_IF_PYERROR();
block_arr_.reset(block_arr);
placement_arr_.reset(placement_arr);
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 7da5c3caf..7752d062a 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -162,6 +162,7 @@ def array(object obj, type=None, mask=None,
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
+ from_pandas=from_pandas,
memory_pool=memory_pool)
else:
values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
@@ -671,7 +672,7 @@ cdef class DictionaryArray(Array):
@staticmethod
def from_arrays(indices, dictionary, mask=None, ordered=False,
- MemoryPool memory_pool=None):
+ from_pandas=False, MemoryPool memory_pool=None):
"""
Construct Arrow DictionaryArray from array of indices (must be
non-negative integers) and corresponding array of dictionary values
@@ -682,15 +683,20 @@ cdef class DictionaryArray(Array):
dictionary : ndarray or pandas.Series
mask : ndarray or pandas.Series, boolean type
True values indicate that indices are actually null
+ from_pandas : boolean, default False
+ If True, the indices should be treated as though they originated in
+ a pandas.Categorical (null encoded as -1)
ordered : boolean, default False
Set to True if the category values are ordered
+ memory_pool : MemoryPool, default None
+ For memory allocations, if required, otherwise uses default pool
Returns
-------
dict_array : DictionaryArray
"""
cdef:
- Array arrow_indices, arrow_dictionary
+ Array _indices, _dictionary
DictionaryArray result
shared_ptr[CDataType] c_type
shared_ptr[CArray] c_result
@@ -699,29 +705,28 @@ cdef class DictionaryArray(Array):
if mask is not None:
raise NotImplementedError(
"mask not implemented with Arrow array inputs yet")
- arrow_indices = indices
+ _indices = indices
else:
- if mask is None:
- mask = indices == -1
- else:
- mask = mask | (indices == -1)
- arrow_indices = Array.from_pandas(indices, mask=mask,
- memory_pool=memory_pool)
+ if from_pandas:
+ if mask is None:
+ mask = indices == -1
+ else:
+ mask = mask | (indices == -1)
+ _indices = array(indices, mask=mask, memory_pool=memory_pool)
if isinstance(dictionary, Array):
- arrow_dictionary = dictionary
+ _dictionary = dictionary
else:
- arrow_dictionary = Array.from_pandas(dictionary,
- memory_pool=memory_pool)
+ _dictionary = array(dictionary, memory_pool=memory_pool)
- if not isinstance(arrow_indices, IntegerArray):
+ if not isinstance(_indices, IntegerArray):
raise ValueError('Indices must be integer type')
cdef c_bool c_ordered = ordered
- c_type.reset(new CDictionaryType(arrow_indices.type.sp_type,
- arrow_dictionary.sp_array, c_ordered))
- c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array))
+ c_type.reset(new CDictionaryType(_indices.type.sp_type,
+ _dictionary.sp_array, c_ordered))
+ c_result.reset(new CDictionaryArray(c_type, _indices.sp_array))
result = DictionaryArray()
result.init(c_result)
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 07ecf3010..dabccac37 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -217,6 +217,21 @@ def test_zero_copy_success(self):
result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
npt.assert_array_equal(result, [0, 1, 2])
+ def test_dictionary_indices_boundscheck(self):
+ # ARROW-1658. No validation of indices leads to segfaults in pandas
+ indices = [[0, 1], [0, -1]]
+
+ for inds in indices:
+ arr = pa.DictionaryArray.from_arrays(inds, ['a'])
+ batch = pa.RecordBatch.from_arrays([arr], ['foo'])
+ table = pa.Table.from_batches([batch, batch, batch])
+
+ with pytest.raises(pa.ArrowException):
+ arr.to_pandas()
+
+ with pytest.raises(pa.ArrowException):
+ table.to_pandas()
+
def test_zero_copy_dictionaries(self):
arr = pa.DictionaryArray.from_arrays(
np.array([0, 0]),
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Out of bounds dictionary indices causes segfault after converting to
> pandas
> ------------------------------------------------------------------------------------
>
> Key: ARROW-1658
> URL: https://issues.apache.org/jira/browse/ARROW-1658
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 0.7.1
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> Minimal reproduction:
> {code}
> import numpy as np
> import pandas as pd
> import pyarrow as pa
>
> num = 100
> arr = pa.DictionaryArray.from_arrays(
> np.arange(0, num),
> np.array(['a'], np.object),
> np.zeros(num, np.bool),
> True)
> print(arr.to_pandas())
> {code}
> At no time in the Arrow codebase do we validate that the dictionary indices
> are in bounds. It seems that pandas is overly trusting of the validity of the
> indices. So we should add a method someplace to validate that the dictionary
> non-null indices are not out of bounds (perhaps in
> {{CategoricalBlock::WriteIndices}}).
> As an aside: there may be other times when doing analytics on categorical
> data that external data will have out of bounds index values. We should plan
> for these and decide whether to raise an exception or treat them as null
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)