[ 
https://issues.apache.org/jira/browse/ARROW-1757?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16354087#comment-16354087
 ] 

ASF GitHub Bot commented on ARROW-1757:
---------------------------------------

wesm closed pull request #1535: ARROW-1757: [C++] Add 
DictionaryArray::FromArrays alternate ctor that can check or sanitized 
"untrusted" indices
URL: https://github.com/apache/arrow/pull/1535
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index c53da8591..fa64d467b 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -2384,6 +2384,37 @@ TEST(TestDictionary, Validate) {
   // ASSERT_OK(ValidateArray(*arr3));
 }
 
+TEST(TestDictionary, FromArray) {
+  std::shared_ptr<Array> dict;
+  vector<string> dict_values = {"foo", "bar", "baz"};
+  ArrayFromVector<StringType, string>(dict_values, &dict);
+  std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
+
+  std::shared_ptr<Array> indices1;
+  vector<int16_t> indices_values1 = {1, 2, 0, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(indices_values1, &indices1);
+
+  std::shared_ptr<Array> indices2;
+  vector<int16_t> indices_values2 = {1, 2, 0, 3, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(indices_values2, &indices2);
+
+  std::shared_ptr<Array> indices3;
+  vector<bool> is_valid3 = {true, true, false, true, true, true};
+  vector<int16_t> indices_values3 = {1, 2, -1, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(is_valid3, indices_values3, &indices3);
+
+  std::shared_ptr<Array> indices4;
+  vector<bool> is_valid4 = {true, true, false, true, true, true};
+  vector<int16_t> indices_values4 = {1, 2, 1, 3, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(is_valid4, indices_values4, &indices4);
+
+  std::shared_ptr<Array> arr1, arr2, arr3, arr4;
+  ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1));
+  ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices2, 
&arr2));
+  ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices3, &arr3));
+  ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, 
&arr4));
+}
+
 // ----------------------------------------------------------------------
 // Struct tests
 
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 3d72761ed..a8043d69b 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -476,6 +476,39 @@ const Array* UnionArray::UnsafeChild(int i) const {
 // ----------------------------------------------------------------------
 // DictionaryArray
 
+/// \brief Perform validation check to determine if all dictionary indices
+/// are within valid range (0 <= index < upper_bound)
+///
+/// \param[in] indices array of dictionary indices
+/// \param[in] upper_bound upper bound of valid range for indices
+/// \return Status
+template <typename ArrowType>
+Status ValidateDictionaryIndices(const std::shared_ptr<Array>& indices,
+                                 const int64_t upper_bound) {
+  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+  const auto& array = static_cast<const ArrayType&>(*indices);
+  const typename ArrowType::c_type* data = array.raw_values();
+  const int64_t size = array.length();
+
+  if (array.null_count() == 0) {
+    for (int64_t idx = 0; idx < size; ++idx) {
+      if (data[idx] < 0 || data[idx] >= upper_bound) {
+        return Status::Invalid("Dictionary has out-of-bound index [0, 
dict.length)");
+      }
+    }
+  } else {
+    for (int64_t idx = 0; idx < size; ++idx) {
+      if (!array.IsNull(idx)) {
+        if (data[idx] < 0 || data[idx] >= upper_bound) {
+          return Status::Invalid("Dictionary has out-of-bound index [0, 
dict.length)");
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
 DictionaryArray::DictionaryArray(const std::shared_ptr<ArrayData>& data)
     : dict_type_(static_cast<const DictionaryType*>(data->type.get())) {
   DCHECK_EQ(data->type->id(), Type::DICTIONARY);
@@ -492,11 +525,51 @@ DictionaryArray::DictionaryArray(const 
std::shared_ptr<DataType>& type,
   SetData(data);
 }
 
+Status DictionaryArray::FromArrays(const std::shared_ptr<DataType>& type,
+                                   const std::shared_ptr<Array>& indices,
+                                   std::shared_ptr<Array>* out) {
+  if (indices->length() == 0) {
+    return Status::Invalid("Dictionary indices must have non-zero length");
+  }
+
+  DCHECK_EQ(type->id(), Type::DICTIONARY);
+  const auto& dict = static_cast<const DictionaryType&>(*type);
+  DCHECK_EQ(indices->type_id(), dict.index_type()->id());
+
+  int64_t upper_bound = dict.dictionary()->length();
+  Status is_valid;
+
+  switch (indices->type_id()) {
+    case Type::INT8:
+      is_valid = ValidateDictionaryIndices<Int8Type>(indices, upper_bound);
+      break;
+    case Type::INT16:
+      is_valid = ValidateDictionaryIndices<Int16Type>(indices, upper_bound);
+      break;
+    case Type::INT32:
+      is_valid = ValidateDictionaryIndices<Int32Type>(indices, upper_bound);
+      break;
+    case Type::INT64:
+      is_valid = ValidateDictionaryIndices<Int64Type>(indices, upper_bound);
+      break;
+    default:
+      std::stringstream ss;
+      ss << "Categorical index type not supported: " << 
indices->type()->ToString();
+      return Status::NotImplemented(ss.str());
+  }
+
+  if (!is_valid.ok()) {
+    return is_valid;
+  }
+
+  *out = std::make_shared<DictionaryArray>(type, indices);
+  return is_valid;
+}
+
 void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
   auto indices_data = data_->Copy();
   indices_data->type = dict_type_->index_type();
-  std::shared_ptr<Array> result;
   indices_ = MakeArray(indices_data);
 }
 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index f0a786131..5b9ce9a01 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -726,6 +726,19 @@ class ARROW_EXPORT DictionaryArray : public Array {
   DictionaryArray(const std::shared_ptr<DataType>& type,
                   const std::shared_ptr<Array>& indices);
 
+  /// \brief Construct DictionaryArray from dictonary data type and indices 
array
+  ///
+  /// This function does the validation of the indices and input type. It 
checks if
+  /// all indices are non-negative and smaller than the size of the dictionary
+  ///
+  /// \param[in] type a data type containing a dictionary
+  /// \param[in] indices an array of non-negative signed
+  /// integers smaller than the size of the dictionary
+  /// \param[out] out the resulting DictionaryArray instance
+  static Status FromArrays(const std::shared_ptr<DataType>& type,
+                           const std::shared_ptr<Array>& indices,
+                           std::shared_ptr<Array>* out);
+
   std::shared_ptr<Array> indices() const;
   std::shared_ptr<Array> dictionary() const;
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Add DictionaryArray::FromArrays alternate ctor that can check or 
> sanitized "untrusted" indices
> ----------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-1757
>                 URL: https://issues.apache.org/jira/browse/ARROW-1757
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++
>            Reporter: Wes McKinney
>            Assignee: Panchen Xue
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Related to ARROW-1658. This is related to the offset sanitization in 
> {{ListArray::FromArrays}}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to