[ 
https://issues.apache.org/jira/browse/ARROW-972?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16245970#comment-16245970
 ] 

ASF GitHub Bot commented on ARROW-972:
--------------------------------------

wesm closed pull request #1216: ARROW-972: UnionArray in pyarrow
URL: https://github.com/apache/arrow/pull/1216
 
 
   

This is a PR merged from a forked repository (a foreign pull request).
As GitHub hides the original diff of such a pull request once it is
merged, the diff is reproduced below for the sake of provenance:

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index b523876bf..9c91d619c 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -393,6 +393,62 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
   SetData(internal_data);
 }
 
+Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
+                             const std::vector<std::shared_ptr<Array>>& children,
+                             std::shared_ptr<Array>* out) {
+  if (value_offsets.length() == 0) {
+    return Status::Invalid("UnionArray offsets must have non-zero length");
+  }
+
+  if (value_offsets.type_id() != Type::INT32) {
+    return Status::Invalid("UnionArray offsets must be signed int32");
+  }
+
+  if (type_ids.type_id() != Type::INT8) {
+    return Status::Invalid("UnionArray type_ids must be signed int8");
+  }
+
+  if (value_offsets.null_count() != 0) {
+    return Status::Invalid("MakeDense does not allow NAs in value_offsets");
+  }
+
+  BufferVector buffers = {type_ids.null_bitmap(),
+                          static_cast<const UInt8Array&>(type_ids).values(),
+                          static_cast<const Int32Array&>(value_offsets).values()};
+  auto union_type = union_(children, UnionMode::DENSE);
+  auto internal_data =
+      std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
+                                  type_ids.null_count(), type_ids.offset());
+  for (const auto& child : children) {
+    internal_data->child_data.push_back(child->data());
+  }
+  *out = std::make_shared<UnionArray>(internal_data);
+  return Status::OK();
+}
+
+Status UnionArray::MakeSparse(const Array& type_ids,
+                              const std::vector<std::shared_ptr<Array>>& children,
+                              std::shared_ptr<Array>* out) {
+  if (type_ids.type_id() != Type::INT8) {
+    return Status::Invalid("UnionArray type_ids must be signed int8");
+  }
+  BufferVector buffers = {type_ids.null_bitmap(),
+                          static_cast<const UInt8Array&>(type_ids).values(), nullptr};
+  auto union_type = union_(children, UnionMode::SPARSE);
+  auto internal_data =
+      std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
+                                  type_ids.null_count(), type_ids.offset());
+  for (const auto& child : children) {
+    internal_data->child_data.push_back(child->data());
+    if (child->length() != type_ids.length()) {
+      return Status::Invalid(
+          "Sparse UnionArray must have len(child) == len(type_ids) for all children");
+    }
+  }
+  *out = std::make_shared<UnionArray>(internal_data);
+  return Status::OK();
+}
+
 std::shared_ptr<Array> UnionArray::child(int i) const {
   if (!boxed_fields_[i]) {
     boxed_fields_[i] = MakeArray(data_->child_data[i]);
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index afbd780dd..f7762ce10 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -612,16 +612,47 @@ class ARROW_EXPORT UnionArray : public Array {
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
              int64_t offset = 0);
 
+  /// \brief Construct Dense UnionArray from type_ids, value_offsets and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types. The value_offsets are assumed to be well-formed.
+  ///
+  /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+  /// 0 corresponding to each type.
+  /// \param[in] value_offsets An array of signed int32 values indicating the
+  /// relative offset into the respective child array for the type in a given slot.
+  /// The respective offsets for each child value array must be in order / increasing.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[out] out Will have length equal to value_offsets.length()
+  static Status MakeDense(const Array& type_ids, const Array& value_offsets,
+                          const std::vector<std::shared_ptr<Array>>& children,
+                          std::shared_ptr<Array>* out);
+
+  /// \brief Construct Sparse UnionArray from type_ids and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types.
+  ///
+  /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+  /// 0 corresponding to each type.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[out] out Will have length equal to type_ids.length()
+  static Status MakeSparse(const Array& type_ids,
+                           const std::vector<std::shared_ptr<Array>>& children,
+                           std::shared_ptr<Array>* out);
+
   /// Note that this buffer does not account for any slice offset
   std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; }
 
   /// Note that this buffer does not account for any slice offset
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
 
+  int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
+
   const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
   const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
 
-  UnionMode mode() const { return static_cast<const UnionType&>(*type()).mode(); }
+  UnionMode::type mode() const { return static_cast<const UnionType&>(*type()).mode(); }
 
   std::shared_ptr<Array> child(int pos) const;
 
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 2ec86c369..a2d4de7b7 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -152,7 +152,7 @@ class RangeEqualsVisitor {
   bool CompareUnions(const UnionArray& left) const {
     const auto& right = static_cast<const UnionArray&>(right_);
 
-    const UnionMode union_mode = left.mode();
+    const UnionMode::type union_mode = left.mode();
     if (union_mode != right.mode()) {
       return false;
     }
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index c1c0661d6..1b9baee7d 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type,
   RETURN_NOT_STRING("mode", it_mode, json_type);
 
   std::string mode_str = it_mode->value.GetString();
-  UnionMode mode;
+  UnionMode::type mode;
 
   if (mode_str == "SPARSE") {
     mode = UnionMode::SPARSE;
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index f0f0f6758..63ef8a549 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type,
 static Status UnionFromFlatbuffer(const flatbuf::Union* union_data,
                                   const std::vector<std::shared_ptr<Field>>& children,
                                   std::shared_ptr<DataType>* out) {
-  UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
-                                                                   : UnionMode::DENSE;
+  UnionMode::type mode =
+      (union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
+                                                       : UnionMode::DENSE);
 
   std::vector<uint8_t> type_codes;
 
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index a9bf59191..0d1985fb2 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -190,7 +190,7 @@ std::string TimestampType::ToString() const {
 // Union type
 
 UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
-                     const std::vector<uint8_t>& type_codes, UnionMode mode)
+                     const std::vector<uint8_t>& type_codes, UnionMode::type mode)
     : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
   children_ = fields;
 }
@@ -440,10 +440,24 @@ std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fie
 }
 
 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
-                                 const std::vector<uint8_t>& type_codes, UnionMode mode) {
+                                 const std::vector<uint8_t>& type_codes,
+                                 UnionMode::type mode) {
   return std::make_shared<UnionType>(child_fields, type_codes, mode);
 }
 
+std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
+                                 UnionMode::type mode) {
+  std::vector<std::shared_ptr<Field>> types;
+  std::vector<uint8_t> type_codes;
+  uint8_t counter = 0;
+  for (const auto& child : children) {
+    types.push_back(field(std::to_string(counter), child->type()));
+    type_codes.push_back(counter);
+    counter++;
+  }
+  return union_(types, type_codes, mode);
+}
+
 std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
                                      const std::shared_ptr<Array>& dict_values,
                                      bool ordered) {
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 446f4d3a0..9e11a0344 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -517,14 +517,17 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
   int32_t scale_;
 };
 
-enum class UnionMode : char { SPARSE, DENSE };
+struct UnionMode {
+  enum type { SPARSE, DENSE };
+};
 
 class ARROW_EXPORT UnionType : public NestedType {
  public:
   static constexpr Type::type type_id = Type::UNION;
 
   UnionType(const std::vector<std::shared_ptr<Field>>& fields,
-            const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
+            const std::vector<uint8_t>& type_codes,
+            UnionMode::type mode = UnionMode::SPARSE);
 
   std::string ToString() const override;
   std::string name() const override { return "union"; }
@@ -534,10 +537,10 @@ class ARROW_EXPORT UnionType : public NestedType {
 
   const std::vector<uint8_t>& type_codes() const { return type_codes_; }
 
-  UnionMode mode() const { return mode_; }
+  UnionMode::type mode() const { return mode_; }
 
  private:
-  UnionMode mode_;
+  UnionMode::type mode_;
 
   // The type id used in the data to indicate each data type in the union. For
   // example, the first type in the union might be denoted by the id 5 (instead
@@ -842,7 +845,12 @@ struct_(const std::vector<std::shared_ptr<Field>>& fields);
 /// \brief Create an instance of Union type
 std::shared_ptr<DataType> ARROW_EXPORT
 union_(const std::vector<std::shared_ptr<Field>>& child_fields,
-       const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
+       const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);
+
+/// \brief Create an instance of Union type
+std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+       UnionMode::type mode = UnionMode::SPARSE);
 
 /// \brief Create an instance of Dictionary type
 std::shared_ptr<DataType> ARROW_EXPORT
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 1215c822d..2d7d7288b 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -36,7 +36,7 @@
                          time32, time64, timestamp, date32, date64,
                          float16, float32, float64,
                          binary, string, decimal,
-                         list_, struct, dictionary, field,
+                         list_, struct, union, dictionary, field,
                          type_for_alias,
                          DataType, NAType,
                          Field,
@@ -52,7 +52,7 @@
                          Int16Array, UInt16Array,
                          Int32Array, UInt32Array,
                          Int64Array, UInt64Array,
-                         ListArray,
+                         ListArray, UnionArray,
                          BinaryArray, StringArray,
                          FixedSizeBinaryArray,
                          DictionaryArray,
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 04a5b1368..7e5e57509 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         int64_t num_values() const
         shared_ptr[ColumnPath] path_in_schema() const
         bint is_stats_set() const
-        shared_ptr[CRowGroupStatistics] statistics() const;
+        shared_ptr[CRowGroupStatistics] statistics() const
         ParquetCompression compression() const
         const vector[ParquetEncoding]& encodings() const
 
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 7752d062a..9991411e5 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -631,6 +631,58 @@ cdef class ListArray(Array):
         return pyarrow_wrap_array(out)
 
 
+cdef class UnionArray(Array):
+
+    @staticmethod
+    def from_dense(Array types, Array value_offsets, list children):
+        """
+        Construct dense UnionArray from arrays of int8 types, int32 offsets and
+        children arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        value_offsets : Array (int32 type)
+        children : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+        cdef shared_ptr[CArray] out
+        cdef vector[shared_ptr[CArray]] c
+        cdef Array child
+        for child in children:
+            c.push_back(child.sp_array)
+        with nogil:
+            check_status(CUnionArray.MakeDense(
+                deref(types.ap), deref(value_offsets.ap), c, &out))
+        return pyarrow_wrap_array(out)
+
+    @staticmethod
+    def from_sparse(Array types, list children):
+        """
+        Construct sparse UnionArray from arrays of int8 types and children
+        arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        children : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+        cdef shared_ptr[CArray] out
+        cdef vector[shared_ptr[CArray]] c
+        cdef Array child
+        for child in children:
+            c.push_back(child.sp_array)
+        with nogil:
+            check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out))
+        return pyarrow_wrap_array(out)
+
 cdef class StringArray(Array):
     pass
 
@@ -789,6 +841,7 @@ cdef dict _array_classes = {
     _Type_FLOAT: FloatArray,
     _Type_DOUBLE: DoubleArray,
     _Type_LIST: ListArray,
+    _Type_UNION: UnionArray,
     _Type_BINARY: BinaryArray,
     _Type_STRING: StringArray,
     _Type_DICTIONARY: DictionaryArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 731ef9497..dfafd371b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_DICTIONARY" arrow::Type::DICTIONARY"
         _Type_MAP" arrow::Type::MAP"
 
+    enum UnionMode" arrow::UnionMode::type":
+        _UnionMode_SPARSE" arrow::UnionMode::SPARSE"
+        _UnionMode_DENSE" arrow::UnionMode::DENSE"
+
     enum TimeUnit" arrow::TimeUnit::type":
         TimeUnit_SECOND" arrow::TimeUnit::SECOND"
         TimeUnit_MILLI" arrow::TimeUnit::MILLI"
@@ -222,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStructType" arrow::StructType"(CDataType):
         CStructType(const vector[shared_ptr[CField]]& fields)
 
+    cdef cppclass CUnionType" arrow::UnionType"(CDataType):
+        CUnionType(const vector[shared_ptr[CField]]& fields,
+                   const vector[uint8_t]& type_codes, UnionMode mode)
+        UnionMode mode()
+
     cdef cppclass CSchema" arrow::Schema":
         CSchema(const vector[shared_ptr[CField]]& fields)
         CSchema(const vector[shared_ptr[CField]]& fields,
@@ -317,6 +326,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] values()
         shared_ptr[CDataType] value_type()
 
+    cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
+        @staticmethod
+        CStatus MakeSparse(const CArray& type_ids,
+                           const vector[shared_ptr[CArray]]& children,
+                           shared_ptr[CArray]* out)
+
+        @staticmethod
+        CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets,
+                          const vector[shared_ptr[CArray]]& children,
+                          shared_ptr[CArray]* out)
+        uint8_t* raw_type_ids()
+        int32_t value_offset(int i)
+        shared_ptr[CArray] child(int pos)
+        const CArray* UnsafeChild(int pos)
+        UnionMode mode()
+
     cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
         const uint8_t* GetValue(int i, int32_t* length)
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 8fdcf553c..531489490 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -56,6 +56,11 @@ cdef class DictionaryType(DataType):
         const CDictionaryType* dict_type
 
 
+cdef class UnionType(DataType):
+    cdef:
+        list child_types
+
+
 cdef class TimestampType(DataType):
     cdef:
         const CTimestampType* ts_type
@@ -139,6 +144,13 @@ cdef class ListValue(ArrayValue):
     cdef getitem(self, int64_t i)
 
 
+cdef class UnionValue(ArrayValue):
+    cdef:
+        CUnionArray* ap
+        list value_types
+
+    cdef getitem(self, int64_t i)
+
 cdef class StringValue(ArrayValue):
     pass
 
@@ -242,6 +254,10 @@ cdef class ListArray(Array):
     pass
 
 
+cdef class UnionArray(Array):
+    pass
+
+
 cdef class StringArray(Array):
     pass
 
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 6f4451e3f..b4ca49caf 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -92,6 +92,8 @@ Type_UNION = _Type_UNION
 Type_DICTIONARY = _Type_DICTIONARY
 Type_MAP = _Type_MAP
 
+UnionMode_SPARSE = _UnionMode_SPARSE
+UnionMode_DENSE = _UnionMode_DENSE
 
 # Exception types
 include "error.pxi"
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 9f1051228..90aff9e93 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -72,7 +72,7 @@ cdef public api object pyarrow_wrap_data_type(
     elif type.get().id() == _Type_STRUCT:
         out = StructType()
     elif type.get().id() == _Type_UNION:
-        out = StructType()
+        out = UnionType()
     elif type.get().id() == _Type_TIMESTAMP:
         out = TimestampType()
     elif type.get().id() == _Type_FIXED_SIZE_BINARY:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index c37ed3b20..a396fa763 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -315,6 +315,24 @@ cdef class ListValue(ArrayValue):
         return result
 
 
+cdef class UnionValue(ArrayValue):
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CUnionArray*> sp_array.get()
+
+    cdef getitem(self, int64_t i):
+        cdef int8_t type_id = self.ap.raw_type_ids()[i]
+        cdef shared_ptr[CArray] child = self.ap.child(type_id)
+        if self.ap.mode() == _UnionMode_SPARSE:
+            return box_scalar(self.type[type_id], child, i)
+        else:
+            return box_scalar(self.type[type_id], child,
+                              self.ap.value_offset(i))
+
+    def as_py(self):
+        return self.getitem(self.index).as_py()
+
 cdef class FixedSizeBinaryValue(ArrayValue):
 
     def as_py(self):
@@ -364,6 +382,7 @@ cdef dict _scalar_classes = {
     _Type_FLOAT: FloatValue,
     _Type_DOUBLE: DoubleValue,
     _Type_LIST: ListValue,
+    _Type_UNION: UnionValue,
     _Type_BINARY: BinaryValue,
     _Type_STRING: StringValue,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index e3a4c9756..7dc93c28e 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -235,6 +235,28 @@ def test_list_from_arrays():
     assert result.equals(expected)
 
 
+def test_union_from_dense():
+    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
+    int64 = pa.array([1, 2, 3], type='int64')
+    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
+    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')
+
+    result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
+
+    assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd']
+
+
+def test_union_from_sparse():
+    binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'],
+                      type='binary')
+    int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64')
+    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
+
+    result = pa.UnionArray.from_sparse(types, [binary, int64])
+
+    assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd']
+
+
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
 
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index d6b2655b7..116f39783 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -319,6 +319,14 @@ def test_type_schema_pickling():
             pa.field('a', 'int8'),
             pa.field('b', 'string')
         ]),
+        pa.union([
+            pa.field('a', pa.int8()),
+            pa.field('b', pa.int16())
+        ], pa.lib.UnionMode_SPARSE),
+        pa.union([
+            pa.field('a', pa.int8()),
+            pa.field('b', pa.int16())
+        ], pa.lib.UnionMode_DENSE),
         pa.time32('s'),
         pa.time64('us'),
         pa.date32(),
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index e6ff5b156..0e3ea1fd4 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -85,16 +85,17 @@ def test_is_nested_or_struct():
     assert not types.is_nested(pa.int32())
 
 
-# TODO(wesm): Union types not yet implemented in pyarrow
+def test_is_union():
+    assert types.is_union(pa.union([pa.field('a', pa.int32()),
+                                    pa.field('b', pa.int8()),
+                                    pa.field('c', pa.string())],
+                                   pa.lib.UnionMode_SPARSE))
+    assert not types.is_union(pa.list_(pa.int32()))
 
-# def test_is_union():
-#     assert types.is_union(pa.union([pa.field('a', pa.int32()),
-#                                     pa.field('b', pa.int8()),
-#                                     pa.field('c', pa.string())]))
-#     assert not types.is_union(pa.list_(pa.int32()))
 
 # TODO(wesm): is_map, once implemented
 
+
 def test_is_binary_string():
     assert types.is_binary(pa.binary())
     assert not types.is_binary(pa.string())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index c9a490960..d2e68ff79 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -186,7 +186,32 @@ cdef class UnionType(DataType):
 
     cdef void init(self, const shared_ptr[CDataType]& type):
         DataType.init(self, type)
+        self.child_types = [
+            pyarrow_wrap_data_type(type.get().child(i).get().type())
+            for i in range(self.num_children)]
 
+    property num_children:
+
+        def __get__(self):
+            return self.type.num_children()
+
+    property mode:
+
+        def __get__(self):
+            cdef CUnionType* type = <CUnionType*> self.sp_type.get()
+            return type.mode()
+
+    def __getitem__(self, i):
+        return self.child_types[i]
+
+    def __getstate__(self):
+        children = [pyarrow_wrap_field(self.type.child(i))
+                    for i in range(self.num_children)]
+        return children, self.mode
+
+    def __setstate__(self, state):
+        cdef DataType reconstituted = union(*state)
+        self.init(reconstituted.sp_type)
 
 cdef class TimestampType(DataType):
 
@@ -1056,6 +1081,30 @@ def struct(fields):
     return pyarrow_wrap_data_type(struct_type)
 
 
+def union(children_fields, mode):
+    """
+    Create UnionType from children fields.
+    """
+    cdef:
+        Field child_field
+        vector[shared_ptr[CField]] c_fields
+        vector[uint8_t] type_codes
+        shared_ptr[CDataType] union_type
+        int i
+
+    for i, child_field in enumerate(children_fields):
+        type_codes.push_back(i)
+        c_fields.push_back(child_field.sp_field)
+
+        if mode == UnionMode_SPARSE:
+            union_type.reset(new CUnionType(c_fields, type_codes,
+                                            _UnionMode_SPARSE))
+        else:
+            union_type.reset(new CUnionType(c_fields, type_codes,
+                                            _UnionMode_DENSE))
+
+    return pyarrow_wrap_data_type(union_type)
+
 cdef dict _type_aliases = {
     'null': null,
     'i1': int8,


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Add test cases and basic APIs for UnionArray
> -----------------------------------------------------
>
>                 Key: ARROW-972
>                 URL: https://issues.apache.org/jira/browse/ARROW-972
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: Python
>    Affects Versions: 0.3.0
>            Reporter: Wes McKinney
>            Assignee: Philipp Moritz
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> While this is implemented in C++, there isn't any API exposure yet in Python



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to