[
https://issues.apache.org/jira/browse/ARROW-2142?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16389431#comment-16389431
]
ASF GitHub Bot commented on ARROW-2142:
---------------------------------------
pitrou commented on a change in pull request #1635: ARROW-2142: [Python] Allow
conversion from Numpy struct array
URL: https://github.com/apache/arrow/pull/1635#discussion_r172814583
##########
File path: cpp/src/arrow/python/numpy_to_arrow.cc
##########
@@ -1590,6 +1592,85 @@ Status NumPyConverter::Visit(const StringType& type) {
return PushArray(result->data());
}
+Status NumPyConverter::Visit(const StructType& type) {
+ std::vector<NumPyConverter> sub_converters;
+ std::vector<OwnedRefNoGIL> sub_arrays;
+
+ {
+ PyAcquireGIL gil_lock;
+
+ // Create converters for each struct type field
+ if (dtype_->fields == NULL || !PyDict_Check(dtype_->fields)) {
+ return Status::TypeError("Expected struct array");
+ }
+
+ for (auto field : type.children()) {
+ PyObject* tup = PyDict_GetItemString(dtype_->fields,
field->name().c_str());
+ if (tup == NULL) {
+ std::stringstream ss;
+ ss << "Missing field '" << field->name() << "' in struct array";
+ return Status::TypeError(ss.str());
+ }
+ PyArray_Descr* sub_dtype =
+ reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
+ DCHECK(PyArray_DescrCheck(sub_dtype));
+ int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
+ RETURN_IF_PYERROR();
+ Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
+ PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
+ RETURN_IF_PYERROR();
+ sub_arrays.emplace_back(sub_array);
+ sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */,
field->type(),
+ use_pandas_null_sentinels_);
+ }
+ }
+
+ std::vector<ArrayVector> groups;
+
+ // Compute null bitmap and store it as a Null Array to include it
+ // in the rechunking below
+ {
+ int64_t null_count = 0;
+ if (mask_ != nullptr) {
+ RETURN_NOT_OK(InitNullBitmap());
+ null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
+ }
+ auto null_data = ArrayData::Make(std::make_shared<NullType>(), length_,
+ {null_bitmap_}, null_count, 0);
+ DCHECK_EQ(null_data->buffers.size(), 1);
+ groups.push_back({std::make_shared<NullArray>(null_data)});
+ }
+
+ // Convert child data
+ for (auto& converter : sub_converters) {
+ RETURN_NOT_OK(converter.Convert());
+ groups.push_back(converter.result());
+ }
+ // Ensure the different array groups are chunked consistently
+ groups = ::arrow::internal::RechunkArraysConsistently(groups);
+
+ // Make struct array chunks by combining groups
+ size_t ngroups = groups.size();
+ size_t chunk, nchunks = groups[0].size();
+ for (chunk = 0; chunk < nchunks; chunk++) {
+ // Create struct array chunk and populate it
+ // First group has the null bitmaps as Null Arrays
+ auto null_data = groups[0][chunk]->data();
+ DCHECK_EQ(null_data->type->id(), Type::NA);
+ DCHECK_EQ(null_data->buffers.size(), 1);
+
+ auto arr_data = ArrayData::Make(type_, length_, null_data->null_count, 0);
Review comment:
Is it problematic to have `null_count == -1`? As I understand it, this is
a supported condition (i.e. "I don't know the exact number of nulls; compute
it from the null bitmap when necessary").
Understood about the offset. Indeed, testing it may require passing a
large amount of data...
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [Python] Conversion from Numpy struct array unimplemented
> ---------------------------------------------------------
>
> Key: ARROW-2142
> URL: https://issues.apache.org/jira/browse/ARROW-2142
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Affects Versions: 0.8.0
> Reporter: Antoine Pitrou
> Assignee: Antoine Pitrou
> Priority: Major
> Labels: pull-request-available
>
> {code:python}
> >>> arr = np.array([(1.5,)], dtype=np.dtype([('x', np.float32)]))
> >>> arr
> array([(1.5,)], dtype=[('x', '<f4')])
> >>> arr[0]
> (1.5,)
> >>> arr['x']
> array([1.5], dtype=float32)
> >>> arr['x'][0]
> 1.5
> >>> pa.array(arr, type=pa.struct([pa.field('x', pa.float32())]))
> Traceback (most recent call last):
> File "<ipython-input-18-27a52820b7d8>", line 1, in <module>
> pa.array(arr, type=pa.struct([pa.field('x', pa.float32())]))
> File "array.pxi", line 177, in pyarrow.lib.array
> File "error.pxi", line 77, in pyarrow.lib.check_status
> File "error.pxi", line 85, in pyarrow.lib.check_status
> ArrowNotImplementedError:
> /home/antoine/arrow/cpp/src/arrow/python/numpy_to_arrow.cc:1585 code:
> converter.Convert()
> NumPyConverter doesn't implement <struct<x: float>> conversion.
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)