[
https://issues.apache.org/jira/browse/ARROW-1705?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16345239#comment-16345239
]
ASF GitHub Bot commented on ARROW-1705:
---------------------------------------
pitrou commented on a change in pull request #1530: ARROW-1705: [Python] allow
building array from dicts
URL: https://github.com/apache/arrow/pull/1530#discussion_r164780782
##########
File path: cpp/src/arrow/python/builtin_convert.cc
##########
@@ -369,63 +384,55 @@ class TypedConverter : public SeqConverter {
template <typename BuilderType, class Derived>
class TypedConverterVisitor : public TypedConverter<BuilderType> {
public:
- Status AppendData(PyObject* obj, int64_t size) override {
+ Status AppendSingle(PyObject* obj) override {
+ // static_cast<> below devirtualizes the method calls (for performance?)
+ if (obj == Py_None) {
+ return static_cast<Derived*>(this)->AppendNull();
+ } else {
+ return static_cast<Derived*>(this)->AppendItem(obj);
+ }
+ }
+
+ Status AppendMultiple(PyObject* obj, int64_t size) override {
/// Ensure we've allocated enough space
RETURN_NOT_OK(this->typed_builder_->Reserve(size));
// Iterate over the items adding each one
if (PySequence_Check(obj)) {
for (int64_t i = 0; i < size; ++i) {
OwnedRef ref(PySequence_GetItem(obj, i));
- if (ref.obj() == Py_None) {
- RETURN_NOT_OK(this->typed_builder_->AppendNull());
- } else {
- RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
- }
- }
- } else if (PyObject_HasAttrString(obj, "__iter__")) {
- PyObject* iter = PyObject_GetIter(obj);
- OwnedRef iter_ref(iter);
- PyObject* item;
- int64_t i = 0;
- // To allow people with long generators to only convert a subset, stop
- // consuming at size.
- while ((item = PyIter_Next(iter)) && i < size) {
- OwnedRef ref(item);
- if (ref.obj() == Py_None) {
- RETURN_NOT_OK(this->typed_builder_->AppendNull());
- } else {
- RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
- }
- ++i;
- }
- if (size != i) {
- RETURN_NOT_OK(this->typed_builder_->Resize(i));
+ RETURN_NOT_OK(static_cast<Derived*>(this)->AppendSingle(ref.obj()));
}
} else {
- return Status::TypeError("Object is not a sequence or iterable");
+ return Status::TypeError("Object is not a sequence");
}
return Status::OK();
}
+
+ protected:
+ // Append a non-missing item
+ virtual Status AppendItem(PyObject* obj) = 0;
Review comment:
Hmm, actually you may be right that even the `static_cast<>` won't be enough
to devirtualize the method call (since we may be given a derived type defined
somewhere).
I ran a simple micro-benchmark:
```
$ python -m timeit -s "import pyarrow as pa; ty=pa.int64();
data=[42]*1000000" "pa.array(data, type=ty)"
```
* before: 10 loops, best of 3: 42.4 msec per loop
* after: 10 loops, best of 3: 45.5 msec per loop
So there is a slight slowdown indeed...
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Create StructArray from sequence of dicts given a known data type
> --------------------------------------------------------------------------
>
> Key: ARROW-1705
> URL: https://issues.apache.org/jira/browse/ARROW-1705
> Project: Apache Arrow
> Issue Type: New Feature
> Components: Python
> Reporter: Wes McKinney
> Assignee: Antoine Pitrou
> Priority: Major
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> See https://github.com/apache/arrow/issues/1217
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)