This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 88b72df ARROW-2073: [Python] Create struct array from sequence of
tuples
88b72df is described below
commit 88b72df27514a8ec47736bca1da010d9a2f84f3c
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Feb 8 13:22:23 2018 +0100
ARROW-2073: [Python] Create struct array from sequence of tuples
Author: Antoine Pitrou <[email protected]>
Closes #1572 from pitrou/ARROW-2073-struct-from-tuples and squashes the
following commits:
0a41ccc [Antoine Pitrou] ARROW-2073: [Python] Create struct array from
sequence of tuples
---
cpp/src/arrow/python/builtin_convert.cc | 47 ++++++++++++++++++++++------
python/benchmarks/convert_builtins.py | 21 ++++++++++---
python/pyarrow/tests/test_convert_builtin.py | 39 +++++++++++++++++++++++
3 files changed, 93 insertions(+), 14 deletions(-)
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 1e431c2..f0e5449 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -771,18 +771,21 @@ class StructConverter : public TypedConverterVisitor<StructBuilder, StructConver
// Append a non-missing item
Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(typed_builder_->Append());
- if (!PyDict_Check(obj)) {
- return Status::TypeError("dict value expected for struct type");
+ // Note heterogenous sequences are not allowed
+ if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) {
+ if (PyDict_Check(obj)) {
+ source_kind_ = DICTS;
+ } else if (PyTuple_Check(obj)) {
+ source_kind_ = TUPLES;
+ }
}
- // NOTE we're ignoring any extraneous dict items
- for (int i = 0; i < num_fields_; i++) {
- PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i);
- PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
- RETURN_IF_PYERROR();
- RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj ? valueobj : Py_None));
+ if (PyDict_Check(obj) && source_kind_ == DICTS) {
+ return AppendDictItem(obj);
+ } else if (PyTuple_Check(obj) && source_kind_ == TUPLES) {
+ return AppendTupleItem(obj);
+ } else {
+ return Status::TypeError("Expected sequence of dicts or tuples for struct type");
}
-
- return Status::OK();
}
// Append a missing item
@@ -797,9 +800,33 @@ class StructConverter : public TypedConverterVisitor<StructBuilder, StructConver
}
protected:
+ Status AppendDictItem(PyObject* obj) {
+ // NOTE we're ignoring any extraneous dict items
+ for (int i = 0; i < num_fields_; i++) {
+ PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i);
+ PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
+ RETURN_IF_PYERROR();
+ RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj ? valueobj : Py_None));
+ }
+ return Status::OK();
+ }
+
+ Status AppendTupleItem(PyObject* obj) {
+ if (PyTuple_GET_SIZE(obj) != num_fields_) {
+ return Status::Invalid("Tuple size must be equal to number of struct fields");
+ }
+ for (int i = 0; i < num_fields_; i++) {
+ PyObject* valueobj = PyTuple_GET_ITEM(obj, i);
+ RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj));
+ }
+ return Status::OK();
+ }
+
std::vector<std::unique_ptr<SeqConverter>> value_converters_;
OwnedRef field_name_list_;
int num_fields_;
+ // Whether we're converting from a sequence of dicts or tuples
+ enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN;
};
class DecimalConverter
diff --git a/python/benchmarks/convert_builtins.py b/python/benchmarks/convert_builtins.py
index 92b2b85..a4dc9f2 100644
--- a/python/benchmarks/convert_builtins.py
+++ b/python/benchmarks/convert_builtins.py
@@ -144,11 +144,21 @@ class BuiltinsGenerator(object):
partial(self.generate_int_list, none_prob=none_prob),
n, min_size, max_size, none_prob)
+ def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
+ """
+ Generate a list of tuples with random values.
+ Each tuple has the form `(int value, float value, bool value)`
+ """
+ dicts = self.generate_dict_list(n, none_prob=none_prob)
+ tuples = [(d.get('u'), d.get('v'), d.get('w'))
+ if d is not None else None
+ for d in dicts]
+ assert len(tuples) == n
+ return tuples
def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
"""
- Generate a list of dicts with a random size between *min_size* and
- *max_size*.
+ Generate a list of dicts with random values.
Each dict has the form `{'u': int value, 'v': float value, 'w': bool value}`
"""
ints = self.generate_int_list(n, none_prob=none_prob)
@@ -179,12 +189,14 @@ class BuiltinsGenerator(object):
"""
size = None
- if type_name in ('bool', 'ascii', 'unicode', 'int64 list', 'struct'):
+ if type_name in ('bool', 'ascii', 'unicode', 'int64 list'):
kind = type_name
elif type_name.startswith(('int', 'uint')):
kind = 'int'
elif type_name.startswith('float'):
kind = 'float'
+ elif type_name.startswith('struct'):
+ kind = 'struct'
elif type_name == 'binary':
kind = 'varying binary'
elif type_name.startswith('binary'):
@@ -226,6 +238,7 @@ class BuiltinsGenerator(object):
'int64 list': partial(self.generate_int_list_list,
min_size=0, max_size=20),
'struct': self.generate_dict_list,
+ 'struct from tuples': self.generate_tuple_list,
}
data = factories[kind](n)
return ty, data
@@ -239,7 +252,7 @@ class ConvertPyListToArray(object):
types = ('int32', 'uint32', 'int64', 'uint64',
'float32', 'float64', 'bool',
'binary', 'binary10', 'ascii', 'unicode',
- 'int64 list', 'struct')
+ 'int64 list', 'struct', 'struct from tuples')
param_names = ['type']
params = [types]
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index ce54f23..5cd4a52 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -531,6 +531,45 @@ def test_struct_from_dicts():
assert arr.to_pylist() == expected
+def test_struct_from_tuples():
+ ty = pa.struct([pa.field('a', pa.int32()),
+ pa.field('b', pa.string()),
+ pa.field('c', pa.bool_())])
+
+ data = [(5, 'foo', True),
+ (6, 'bar', False)]
+ expected = [{'a': 5, 'b': 'foo', 'c': True},
+ {'a': 6, 'b': 'bar', 'c': False}]
+ arr = pa.array(data, type=ty)
+ assert arr.to_pylist() == expected
+
+ # With omitted values
+ data = [(5, 'foo', None),
+ None,
+ (6, None, False)]
+ expected = [{'a': 5, 'b': 'foo', 'c': None},
+ None,
+ {'a': 6, 'b': None, 'c': False}]
+ arr = pa.array(data, type=ty)
+ assert arr.to_pylist() == expected
+
+ # Invalid tuple size
+ for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
+ with pytest.raises(ValueError, match="(?i)tuple size"):
+ pa.array([tup], type=ty)
+
+
+def test_struct_from_mixed_sequence():
+ # It is forbidden to mix dicts and tuples when initializing a struct array
+ ty = pa.struct([pa.field('a', pa.int32()),
+ pa.field('b', pa.string()),
+ pa.field('c', pa.bool_())])
+ data = [(5, 'foo', True),
+ {'a': 6, 'b': 'bar', 'c': False}]
+ with pytest.raises(TypeError):
+ pa.array(data, type=ty)
+
+
def test_structarray_from_arrays_coerce():
# ARROW-1706
ints = [None, 2, 3]
--
To stop receiving notification emails like this one, please contact
[email protected].