This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f6bfa7b292 GH-39010: [Python] Introduce `maps_as_pydicts` parameter
for `to_pylist`, `to_pydict`, `as_py` (#45471)
f6bfa7b292 is described below
commit f6bfa7b292097359b1ab743fc6b2253feac3fa72
Author: Jonas Dedden <[email protected]>
AuthorDate: Thu Feb 20 16:17:48 2025 +0100
GH-39010: [Python] Introduce `maps_as_pydicts` parameter for `to_pylist`,
`to_pydict`, `as_py` (#45471)
### Rationale for this change
Currently, `MapScalar`/`Array` types are not deserialized into proper
Python `dict`s, which breaks round trips
from Python -> Arrow -> Python:
```
import pyarrow as pa
schema = pa.schema([pa.field('x', pa.map_(pa.string(), pa.int64()))])
data = [{'x': {'a': 1}}]
pa.RecordBatch.from_pylist(data, schema=schema).to_pylist()
# [{'x': [('a', 1)]}]
```
This is especially bad when storing TiBs of deeply nested data (think of
lists in structs in maps...) that were created from Python and serialized into
Arrow/Parquet, since they can't be read in again with native `pyarrow` methods
without doing extremely ugly and computationally costly workarounds.
### What changes are included in this PR?
A new parameter `maps_as_pydicts` is added to `to_pylist`,
`to_pydict`, and `as_py`, which allows proper round trips:
```
import pyarrow as pa
schema = pa.schema([pa.field('x', pa.map_(pa.string(), pa.int64()))])
data = [{'x': {'a': 1}}]
pa.RecordBatch.from_pylist(data,
schema=schema).to_pylist(maps_as_pydicts="strict")
# [{'x': {'a': 1}}]
```
### Are these changes tested?
Yes. Tests for `to_pylist` and `to_pydict` are included for
`pyarrow.Table`; the low-level `MapScalar` conversion, including nesting inside
`ListScalar` and `StructScalar`, is tested as well.
Duplicate keys now raise an error in 'strict' mode (and emit a warning in 'lossy' mode), which is also tested.
### Are there any user-facing changes?
No call sites should break; the only change is a new keyword-only optional
parameter.
* GitHub Issue: #39010
Authored-by: Jonas Dedden <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/array.pxi | 26 +-
python/pyarrow/scalar.pxi | 398 ++++++++++++++++++++++++----
python/pyarrow/table.pxi | 54 +++-
python/pyarrow/tests/test_extension_type.py | 2 +-
python/pyarrow/tests/test_scalars.py | 28 ++
python/pyarrow/tests/test_table.py | 20 ++
6 files changed, 473 insertions(+), 55 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 2ef42051d9..91770a5219 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1651,16 +1651,30 @@ cdef class Array(_PandasConvertible):
array = array.copy()
return array
- def to_pylist(self):
+ def to_pylist(self, *, maps_as_pydicts=None):
"""
Convert to a list of native Python objects.
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
+
Returns
-------
lst : list
"""
self._assert_cpu()
- return [x.as_py() for x in self]
+ return [x.as_py(maps_as_pydicts=maps_as_pydicts) for x in self]
def tolist(self):
"""
@@ -2286,12 +2300,18 @@ cdef class MonthDayNanoIntervalArray(Array):
Concrete class for Arrow arrays of interval[MonthDayNano] type.
"""
- def to_pylist(self):
+ def to_pylist(self, *, maps_as_pydicts=None):
"""
Convert to a list of native Python objects.
pyarrow.MonthDayNano is used as the native representation.
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
+
Returns
-------
lst : list
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index e877b0965d..04442c1f5d 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -16,6 +16,7 @@
# under the License.
import collections
+import warnings
from uuid import UUID
@@ -148,7 +149,24 @@ cdef class Scalar(_Weakrefable):
def __reduce__(self):
return scalar, (self.as_py(), self.type)
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
+ """
+ Return this value as a Python representation.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
+ """
raise NotImplementedError()
@@ -169,9 +187,15 @@ cdef class NullScalar(Scalar):
def __init__(self):
pass
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python None.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
return None
@@ -184,9 +208,15 @@ cdef class BooleanScalar(Scalar):
Concrete class for boolean scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python bool.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CBooleanScalar* sp = <CBooleanScalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -197,9 +227,15 @@ cdef class UInt8Scalar(Scalar):
Concrete class for uint8 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CUInt8Scalar* sp = <CUInt8Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -210,9 +246,15 @@ cdef class Int8Scalar(Scalar):
Concrete class for int8 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CInt8Scalar* sp = <CInt8Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -223,9 +265,15 @@ cdef class UInt16Scalar(Scalar):
Concrete class for uint16 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CUInt16Scalar* sp = <CUInt16Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -236,9 +284,15 @@ cdef class Int16Scalar(Scalar):
Concrete class for int16 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CInt16Scalar* sp = <CInt16Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -249,9 +303,15 @@ cdef class UInt32Scalar(Scalar):
Concrete class for uint32 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CUInt32Scalar* sp = <CUInt32Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -262,9 +322,15 @@ cdef class Int32Scalar(Scalar):
Concrete class for int32 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CInt32Scalar* sp = <CInt32Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -275,9 +341,15 @@ cdef class UInt64Scalar(Scalar):
Concrete class for uint64 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CUInt64Scalar* sp = <CUInt64Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -288,9 +360,15 @@ cdef class Int64Scalar(Scalar):
Concrete class for int64 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python int.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CInt64Scalar* sp = <CInt64Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -301,9 +379,15 @@ cdef class HalfFloatScalar(Scalar):
Concrete class for float scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python float.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CHalfFloatScalar* sp = <CHalfFloatScalar*> self.wrapped.get()
return PyHalf_FromHalf(sp.value) if sp.is_valid else None
@@ -314,9 +398,15 @@ cdef class FloatScalar(Scalar):
Concrete class for float scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python float.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CFloatScalar* sp = <CFloatScalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -327,9 +417,15 @@ cdef class DoubleScalar(Scalar):
Concrete class for double scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python float.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CDoubleScalar* sp = <CDoubleScalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
@@ -340,9 +436,15 @@ cdef class Decimal32Scalar(Scalar):
Concrete class for decimal32 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python Decimal.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CDecimal32Scalar* sp = <CDecimal32Scalar*> self.wrapped.get()
@@ -360,9 +462,15 @@ cdef class Decimal64Scalar(Scalar):
Concrete class for decimal64 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python Decimal.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CDecimal64Scalar* sp = <CDecimal64Scalar*> self.wrapped.get()
@@ -380,9 +488,15 @@ cdef class Decimal128Scalar(Scalar):
Concrete class for decimal128 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python Decimal.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CDecimal128Scalar* sp = <CDecimal128Scalar*> self.wrapped.get()
@@ -400,9 +514,15 @@ cdef class Decimal256Scalar(Scalar):
Concrete class for decimal256 scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python Decimal.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CDecimal256Scalar* sp = <CDecimal256Scalar*> self.wrapped.get()
@@ -425,9 +545,15 @@ cdef class Date32Scalar(Scalar):
cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python datetime.datetime instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
@@ -450,9 +576,15 @@ cdef class Date64Scalar(Scalar):
cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python datetime.datetime instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
@@ -504,9 +636,15 @@ cdef class Time32Scalar(Scalar):
cdef CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python datetime.timedelta instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
@@ -528,9 +666,15 @@ cdef class Time64Scalar(Scalar):
cdef CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python datetime.timedelta instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
@@ -552,11 +696,17 @@ cdef class TimestampScalar(Scalar):
cdef CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Pandas Timestamp instance (if units are
nanoseconds and pandas is available), otherwise as a Python
datetime.datetime instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
@@ -600,11 +750,17 @@ cdef class DurationScalar(Scalar):
cdef CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
return sp.value if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Pandas Timedelta instance (if units are
nanoseconds and pandas is available), otherwise as a Python
datetime.timedelta instance.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
@@ -647,9 +803,15 @@ cdef class MonthDayNanoIntervalScalar(Scalar):
"""
return self.as_py()
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a pyarrow.MonthDayNano.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
cdef:
PyObject* val
@@ -672,9 +834,15 @@ cdef class BinaryScalar(Scalar):
cdef CBaseBinaryScalar* sp = <CBaseBinaryScalar*> self.wrapped.get()
return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python bytes.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
buffer = self.as_buffer()
return None if buffer is None else buffer.to_pybytes()
@@ -693,9 +861,15 @@ cdef class StringScalar(BinaryScalar):
Concrete class for string-like (utf8) scalars.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python string.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
buffer = self.as_buffer()
return None if buffer is None else str(buffer, 'utf8')
@@ -744,12 +918,26 @@ cdef class ListScalar(Scalar):
"""
return iter(self.values)
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python list.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
arr = self.values
- return None if arr is None else arr.to_pylist()
+ return None if arr is None else
arr.to_pylist(maps_as_pydicts=maps_as_pydicts)
cdef class FixedSizeListScalar(ListScalar):
@@ -824,13 +1012,27 @@ cdef class StructScalar(Scalar, collections.abc.Mapping):
else:
raise KeyError(key) from exc
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this value as a Python dict.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
if self.is_valid:
try:
- return {k: self[k].as_py() for k in self.keys()}
+ return {k: self[k].as_py(maps_as_pydicts=maps_as_pydicts) for
k in self.keys()}
except KeyError:
raise ValueError(
"Converting to Python dictionary is not supported when "
@@ -880,12 +1082,47 @@ cdef class MapScalar(ListScalar):
for k, v in zip(arr.field(self.type.key_field.name),
arr.field(self.type.item_field.name)):
yield (k.as_py(), v.as_py())
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
- Return this value as a Python list.
+ Return this value as a Python list or dict, depending on
'maps_as_pydicts'.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
- cdef CStructScalar* sp = <CStructScalar*> self.wrapped.get()
- return list(self) if sp.is_valid else None
+ if maps_as_pydicts not in (None, "lossy", "strict"):
+ raise ValueError(
+ "Invalid value for 'maps_as_pydicts': "
+ + "valid values are 'lossy', 'strict' or `None` (default). "
+ + f"Received {maps_as_pydicts!r}."
+ )
+ if not self.is_valid:
+ return None
+ if not maps_as_pydicts:
+ return list(self)
+ result_dict = {}
+ for key, value in self:
+ if key in result_dict:
+ if maps_as_pydicts == "strict":
+ raise KeyError(
+ "Converting to Python dictionary is not supported in
strict mode "
+ f"when duplicate keys are present (duplicate key was
'{key}')."
+ )
+ else:
+ warnings.warn(
+ f"Encountered key '{key}' which was already
encountered.")
+ result_dict[key] = value
+ return result_dict
cdef class DictionaryScalar(Scalar):
@@ -958,11 +1195,25 @@ cdef class DictionaryScalar(Scalar):
cdef CDictionaryScalar* sp = <CDictionaryScalar*> self.wrapped.get()
return pyarrow_wrap_array(sp.value.dictionary)
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this encoded value as a Python object.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
- return self.value.as_py() if self.is_valid else None
+ return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if
self.is_valid else None
cdef class RunEndEncodedScalar(Scalar):
@@ -977,11 +1228,25 @@ cdef class RunEndEncodedScalar(Scalar):
cdef CRunEndEncodedScalar* sp = <CRunEndEncodedScalar*>
self.wrapped.get()
return Scalar.wrap(sp.value)
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return underlying value as a Python object.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
- return self.value.as_py()
+ return self.value.as_py(maps_as_pydicts=maps_as_pydicts)
cdef class UnionScalar(Scalar):
@@ -1003,12 +1268,26 @@ cdef class UnionScalar(Scalar):
dp = <CDenseUnionScalar*> self.wrapped.get()
return Scalar.wrap(dp.value) if dp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return underlying value as a Python object.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
value = self.value
- return None if value is None else value.as_py()
+ return None if value is None else
value.as_py(maps_as_pydicts=maps_as_pydicts)
@property
def type_code(self):
@@ -1032,11 +1311,25 @@ cdef class ExtensionScalar(Scalar):
cdef CExtensionScalar* sp = <CExtensionScalar*> self.wrapped.get()
return Scalar.wrap(sp.value) if sp.is_valid else None
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this scalar as a Python object.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
"""
- return None if self.value is None else self.value.as_py()
+ return None if self.value is None else
self.value.as_py(maps_as_pydicts=maps_as_pydicts)
@staticmethod
def from_storage(BaseExtensionType typ, value):
@@ -1093,7 +1386,16 @@ class UuidScalar(ExtensionScalar):
Concrete class for Uuid extension scalar.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
+ """
+ Return this scalar as a Python UUID.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
+ """
return None if self.value is None else UUID(bytes=self.value.as_py())
@@ -1150,9 +1452,15 @@ cdef class Bool8Scalar(ExtensionScalar):
Concrete class for bool8 extension scalar.
"""
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
"""
Return this scalar as a Python object.
+
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ This parameter is ignored for non-nested Scalars.
"""
py_val = super().as_py()
return None if py_val is None else py_val != 0
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index af241e4be0..5a6cd39048 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1349,10 +1349,24 @@ cdef class ChunkedArray(_PandasConvertible):
for i in range(self.num_chunks):
yield self.chunk(i)
- def to_pylist(self):
+ def to_pylist(self, *, maps_as_pydicts=None):
"""
Convert to a list of native Python objects.
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
+
Examples
--------
>>> import pyarrow as pa
@@ -1363,7 +1377,7 @@ cdef class ChunkedArray(_PandasConvertible):
self._assert_cpu()
result = []
for i in range(self.num_chunks):
- result += self.chunk(i).to_pylist()
+ result += self.chunk(i).to_pylist(maps_as_pydicts=maps_as_pydicts)
return result
def __arrow_c_stream__(self, requested_schema=None):
@@ -2255,10 +2269,24 @@ cdef class _Tabular(_PandasConvertible):
else:
return _pc().filter(self, mask, null_selection_behavior)
- def to_pydict(self):
+ def to_pydict(self, *, maps_as_pydicts=None):
"""
Convert the Table or RecordBatch to a dict or OrderedDict.
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
+
Returns
-------
dict
@@ -2277,14 +2305,28 @@ cdef class _Tabular(_PandasConvertible):
entries = []
for i in range(self.num_columns):
name = self.field(i).name
- column = self[i].to_pylist()
+ column = self[i].to_pylist(maps_as_pydicts=maps_as_pydicts)
entries.append((name, column))
return ordered_dict(entries)
- def to_pylist(self):
+ def to_pylist(self, *, maps_as_pydicts=None):
"""
Convert the Table or RecordBatch to a list of rows / dictionaries.
+ Parameters
+ ----------
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+
+ If 'lossy', whenever duplicate keys are detected, a warning will
be printed.
+ The last seen value of a duplicate key will be in the Python
dictionary.
+ If 'strict', this instead results in an exception being raised
when detected.
+
Returns
-------
list
@@ -2300,7 +2342,7 @@ cdef class _Tabular(_PandasConvertible):
>>> table.to_pylist()
[{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals':
'Horse'}, ...
"""
- pydict = self.to_pydict()
+ pydict = self.to_pydict(maps_as_pydicts=maps_as_pydicts)
names = self.schema.names
pylist = [{column: pydict[column][row] for column in names}
for row in range(self.num_rows)]
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 634d9ce2d8..185b5bb424 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -99,7 +99,7 @@ class IntegerEmbeddedType(pa.ExtensionType):
class ExampleUuidScalarType(pa.ExtensionScalar):
- def as_py(self):
+ def as_py(self, *, maps_as_pydicts=None):
return None if self.value is None else UUID(bytes=self.value.as_py())
diff --git a/python/pyarrow/tests/test_scalars.py
b/python/pyarrow/tests/test_scalars.py
index 3f4a53c473..29db36eddc 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -786,6 +786,22 @@ def test_map(pickle_module):
restored = pickle_module.loads(pickle_module.dumps(s))
assert restored.equals(s)
+ assert s.as_py(maps_as_pydicts="strict") == {'a': 1, 'b': 2}
+
+
+def test_map_duplicate_fields():
+ ty = pa.map_(pa.string(), pa.int8())
+ v = [('a', 1), ('a', 2)]
+ s = pa.scalar(v, type=ty)
+
+ assert s.as_py(maps_as_pydicts=None) == v
+
+ with pytest.raises(KeyError):
+ assert s.as_py(maps_as_pydicts="strict")
+
+ with pytest.warns(match="Encountered key 'a' which was already
encountered"):
+ assert s.as_py(maps_as_pydicts="lossy") == {'a': 2}
+
def test_dictionary(pickle_module):
indices = pa.array([2, None, 1, 2, 0, None])
@@ -898,3 +914,15 @@ def test_map_scalar_as_py_with_custom_field_name():
pa.field("custom_value", pa.string()),
),
).as_py() == [("foo", "bar")]
+
+
+def test_nested_map_types_with_maps_as_pydicts():
+ ty = pa.struct([
+ pa.field('x', pa.map_(pa.string(), pa.int8())),
+ pa.field('y', pa.list_(pa.map_(pa.string(), pa.int8()))),
+ ])
+
+ v = {'x': {'a': 1}, 'y': [{'b': 2}, {'c': 3}]}
+ s = pa.scalar(v, type=ty)
+
+ assert s.as_py(maps_as_pydicts="strict") == v
diff --git a/python/pyarrow/tests/test_table.py
b/python/pyarrow/tests/test_table.py
index 4c058ccecd..180ae7b4c1 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -1888,6 +1888,26 @@ def test_table_unify_dictionaries():
assert table.schema.metadata == {b"key1": b"value1"}
+def test_table_maps_as_pydicts():
+ arrays = [
+ pa.array(
+ [{'x': 1, 'y': 2}, {'z': 3}],
+ type=pa.map_(pa.string(), pa.int32())
+ )
+ ]
+ table = pa.Table.from_arrays(arrays, names=['a'])
+
+ table_dict = table.to_pydict(maps_as_pydicts="strict")
+ assert 'a' in table_dict
+ column_list = table_dict['a']
+ assert len(column_list) == 2
+ assert column_list == [{'x': 1, 'y': 2}, {'z': 3}]
+
+ table_list = table.to_pylist(maps_as_pydicts="strict")
+ assert len(table_list) == 2
+ assert table_list == [{'a': {'x': 1, 'y': 2}}, {'a': {'z': 3}}]
+
+
def test_concat_tables():
data = [
list(range(5)),