(arrow) branch main updated: GH-39010: [Python] Introduce `maps_as_pydicts` parameter for `to_pylist`, `to_pydict`, `as_py` (#45471)

apitrou Thu, 20 Feb 2025 08:37:13 -0800

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new f6bfa7b292 GH-39010: [Python] Introduce `maps_as_pydicts` parameter 
for `to_pylist`, `to_pydict`, `as_py` (#45471)
f6bfa7b292 is described below

commit f6bfa7b292097359b1ab743fc6b2253feac3fa72
Author: Jonas Dedden <[email protected]>
AuthorDate: Thu Feb 20 16:17:48 2025 +0100

    GH-39010: [Python] Introduce `maps_as_pydicts` parameter for `to_pylist`, 
`to_pydict`, `as_py` (#45471)
    
    ### Rationale for this change
    
    Currently, `MapScalar`/`Array` types are not deserialized into proper
Python `dict`s, which is unfortunate since this breaks "roundtrips" from
Python -> Arrow -> Python:
    
    ```
    import pyarrow as pa
    
    schema = pa.schema([pa.field('x', pa.map_(pa.string(), pa.int64()))])
    data = [{'x': {'a': 1}}]
    pa.RecordBatch.from_pylist(data, schema=schema).to_pylist()
    # [{'x': [('a', 1)]}]
    ```
    
    This is especially bad when storing TiBs of deeply nested data (think of 
lists in structs in maps...) that were created from Python and serialized into 
Arrow/Parquet, since they can't be read in again with native `pyarrow` methods 
without doing extremely ugly and computationally costly workarounds.
    
    ### What changes are included in this PR?
    
    A new parameter `maps_as_pydicts` is added to `to_pylist`, `to_pydict`
and `as_py`, which allows proper roundtrips:
    
    ```
    import pyarrow as pa
    
    schema = pa.schema([pa.field('x', pa.map_(pa.string(), pa.int64()))])
    data = [{'x': {'a': 1}}]
    pa.RecordBatch.from_pylist(data, 
schema=schema).to_pylist(maps_as_pydicts="strict")
    # [{'x': {'a': 1}}]
    ```
    
    ### Are these changes tested?
    
    Yes. Tests for `to_pylist` and `to_pydict` are included for
`pyarrow.Table`. The low-level `MapScalar` conversion is tested as well, in
particular when nested with `ListScalar` and `StructScalar`.
    
    Also, duplicate keys now raise an error when `maps_as_pydicts="strict"` is used, which is also tested.
    
    ### Are there any user-facing changes?
    
    No call sites should break; the change only adds a new keyword-only
optional parameter.
    * GitHub Issue: #39010
    
    Authored-by: Jonas Dedden <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/array.pxi                    |  26 +-
 python/pyarrow/scalar.pxi                   | 398 ++++++++++++++++++++++++----
 python/pyarrow/table.pxi                    |  54 +++-
 python/pyarrow/tests/test_extension_type.py |   2 +-
 python/pyarrow/tests/test_scalars.py        |  28 ++
 python/pyarrow/tests/test_table.py          |  20 ++
 6 files changed, 473 insertions(+), 55 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 2ef42051d9..91770a5219 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1651,16 +1651,30 @@ cdef class Array(_PandasConvertible):
             array = array.copy()
         return array
 
-    def to_pylist(self):
+    def to_pylist(self, *, maps_as_pydicts=None):
         """
         Convert to a list of native Python objects.
 
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
+
         Returns
         -------
         lst : list
         """
         self._assert_cpu()
-        return [x.as_py() for x in self]
+        return [x.as_py(maps_as_pydicts=maps_as_pydicts) for x in self]
 
     def tolist(self):
         """
@@ -2286,12 +2300,18 @@ cdef class MonthDayNanoIntervalArray(Array):
     Concrete class for Arrow arrays of interval[MonthDayNano] type.
     """
 
-    def to_pylist(self):
+    def to_pylist(self, *, maps_as_pydicts=None):
         """
         Convert to a list of native Python objects.
 
         pyarrow.MonthDayNano is used as the native representation.
 
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
+
         Returns
         -------
         lst : list
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index e877b0965d..04442c1f5d 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -16,6 +16,7 @@
 # under the License.
 
 import collections
+import warnings
 from uuid import UUID
 
 
@@ -148,7 +149,24 @@ cdef class Scalar(_Weakrefable):
     def __reduce__(self):
         return scalar, (self.as_py(), self.type)
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
+        """
+        Return this value as a Python representation.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
+        """
         raise NotImplementedError()
 
 
@@ -169,9 +187,15 @@ cdef class NullScalar(Scalar):
     def __init__(self):
         pass
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python None.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         return None
 
@@ -184,9 +208,15 @@ cdef class BooleanScalar(Scalar):
     Concrete class for boolean scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python bool.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CBooleanScalar* sp = <CBooleanScalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -197,9 +227,15 @@ cdef class UInt8Scalar(Scalar):
     Concrete class for uint8 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CUInt8Scalar* sp = <CUInt8Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -210,9 +246,15 @@ cdef class Int8Scalar(Scalar):
     Concrete class for int8 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CInt8Scalar* sp = <CInt8Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -223,9 +265,15 @@ cdef class UInt16Scalar(Scalar):
     Concrete class for uint16 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CUInt16Scalar* sp = <CUInt16Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -236,9 +284,15 @@ cdef class Int16Scalar(Scalar):
     Concrete class for int16 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CInt16Scalar* sp = <CInt16Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -249,9 +303,15 @@ cdef class UInt32Scalar(Scalar):
     Concrete class for uint32 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CUInt32Scalar* sp = <CUInt32Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -262,9 +322,15 @@ cdef class Int32Scalar(Scalar):
     Concrete class for int32 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CInt32Scalar* sp = <CInt32Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -275,9 +341,15 @@ cdef class UInt64Scalar(Scalar):
     Concrete class for uint64 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CUInt64Scalar* sp = <CUInt64Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -288,9 +360,15 @@ cdef class Int64Scalar(Scalar):
     Concrete class for int64 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python int.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CInt64Scalar* sp = <CInt64Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -301,9 +379,15 @@ cdef class HalfFloatScalar(Scalar):
     Concrete class for float scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python float.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CHalfFloatScalar* sp = <CHalfFloatScalar*> self.wrapped.get()
         return PyHalf_FromHalf(sp.value) if sp.is_valid else None
@@ -314,9 +398,15 @@ cdef class FloatScalar(Scalar):
     Concrete class for float scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python float.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CFloatScalar* sp = <CFloatScalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -327,9 +417,15 @@ cdef class DoubleScalar(Scalar):
     Concrete class for double scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python float.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CDoubleScalar* sp = <CDoubleScalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
@@ -340,9 +436,15 @@ cdef class Decimal32Scalar(Scalar):
     Concrete class for decimal32 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python Decimal.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CDecimal32Scalar* sp = <CDecimal32Scalar*> self.wrapped.get()
@@ -360,9 +462,15 @@ cdef class Decimal64Scalar(Scalar):
     Concrete class for decimal64 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python Decimal.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CDecimal64Scalar* sp = <CDecimal64Scalar*> self.wrapped.get()
@@ -380,9 +488,15 @@ cdef class Decimal128Scalar(Scalar):
     Concrete class for decimal128 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python Decimal.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CDecimal128Scalar* sp = <CDecimal128Scalar*> self.wrapped.get()
@@ -400,9 +514,15 @@ cdef class Decimal256Scalar(Scalar):
     Concrete class for decimal256 scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python Decimal.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CDecimal256Scalar* sp = <CDecimal256Scalar*> self.wrapped.get()
@@ -425,9 +545,15 @@ cdef class Date32Scalar(Scalar):
         cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python datetime.datetime instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
 
@@ -450,9 +576,15 @@ cdef class Date64Scalar(Scalar):
         cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python datetime.datetime instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
 
@@ -504,9 +636,15 @@ cdef class Time32Scalar(Scalar):
         cdef CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python datetime.timedelta instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
@@ -528,9 +666,15 @@ cdef class Time64Scalar(Scalar):
         cdef CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python datetime.timedelta instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
@@ -552,11 +696,17 @@ cdef class TimestampScalar(Scalar):
         cdef CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Pandas Timestamp instance (if units are
         nanoseconds and pandas is available), otherwise as a Python
         datetime.datetime instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
@@ -600,11 +750,17 @@ cdef class DurationScalar(Scalar):
         cdef CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
         return sp.value if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Pandas Timedelta instance (if units are
         nanoseconds and pandas is available), otherwise as a Python
         datetime.timedelta instance.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
@@ -647,9 +803,15 @@ cdef class MonthDayNanoIntervalScalar(Scalar):
         """
         return self.as_py()
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a pyarrow.MonthDayNano.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         cdef:
             PyObject* val
@@ -672,9 +834,15 @@ cdef class BinaryScalar(Scalar):
         cdef CBaseBinaryScalar* sp = <CBaseBinaryScalar*> self.wrapped.get()
         return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python bytes.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         buffer = self.as_buffer()
         return None if buffer is None else buffer.to_pybytes()
@@ -693,9 +861,15 @@ cdef class StringScalar(BinaryScalar):
     Concrete class for string-like (utf8) scalars.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python string.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         buffer = self.as_buffer()
         return None if buffer is None else str(buffer, 'utf8')
@@ -744,12 +918,26 @@ cdef class ListScalar(Scalar):
         """
         return iter(self.values)
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python list.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
         arr = self.values
-        return None if arr is None else arr.to_pylist()
+        return None if arr is None else 
arr.to_pylist(maps_as_pydicts=maps_as_pydicts)
 
 
 cdef class FixedSizeListScalar(ListScalar):
@@ -824,13 +1012,27 @@ cdef class StructScalar(Scalar, collections.abc.Mapping):
             else:
                 raise KeyError(key) from exc
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this value as a Python dict.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
         if self.is_valid:
             try:
-                return {k: self[k].as_py() for k in self.keys()}
+                return {k: self[k].as_py(maps_as_pydicts=maps_as_pydicts) for 
k in self.keys()}
             except KeyError:
                 raise ValueError(
                     "Converting to Python dictionary is not supported when "
@@ -880,12 +1082,47 @@ cdef class MapScalar(ListScalar):
         for k, v in zip(arr.field(self.type.key_field.name), 
arr.field(self.type.item_field.name)):
             yield (k.as_py(), v.as_py())
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
-        Return this value as a Python list.
+        Return this value as a Python list or dict, depending on 
'maps_as_pydicts'.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
-        cdef CStructScalar* sp = <CStructScalar*> self.wrapped.get()
-        return list(self) if sp.is_valid else None
+        if maps_as_pydicts not in (None, "lossy", "strict"):
+            raise ValueError(
+                "Invalid value for 'maps_as_pydicts': "
+                + "valid values are 'lossy', 'strict' or `None` (default). "
+                + f"Received {maps_as_pydicts!r}."
+            )
+        if not self.is_valid:
+            return None
+        if not maps_as_pydicts:
+            return list(self)
+        result_dict = {}
+        for key, value in self:
+            if key in result_dict:
+                if maps_as_pydicts == "strict":
+                    raise KeyError(
+                        "Converting to Python dictionary is not supported in 
strict mode "
+                        f"when duplicate keys are present (duplicate key was 
'{key}')."
+                    )
+                else:
+                    warnings.warn(
+                        f"Encountered key '{key}' which was already 
encountered.")
+            result_dict[key] = value
+        return result_dict
 
 
 cdef class DictionaryScalar(Scalar):
@@ -958,11 +1195,25 @@ cdef class DictionaryScalar(Scalar):
         cdef CDictionaryScalar* sp = <CDictionaryScalar*> self.wrapped.get()
         return pyarrow_wrap_array(sp.value.dictionary)
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this encoded value as a Python object.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
-        return self.value.as_py() if self.is_valid else None
+        return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if 
self.is_valid else None
 
 
 cdef class RunEndEncodedScalar(Scalar):
@@ -977,11 +1228,25 @@ cdef class RunEndEncodedScalar(Scalar):
         cdef CRunEndEncodedScalar* sp = <CRunEndEncodedScalar*> 
self.wrapped.get()
         return Scalar.wrap(sp.value)
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return underlying value as a Python object.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
-        return self.value.as_py()
+        return self.value.as_py(maps_as_pydicts=maps_as_pydicts)
 
 
 cdef class UnionScalar(Scalar):
@@ -1003,12 +1268,26 @@ cdef class UnionScalar(Scalar):
             dp = <CDenseUnionScalar*> self.wrapped.get()
             return Scalar.wrap(dp.value) if dp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return underlying value as a Python object.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
         value = self.value
-        return None if value is None else value.as_py()
+        return None if value is None else 
value.as_py(maps_as_pydicts=maps_as_pydicts)
 
     @property
     def type_code(self):
@@ -1032,11 +1311,25 @@ cdef class ExtensionScalar(Scalar):
         cdef CExtensionScalar* sp = <CExtensionScalar*> self.wrapped.get()
         return Scalar.wrap(sp.value) if sp.is_valid else None
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this scalar as a Python object.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will 
be printed.
+            The last seen value of a duplicate key will be in the Python 
dictionary.
+            If 'strict', this instead results in an exception being raised 
when detected.
         """
-        return None if self.value is None else self.value.as_py()
+        return None if self.value is None else 
self.value.as_py(maps_as_pydicts=maps_as_pydicts)
 
     @staticmethod
     def from_storage(BaseExtensionType typ, value):
@@ -1093,7 +1386,16 @@ class UuidScalar(ExtensionScalar):
     Concrete class for Uuid extension scalar.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
+        """
+        Return this scalar as a Python UUID.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
+        """
         return None if self.value is None else UUID(bytes=self.value.as_py())
 
 
@@ -1150,9 +1452,15 @@ cdef class Bool8Scalar(ExtensionScalar):
     Concrete class for bool8 extension scalar.
     """
 
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         """
         Return this scalar as a Python object.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            This parameter is ignored for non-nested Scalars.
         """
         py_val = super().as_py()
         return None if py_val is None else py_val != 0
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index af241e4be0..5a6cd39048 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1349,10 +1349,24 @@ cdef class ChunkedArray(_PandasConvertible):
         for i in range(self.num_chunks):
             yield self.chunk(i)
 
-    def to_pylist(self):
+    def to_pylist(self, *, maps_as_pydicts=None):
         """
         Convert to a list of native Python objects.
 
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will be printed.
+            The last seen value of a duplicate key will be in the Python dictionary.
+            If 'strict', this instead results in an exception being raised when detected.
+
         Examples
         --------
         >>> import pyarrow as pa
@@ -1363,7 +1377,7 @@ cdef class ChunkedArray(_PandasConvertible):
         self._assert_cpu()
         result = []
         for i in range(self.num_chunks):
-            result += self.chunk(i).to_pylist()
+            result += self.chunk(i).to_pylist(maps_as_pydicts=maps_as_pydicts)
         return result
 
     def __arrow_c_stream__(self, requested_schema=None):
@@ -2255,10 +2269,24 @@ cdef class _Tabular(_PandasConvertible):
         else:
             return _pc().filter(self, mask, null_selection_behavior)
 
-    def to_pydict(self):
+    def to_pydict(self, *, maps_as_pydicts=None):
         """
         Convert the Table or RecordBatch to a dict or OrderedDict.
 
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will be printed.
+            The last seen value of a duplicate key will be in the Python dictionary.
+            If 'strict', this instead results in an exception being raised when detected.
+
         Returns
         -------
         dict
@@ -2277,14 +2305,28 @@ cdef class _Tabular(_PandasConvertible):
         entries = []
         for i in range(self.num_columns):
             name = self.field(i).name
-            column = self[i].to_pylist()
+            column = self[i].to_pylist(maps_as_pydicts=maps_as_pydicts)
             entries.append((name, column))
         return ordered_dict(entries)
 
-    def to_pylist(self):
+    def to_pylist(self, *, maps_as_pydicts=None):
         """
         Convert the Table or RecordBatch to a list of rows / dictionaries.
 
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will be printed.
+            The last seen value of a duplicate key will be in the Python dictionary.
+            If 'strict', this instead results in an exception being raised when detected.
+
         Returns
         -------
         list
@@ -2300,7 +2342,7 @@ cdef class _Tabular(_PandasConvertible):
         >>> table.to_pylist()
         [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ...
         """
-        pydict = self.to_pydict()
+        pydict = self.to_pydict(maps_as_pydicts=maps_as_pydicts)
         names = self.schema.names
         pylist = [{column: pydict[column][row] for column in names}
                   for row in range(self.num_rows)]
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 634d9ce2d8..185b5bb424 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -99,7 +99,7 @@ class IntegerEmbeddedType(pa.ExtensionType):
 
 
 class ExampleUuidScalarType(pa.ExtensionScalar):
-    def as_py(self):
+    def as_py(self, *, maps_as_pydicts=None):
         return None if self.value is None else UUID(bytes=self.value.as_py())
 
 
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 3f4a53c473..29db36eddc 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -786,6 +786,22 @@ def test_map(pickle_module):
     restored = pickle_module.loads(pickle_module.dumps(s))
     assert restored.equals(s)
 
+    assert s.as_py(maps_as_pydicts="strict") == {'a': 1, 'b': 2}
+
+
+def test_map_duplicate_fields():
+    ty = pa.map_(pa.string(), pa.int8())
+    v = [('a', 1), ('a', 2)]
+    s = pa.scalar(v, type=ty)
+
+    assert s.as_py(maps_as_pydicts=None) == v
+
+    with pytest.raises(KeyError):
+        assert s.as_py(maps_as_pydicts="strict")
+
+    with pytest.warns(match="Encountered key 'a' which was already encountered"):
+        assert s.as_py(maps_as_pydicts="lossy") == {'a': 2}
+
 
 def test_dictionary(pickle_module):
     indices = pa.array([2, None, 1, 2, 0, None])
@@ -898,3 +914,15 @@ def test_map_scalar_as_py_with_custom_field_name():
             pa.field("custom_value", pa.string()),
         ),
     ).as_py() == [("foo", "bar")]
+
+
+def test_nested_map_types_with_maps_as_pydicts():
+    ty = pa.struct([
+        pa.field('x', pa.map_(pa.string(), pa.int8())),
+        pa.field('y', pa.list_(pa.map_(pa.string(), pa.int8()))),
+    ])
+
+    v = {'x': {'a': 1}, 'y': [{'b': 2}, {'c': 3}]}
+    s = pa.scalar(v, type=ty)
+
+    assert s.as_py(maps_as_pydicts="strict") == v
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 4c058ccecd..180ae7b4c1 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -1888,6 +1888,26 @@ def test_table_unify_dictionaries():
     assert table.schema.metadata == {b"key1": b"value1"}
 
 
+def test_table_maps_as_pydicts():
+    arrays = [
+        pa.array(
+            [{'x': 1, 'y': 2}, {'z': 3}],
+            type=pa.map_(pa.string(), pa.int32())
+        )
+    ]
+    table = pa.Table.from_arrays(arrays, names=['a'])
+
+    table_dict = table.to_pydict(maps_as_pydicts="strict")
+    assert 'a' in table_dict
+    column_list = table_dict['a']
+    assert len(column_list) == 2
+    assert column_list == [{'x': 1, 'y': 2}, {'z': 3}]
+
+    table_list = table.to_pylist(maps_as_pydicts="strict")
+    assert len(table_list) == 2
+    assert table_list == [{'a': {'x': 1, 'y': 2}}, {'a': {'z': 3}}]
+
+
 def test_concat_tables():
     data = [
         list(range(5)),

(arrow) branch main updated: GH-39010: [Python] Introduce `maps_as_pydicts` parameter for `to_pylist`, `to_pydict`, `as_py` (#45471)

Reply via email to