This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a1047a  ARROW-8342: [Python] Continue to return dict from "metadata" 
properties accessing KeyValueMetadata
1a1047a is described below

commit 1a1047a0a4c69fcab08643e4e884c3fec7a0d76f
Author: Wes McKinney <[email protected]>
AuthorDate: Tue Apr 7 20:29:23 2020 -0500

    ARROW-8342: [Python] Continue to return dict from "metadata" properties 
accessing KeyValueMetadata
    
    This patch relegates the KeyValueMetadata wrapper to an implementation 
detail, so existing third-party code should be unaffected, and we can decide 
later when and how to expose this object publicly without needing to revert 
ARROW-8079.
    
    I also fixed the change related to the "pandas" metadata key. Relatedly, I 
changed the KeyValueMetadata constructor to raise a KeyError when the 
mixed-argument form (merging a prior object with some **kwargs) would create 
duplicate keys.
    
    Closes #6855 from wesm/ARROW-8342
    
    Authored-by: Wes McKinney <[email protected]>
    Signed-off-by: Wes McKinney <[email protected]>
---
 python/pyarrow/pandas_compat.py    |  6 ++--
 python/pyarrow/tests/test_types.py | 24 +++++++++++---
 python/pyarrow/types.pxi           | 64 +++++++++++++++++++++++++++++---------
 3 files changed, 72 insertions(+), 22 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 2aca355..55c38c9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -17,6 +17,7 @@
 
 
 import ast
+from copy import deepcopy
 from itertools import zip_longest
 import json
 import operator
@@ -234,7 +235,7 @@ def construct_metadata(df, column_names, index_levels, 
index_descriptors,
         index_descriptors = index_column_metadata = column_indexes = []
 
     return {
-        'pandas': json.dumps({
+        b'pandas': json.dumps({
             'index_columns': index_descriptors,
             'column_indexes': column_indexes,
             'columns': column_metadata + index_column_metadata,
@@ -590,7 +591,8 @@ def dataframe_to_arrays(df, schema, preserve_index, 
nthreads=1, columns=None,
     pandas_metadata = construct_metadata(df, column_names, index_columns,
                                          index_descriptors, preserve_index,
                                          types)
-    metadata = pa.KeyValueMetadata(schema.metadata or {}, **pandas_metadata)
+    metadata = deepcopy(schema.metadata) if schema.metadata else dict()
+    metadata.update(pandas_metadata)
     schema = schema.with_metadata(metadata)
 
     return arrays, schema
diff --git a/python/pyarrow/tests/test_types.py 
b/python/pyarrow/tests/test_types.py
index 7969dd8..2588f6c 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -622,16 +622,15 @@ def test_key_value_metadata():
         ('b', 'beta'),
         ('a', 'Alpha'),
         ('a', 'ALPHA'),
-    ], b='BETA')
+    ])
 
     expected = [
         (b'a', b'alpha'),
         (b'b', b'beta'),
         (b'a', b'Alpha'),
-        (b'a', b'ALPHA'),
-        (b'b', b'BETA')
+        (b'a', b'ALPHA')
     ]
-    assert len(md) == 5
+    assert len(md) == 4
     assert isinstance(md.keys(), Iterator)
     assert isinstance(md.values(), Iterator)
     assert isinstance(md.items(), Iterator)
@@ -643,9 +642,24 @@ def test_key_value_metadata():
     assert md['a'] == b'alpha'
     assert md['b'] == b'beta'
     assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
-    assert md.get_all('b') == [b'beta', b'BETA']
+    assert md.get_all('b') == [b'beta']
     assert md.get_all('unkown') == []
 
+    with pytest.raises(KeyError):
+        md = pa.KeyValueMetadata([
+            ('a', 'alpha'),
+            ('b', 'beta'),
+            ('a', 'Alpha'),
+            ('a', 'ALPHA'),
+        ], b='BETA')
+
+
+def test_key_value_metadata_duplicates():
+    meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})
+
+    with pytest.raises(KeyError):
+        pa.KeyValueMetadata(meta, a='3')
+
 
 def test_field_basic():
     t = pa.string()
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index eab5a84..d5a5b73 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -824,28 +824,32 @@ def unregister_extension_type(type_name):
 
 cdef class KeyValueMetadata(_Metadata, Mapping):
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, __arg0__=None, **kwargs):
         cdef:
             vector[c_string] keys, values
-            shared_ptr[const CKeyValueMetadata] meta
+            shared_ptr[const CKeyValueMetadata] result
 
         items = []
-        if args:
-            if len(args) > 1:
-                raise TypeError('expected at most 1 positional argument, '
-                                'got {}'.format(len(args)))
-            other = args[0]
-            items += other.items() if isinstance(other, Mapping) else other
-
-        items += kwargs.items()
+        if __arg0__ is not None:
+            other = (__arg0__.items() if isinstance(__arg0__, Mapping)
+                     else __arg0__)
+            items.extend((tobytes(k), v) for k, v in other)
+
+        prior_keys = {k for k, v in items}
+        for k, v in kwargs.items():
+            k = tobytes(k)
+            if k in prior_keys:
+                raise KeyError("Duplicate key {}, "
+                               "use pass all items as list of tuples if you "
+                               "intend to have duplicate keys")
+            items.append((k, v))
 
         keys.reserve(len(items))
         for key, value in items:
             keys.push_back(tobytes(key))
             values.push_back(tobytes(value))
-
-        meta.reset(new CKeyValueMetadata(keys, values))
-        self.init(meta)
+        result.reset(new CKeyValueMetadata(move(keys), move(values)))
+        self.init(result)
 
     cdef void init(self, const shared_ptr[const CKeyValueMetadata]& wrapped):
         self.wrapped = wrapped
@@ -863,6 +867,9 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
     def equals(self, KeyValueMetadata other):
         return self.metadata.Equals(deref(other.wrapped))
 
+    def __repr__(self):
+        return str(self)
+
     def __str__(self):
         return frombytes(self.metadata.ToString())
 
@@ -896,6 +903,12 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
     def __reduce__(self):
         return KeyValueMetadata, (list(self.items()),)
 
+    def key(self, i):
+        return self.metadata.key(i)
+
+    def value(self, i):
+        return self.metadata.value(i)
+
     def keys(self):
         for i in range(self.metadata.size()):
             yield self.metadata.key(i)
@@ -912,6 +925,19 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
         key = tobytes(key)
         return [v for k, v in self.items() if k == key]
 
+    def to_dict(self):
+        """
+        Convert KeyValueMetadata to dict. If a key occurs twice, the value for
+        the first one is returned
+        """
+        cdef object key  # to force coercion to Python
+        result = ordered_dict()
+        for i in range(self.metadata.size()):
+            key = self.metadata.key(i)
+            if key not in result:
+                result[key] = self.metadata.value(i)
+        return result
+
 
 cdef KeyValueMetadata ensure_metadata(object meta, c_bool allow_none=False):
     if allow_none and meta is None:
@@ -986,7 +1012,11 @@ cdef class Field:
 
     @property
     def metadata(self):
-        return pyarrow_wrap_metadata(self.field.metadata())
+        wrapped = pyarrow_wrap_metadata(self.field.metadata())
+        if wrapped is not None:
+            return wrapped.to_dict()
+        else:
+            return wrapped
 
     def add_metadata(self, metadata):
         warnings.warn("The 'add_metadata' method is deprecated, use "
@@ -1190,7 +1220,11 @@ cdef class Schema:
 
     @property
     def metadata(self):
-        return pyarrow_wrap_metadata(self.schema.metadata())
+        wrapped = pyarrow_wrap_metadata(self.schema.metadata())
+        if wrapped is not None:
+            return wrapped.to_dict()
+        else:
+            return wrapped
 
     def __eq__(self, other):
         try:

Reply via email to