This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a1047a ARROW-8342: [Python] Continue to return dict from "metadata"
properties accessing KeyValueMetadata
1a1047a is described below
commit 1a1047a0a4c69fcab08643e4e884c3fec7a0d76f
Author: Wes McKinney <[email protected]>
AuthorDate: Tue Apr 7 20:29:23 2020 -0500
ARROW-8342: [Python] Continue to return dict from "metadata" properties
accessing KeyValueMetadata
This patch relegates the KeyValueMetadata wrapper to an implementation
detail, so existing third party code should be unaffected, and we can decide
later when and how to expose this object publicly in the future without needing
to revert the ARROW-8079.
I also fixed the change related to the "pandas" metadata key. Relatedly, I
changed the KeyValueMetadata ctor to raise a KeyError if a use of the
mixed-argument constructor (merging a prior object with some **kwargs) would
create duplicate keys
Closes #6855 from wesm/ARROW-8342
Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
python/pyarrow/pandas_compat.py | 6 ++--
python/pyarrow/tests/test_types.py | 24 +++++++++++---
python/pyarrow/types.pxi | 64 +++++++++++++++++++++++++++++---------
3 files changed, 72 insertions(+), 22 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 2aca355..55c38c9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -17,6 +17,7 @@
import ast
+from copy import deepcopy
from itertools import zip_longest
import json
import operator
@@ -234,7 +235,7 @@ def construct_metadata(df, column_names, index_levels,
index_descriptors,
index_descriptors = index_column_metadata = column_indexes = []
return {
- 'pandas': json.dumps({
+ b'pandas': json.dumps({
'index_columns': index_descriptors,
'column_indexes': column_indexes,
'columns': column_metadata + index_column_metadata,
@@ -590,7 +591,8 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1, columns=None,
pandas_metadata = construct_metadata(df, column_names, index_columns,
index_descriptors, preserve_index,
types)
- metadata = pa.KeyValueMetadata(schema.metadata or {}, **pandas_metadata)
+ metadata = deepcopy(schema.metadata) if schema.metadata else dict()
+ metadata.update(pandas_metadata)
schema = schema.with_metadata(metadata)
return arrays, schema
diff --git a/python/pyarrow/tests/test_types.py
b/python/pyarrow/tests/test_types.py
index 7969dd8..2588f6c 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -622,16 +622,15 @@ def test_key_value_metadata():
('b', 'beta'),
('a', 'Alpha'),
('a', 'ALPHA'),
- ], b='BETA')
+ ])
expected = [
(b'a', b'alpha'),
(b'b', b'beta'),
(b'a', b'Alpha'),
- (b'a', b'ALPHA'),
- (b'b', b'BETA')
+ (b'a', b'ALPHA')
]
- assert len(md) == 5
+ assert len(md) == 4
assert isinstance(md.keys(), Iterator)
assert isinstance(md.values(), Iterator)
assert isinstance(md.items(), Iterator)
@@ -643,9 +642,24 @@ def test_key_value_metadata():
assert md['a'] == b'alpha'
assert md['b'] == b'beta'
assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
- assert md.get_all('b') == [b'beta', b'BETA']
+ assert md.get_all('b') == [b'beta']
assert md.get_all('unkown') == []
+ with pytest.raises(KeyError):
+ md = pa.KeyValueMetadata([
+ ('a', 'alpha'),
+ ('b', 'beta'),
+ ('a', 'Alpha'),
+ ('a', 'ALPHA'),
+ ], b='BETA')
+
+
+def test_key_value_metadata_duplicates():
+ meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})
+
+ with pytest.raises(KeyError):
+ pa.KeyValueMetadata(meta, a='3')
+
def test_field_basic():
t = pa.string()
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index eab5a84..d5a5b73 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -824,28 +824,32 @@ def unregister_extension_type(type_name):
cdef class KeyValueMetadata(_Metadata, Mapping):
- def __init__(self, *args, **kwargs):
+ def __init__(self, __arg0__=None, **kwargs):
cdef:
vector[c_string] keys, values
- shared_ptr[const CKeyValueMetadata] meta
+ shared_ptr[const CKeyValueMetadata] result
items = []
- if args:
- if len(args) > 1:
- raise TypeError('expected at most 1 positional argument, '
- 'got {}'.format(len(args)))
- other = args[0]
- items += other.items() if isinstance(other, Mapping) else other
-
- items += kwargs.items()
+ if __arg0__ is not None:
+ other = (__arg0__.items() if isinstance(__arg0__, Mapping)
+ else __arg0__)
+ items.extend((tobytes(k), v) for k, v in other)
+
+ prior_keys = {k for k, v in items}
+ for k, v in kwargs.items():
+ k = tobytes(k)
+ if k in prior_keys:
+ raise KeyError("Duplicate key {}, "
+ "use pass all items as list of tuples if you "
+ "intend to have duplicate keys")
+ items.append((k, v))
keys.reserve(len(items))
for key, value in items:
keys.push_back(tobytes(key))
values.push_back(tobytes(value))
-
- meta.reset(new CKeyValueMetadata(keys, values))
- self.init(meta)
+ result.reset(new CKeyValueMetadata(move(keys), move(values)))
+ self.init(result)
cdef void init(self, const shared_ptr[const CKeyValueMetadata]& wrapped):
self.wrapped = wrapped
@@ -863,6 +867,9 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
def equals(self, KeyValueMetadata other):
return self.metadata.Equals(deref(other.wrapped))
+ def __repr__(self):
+ return str(self)
+
def __str__(self):
return frombytes(self.metadata.ToString())
@@ -896,6 +903,12 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
def __reduce__(self):
return KeyValueMetadata, (list(self.items()),)
+ def key(self, i):
+ return self.metadata.key(i)
+
+ def value(self, i):
+ return self.metadata.value(i)
+
def keys(self):
for i in range(self.metadata.size()):
yield self.metadata.key(i)
@@ -912,6 +925,19 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
key = tobytes(key)
return [v for k, v in self.items() if k == key]
+ def to_dict(self):
+ """
+ Convert KeyValueMetadata to dict. If a key occurs twice, the value for
+ the first one is returned
+ """
+ cdef object key # to force coercion to Python
+ result = ordered_dict()
+ for i in range(self.metadata.size()):
+ key = self.metadata.key(i)
+ if key not in result:
+ result[key] = self.metadata.value(i)
+ return result
+
cdef KeyValueMetadata ensure_metadata(object meta, c_bool allow_none=False):
if allow_none and meta is None:
@@ -986,7 +1012,11 @@ cdef class Field:
@property
def metadata(self):
- return pyarrow_wrap_metadata(self.field.metadata())
+ wrapped = pyarrow_wrap_metadata(self.field.metadata())
+ if wrapped is not None:
+ return wrapped.to_dict()
+ else:
+ return wrapped
def add_metadata(self, metadata):
warnings.warn("The 'add_metadata' method is deprecated, use "
@@ -1190,7 +1220,11 @@ cdef class Schema:
@property
def metadata(self):
- return pyarrow_wrap_metadata(self.schema.metadata())
+ wrapped = pyarrow_wrap_metadata(self.schema.metadata())
+ if wrapped is not None:
+ return wrapped.to_dict()
+ else:
+ return wrapped
def __eq__(self, other):
try: