This is an automated email from the ASF dual-hosted git repository.
fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 515197d ARROW-8802: [C++][Dataset] Preserve dataset schema's metadata
on column projection
515197d is described below
commit 515197dfe6e83d6fa6fe82bfec134f41b222b748
Author: François Saint-Jacques <[email protected]>
AuthorDate: Thu Jun 18 11:31:59 2020 -0400
ARROW-8802: [C++][Dataset] Preserve dataset schema's metadata on column
projection
Scanner does not preserve the original schema metadata when columns are
projected.
Closes #7474 from fsaintjacques/ARROW-8802-dataset-schema-metadata
Lead-authored-by: François Saint-Jacques <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: François Saint-Jacques <[email protected]>
---
cpp/src/arrow/dataset/dataset_internal.h | 2 +-
python/pyarrow/tests/test_dataset.py | 18 ++++++++++++++++++
python/pyarrow/tests/test_extension_type.py | 18 +++++++++---------
3 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arrow/dataset/dataset_internal.h
b/cpp/src/arrow/dataset/dataset_internal.h
index b1d3dcf..40ffab5 100644
--- a/cpp/src/arrow/dataset/dataset_internal.h
+++ b/cpp/src/arrow/dataset/dataset_internal.h
@@ -65,7 +65,7 @@ inline std::shared_ptr<Schema> SchemaFromColumnNames(
}
}
- return schema(std::move(columns));
+ return schema(std::move(columns))->WithMetadata(input->metadata());
}
} // namespace dataset
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index 9fbde57..4bf4192 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1587,3 +1587,21 @@ def test_parquet_dataset_factory_partitioned(tempdir):
result = result.to_pandas().sort_values("f1").reset_index(drop=True)
expected = table.to_pandas()
pd.testing.assert_frame_equal(result, expected)
+
+
[email protected]
[email protected]
+def test_dataset_schema_metadata(tempdir):
+ # ARROW-8802
+ df = pd.DataFrame({'a': [1, 2, 3]})
+ path = tempdir / "test.parquet"
+ df.to_parquet(path)
+ dataset = ds.dataset(path)
+
+ schema = dataset.to_table().schema
+ projected_schema = dataset.to_table(columns=["a"]).schema
+
+ # ensure the pandas metadata is included in the schema
+ assert b"pandas" in schema.metadata
+ # ensure it is still there in a projected schema (with column selection)
+ assert schema.equals(projected_schema, check_metadata=True)
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index fbd8c0b..dafa4f0 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -278,7 +278,7 @@ class PeriodType(pa.ExtensionType):
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
- pa.ExtensionType.__init__(self, pa.int64(), 'pandas.period')
+ pa.ExtensionType.__init__(self, pa.int64(), 'test.period')
@property
def freq(self):
@@ -325,14 +325,14 @@ def registered_period_type(request):
yield period_type, period_class
# teardown
try:
- pa.unregister_extension_type('pandas.period')
+ pa.unregister_extension_type('test.period')
except KeyError:
pass
def test_generic_ext_type():
period_type = PeriodType('D')
- assert period_type.extension_name == "pandas.period"
+ assert period_type.extension_name == "test.period"
assert period_type.storage_type == pa.int64()
# default ext_class expected.
assert period_type.__arrow_ext_class__() == pa.ExtensionArray
@@ -353,7 +353,7 @@ def test_generic_ext_type_ipc(registered_period_type):
result = batch.column(0)
# check the deserialized array class is the expected one
assert type(result) == period_class
- assert result.type.extension_name == "pandas.period"
+ assert result.type.extension_name == "test.period"
assert arr.storage.to_pylist() == [1, 2, 3, 4]
# we get back an actual PeriodType
@@ -363,7 +363,7 @@ def test_generic_ext_type_ipc(registered_period_type):
# using different parametrization as how it was registered
period_type_H = period_type.__class__('H')
- assert period_type_H.extension_name == "pandas.period"
+ assert period_type_H.extension_name == "test.period"
assert period_type_H.freq == 'H'
arr = pa.ExtensionArray.from_storage(period_type_H, storage)
@@ -389,7 +389,7 @@ def
test_generic_ext_type_ipc_unknown(registered_period_type):
# unregister type before loading again => reading unknown extension type
# as plain array (but metadata in schema's field are preserved)
- pa.unregister_extension_type('pandas.period')
+ pa.unregister_extension_type('test.period')
batch = ipc_read_batch(buf)
result = batch.column(0)
@@ -398,13 +398,13 @@ def
test_generic_ext_type_ipc_unknown(registered_period_type):
ext_field = batch.schema.field('ext')
assert ext_field.metadata == {
b'ARROW:extension:metadata': b'freq=D',
- b'ARROW:extension:name': b'pandas.period'
+ b'ARROW:extension:name': b'test.period'
}
def test_generic_ext_type_equality():
period_type = PeriodType('D')
- assert period_type.extension_name == "pandas.period"
+ assert period_type.extension_name == "test.period"
period_type2 = PeriodType('D')
period_type3 = PeriodType('H')
@@ -464,7 +464,7 @@ def test_parquet(tmpdir, registered_period_type):
# The extension metadata is present for roundtripping.
assert result.schema.field("ext").metadata == {
b'ARROW:extension:metadata': b'freq=D',
- b'ARROW:extension:name': b'pandas.period',
+ b'ARROW:extension:name': b'test.period',
b'PARQUET:field_id': b'1',
}