[
https://issues.apache.org/jira/browse/ARROW-8057?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Antoine Pitrou updated ARROW-8057:
----------------------------------
Component/s: C++
> [C++] Schema equality not roundtrip safe through Parquet
> --------------------------------------------------------
>
> Key: ARROW-8057
> URL: https://issues.apache.org/jira/browse/ARROW-8057
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++, Python
> Reporter: Florian Jetter
> Priority: Major
>
> When performing schema roundtrips, the equality check for fields breaks. This
> is a regression from PyArrow 0.16.0.
> The equality check for entire schemas has never worked (but should, from my
> POV).
> {code:python}
> import pyarrow.parquet as pq
> import pyarrow as pa
> print(pa.__version__)
> fields = [
> pa.field("bool", pa.bool_()),
> pa.field("byte", pa.binary()),
> pa.field("date", pa.date32()),
> pa.field("datetime64", pa.timestamp("us")),
> pa.field("float32", pa.float64()),
> pa.field("float64", pa.float64()),
> pa.field("int16", pa.int64()),
> pa.field("int32", pa.int64()),
> pa.field("int64", pa.int64()),
> pa.field("int8", pa.int64()),
> pa.field("null", pa.null()),
> pa.field("uint16", pa.uint64()),
> pa.field("uint32", pa.uint64()),
> pa.field("uint64", pa.uint64()),
> pa.field("uint8", pa.uint64()),
> pa.field("unicode", pa.string()),
> pa.field("array_float32", pa.list_(pa.float64())),
> pa.field("array_float64", pa.list_(pa.float64())),
> pa.field("array_int16", pa.list_(pa.int64())),
> pa.field("array_int32", pa.list_(pa.int64())),
> pa.field("array_int64", pa.list_(pa.int64())),
> pa.field("array_int8", pa.list_(pa.int64())),
> pa.field("array_uint16", pa.list_(pa.uint64())),
> pa.field("array_uint32", pa.list_(pa.uint64())),
> pa.field("array_uint64", pa.list_(pa.uint64())),
> pa.field("array_uint8", pa.list_(pa.uint64())),
> pa.field("array_unicode", pa.list_(pa.string())),
> ]
> schema = pa.schema(fields)
> buf = pa.BufferOutputStream()
> pq.write_metadata(schema, buf)
> reader = pa.BufferReader(buf.getvalue().to_pybytes())
> reconstructed_schema = pq.read_schema(reader)
> assert reconstructed_schema == reconstructed_schema
> assert reconstructed_schema[0] == reconstructed_schema[0]
> # This breaks on master / regression from 0.16.0
> assert schema[0] == reconstructed_schema[0]
> # This never worked but should
> assert reconstructed_schema == schema
> assert schema == reconstructed_schema
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)