[
https://issues.apache.org/jira/browse/ARROW-9288?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Joris Van den Bossche updated ARROW-9288:
-----------------------------------------
Description:
Found while testing the new feature from ARROW-8647. A Python test that reproduces it:
{code:python}
@pytest.mark.parquet
@pytest.mark.parametrize('partitioning', ["directory", "hive"])
def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning):
    """Open a partitioned dataset whose partition field is dictionary-typed.

    The same table is written as ``test.parquet`` into three partition
    directories under ``tempdir / "dataset"``: ``A``/``B``/``C`` for the
    "directory" scheme, ``part=A``/``part=B``/``part=C`` for the "hive"
    scheme. The partitioning is then discovered with
    ``max_partition_dictionary_size=-1`` and the resulting dataset schema
    must equal the table schema plus a ``part`` field of type
    ``dictionary(int32, string)``.
    """
    import pyarrow.parquet as pq

    table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5})

    root = tempdir / "dataset"
    root.mkdir()

    # The directory naming scheme depends only on the parametrized value,
    # so it is decided once rather than on every loop iteration.
    use_directory = partitioning == "directory"
    dirname_template = "{}" if use_directory else "part={}"
    for key in ("A", "B", "C"):
        part_dir = root / dirname_template.format(key)
        part_dir.mkdir()
        pq.write_table(table, part_dir / "test.parquet")

    if use_directory:
        # Directory partitioning carries no field names in the paths, so
        # the field name is passed explicitly.
        factory = ds.DirectoryPartitioning.discover(
            ["part"], max_partition_dictionary_size=-1)
    else:
        factory = ds.HivePartitioning.discover(
            max_partition_dictionary_size=-1)

    dataset = ds.dataset(str(root), partitioning=factory)

    part_field = pa.field("part", pa.dictionary(pa.int32(), pa.string()))
    expected_schema = table.schema.append(part_field)
    assert dataset.schema.equals(expected_schema)
{code}
This test fails (segfaults) for HivePartitioning, but works for
DirectoryPartitioning.
was:
Python test that reproduces it:
{code:python}
@pytest.mark.parquet
@pytest.mark.parametrize('partitioning', ["directory", "hive"])
def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning):
    """Open a partitioned dataset whose partition field is dictionary-typed.

    The same table is written as ``test.parquet`` into three partition
    directories under ``tempdir / "dataset"``: ``A``/``B``/``C`` for the
    "directory" scheme, ``part=A``/``part=B``/``part=C`` for the "hive"
    scheme. The partitioning is then discovered with
    ``max_partition_dictionary_size=-1`` and the resulting dataset schema
    must equal the table schema plus a ``part`` field of type
    ``dictionary(int32, string)``.
    """
    import pyarrow.parquet as pq

    table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5})

    root = tempdir / "dataset"
    root.mkdir()

    # The directory naming scheme depends only on the parametrized value,
    # so it is decided once rather than on every loop iteration.
    use_directory = partitioning == "directory"
    dirname_template = "{}" if use_directory else "part={}"
    for key in ("A", "B", "C"):
        part_dir = root / dirname_template.format(key)
        part_dir.mkdir()
        pq.write_table(table, part_dir / "test.parquet")

    if use_directory:
        # Directory partitioning carries no field names in the paths, so
        # the field name is passed explicitly.
        factory = ds.DirectoryPartitioning.discover(
            ["part"], max_partition_dictionary_size=-1)
    else:
        factory = ds.HivePartitioning.discover(
            max_partition_dictionary_size=-1)

    dataset = ds.dataset(str(root), partitioning=factory)

    part_field = pa.field("part", pa.dictionary(pa.int32(), pa.string()))
    expected_schema = table.schema.append(part_field)
    assert dataset.schema.equals(expected_schema)
{code}
This test fails (segfaults) for HivePartitioning, but works for
DirectoryPartitioning
> [C++][Dataset] Discovery of partition field as dictionary type segfaulting
> with HivePartitioning
> ------------------------------------------------------------------------------------------------
>
> Key: ARROW-9288
> URL: https://issues.apache.org/jira/browse/ARROW-9288
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++
> Reporter: Joris Van den Bossche
> Priority: Major
> Labels: dataset
> Fix For: 1.0.0
>
>
> Found while testing the new feature from ARROW-8647. A Python test that reproduces it:
> {code:python}
> @pytest.mark.parquet
> @pytest.mark.parametrize('partitioning', ["directory", "hive"])
> def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning):
> import pyarrow.parquet as pq
> table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5})
> path = tempdir / "dataset"
> path.mkdir()
> for part in ["A", "B", "C"]:
> fmt = "{}" if partitioning == "directory" else "part={}"
> part = path / fmt.format(part)
> part.mkdir()
> pq.write_table(table, part / "test.parquet")
> if partitioning == "directory":
> part = ds.DirectoryPartitioning.discover(["part"],
> max_partition_dictionary_size=-1)
> else:
> part = ds.HivePartitioning.discover(max_partition_dictionary_size=-1)
>
> dataset = ds.dataset(str(path), partitioning=part)
> expected_schema = table.schema.append(
> pa.field("part", pa.dictionary(pa.int32(), pa.string()))
> )
> assert dataset.schema.equals(expected_schema)
> {code}
> This test fails (segfaults) for HivePartitioning, but works for
> DirectoryPartitioning.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)