jorisvandenbossche commented on a change in pull request #10628:
URL: https://github.com/apache/arrow/pull/10628#discussion_r665280601
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -2672,47 +2672,56 @@ def test_feather_format(tempdir, dataset_reader):
dataset_reader.to_table(ds.dataset(basedir, format="feather"))
-def _create_parquet_dataset_simple(root_path):
+def _create_parquet_dataset_simple(root_path, use_legacy_dataset):
import pyarrow.parquet as pq
metadata_collector = []
- for i in range(4):
- table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)})
- pq.write_to_dataset(
- table, str(root_path), metadata_collector=metadata_collector
- )
+ f1_vals = [item for chunk in range(4) for item in [chunk] * 10]
+
+ table = pa.table({'f1': f1_vals, 'f2': np.random.randn(40)})
+ pq.write_to_dataset(
+ table, str(root_path), partition_cols=['f1'],
+ use_legacy_dataset=use_legacy_dataset,
+ metadata_collector=metadata_collector
+ )
+
+ partitionless_schema = pa.schema([pa.field('f2', pa.float64())])
metadata_path = str(root_path / '_metadata')
# write _metadata file
pq.write_metadata(
- table.schema, metadata_path,
+ partitionless_schema, metadata_path,
metadata_collector=metadata_collector
)
- return metadata_path, table
+ return metadata_path, partitionless_schema
@pytest.mark.parquet
@pytest.mark.pandas # write_to_dataset currently requires pandas
-def test_parquet_dataset_factory(tempdir):
+@pytest.mark.parametrize('use_legacy_dataset', [False, True])
+def test_parquet_dataset_factory(tempdir, use_legacy_dataset):
root_path = tempdir / "test_parquet_dataset"
- metadata_path, table = _create_parquet_dataset_simple(root_path)
+ metadata_path, partitionless_schema = _create_parquet_dataset_simple(
+ root_path, use_legacy_dataset)
dataset = ds.parquet_dataset(metadata_path)
- assert dataset.schema.equals(table.schema)
+ assert dataset.schema.equals(partitionless_schema)
Review comment:
However, I still don't fully understand which change in this PR causes this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org