amol- commented on a change in pull request #11008:
URL: https://github.com/apache/arrow/pull/11008#discussion_r712966902
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -3231,6 +3232,75 @@ def test_write_dataset_partitioned(tempdir):
partitioning=partitioning_schema)
+def test_write_dataset_with_field_names(tempdir):
+ table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+ ds.write_dataset(table, tempdir, format='parquet',
+ partitioning=["b"])
+
+ load_back = ds.dataset(tempdir, partitioning=["b"])
+ files = load_back.files
+ partitioning_dirs = {
+ str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+ }
+ assert partitioning_dirs == {"x", "y", "z"}
+
+ load_back_table = load_back.to_table()
+ assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+ table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+ ds.write_dataset(table, tempdir, format='parquet',
+ partitioning=["b"], partitioning_flavor="hive")
+
+ load_back = ds.dataset(tempdir, partitioning="hive")
+ files = load_back.files
+ partitioning_dirs = {
+ str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+ }
+ assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+ load_back_table = load_back.to_table()
+ assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_scanner(tempdir):
+ table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z'],
+ 'c': [1, 2, 3]})
+
+ ds.write_dataset(table, tempdir, format='parquet',
+ partitioning=["b"])
+
+ dataset = ds.dataset(tempdir, partitioning=["b"])
+
+ with tempfile.TemporaryDirectory() as tempdir2:
+ ds.write_dataset(dataset.scanner(columns=["b", "c"]), tempdir2,
+ format='parquet', partitioning=["b"])
+
+ load_back = ds.dataset(tempdir2, partitioning=["b"])
+ load_back_table = load_back.to_table()
+ assert load_back_table.to_pydict() == table.drop(["a"]).to_pydict()
+
+
+def test_write_dataset_with_dataset(tempdir):
+ table = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})
+
+ ds.write_dataset(table, tempdir, format='parquet',
+ partitioning=["b"])
+
+ dataset = ds.dataset(tempdir, partitioning=["b"])
+
+ with tempfile.TemporaryDirectory() as tempdir2:
+ ds.write_dataset(dataset, tempdir2,
+ format='parquet', partitioning=["b"])
+
+ load_back = ds.dataset(tempdir2, partitioning=["b"])
+ load_back_table = load_back.to_table()
+ assert load_back_table.to_pydict() == table.to_pydict()
Review comment:
It seems the issue is a bit more complex than that: the test also fails with
`load_back_table.equals(table)`, because the columns come back in a different
order when the dataset is loaded. This is probably something we want at least
to document (the partitioning column might not be in the same position it had
when the dataset was saved), because I don't think there is an easy way to
solve it without saving some extra metadata, or writing the partitioning
column into the data files too, thus duplicating it (which would also solve
another issue discussed on Zulip ->
https://ursalabs.zulipchat.com/#narrow/stream/180245-dev/topic/Possible.20bug.20in.20dataset/near/251386307).
Meanwhile I'll ship a workaround for the test.
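For reference, a minimal sketch of what such a workaround can look like, using
the `table` and `load_back` variables from the tests above (this is a sketch,
not necessarily the exact change that will land in this PR). Comparing via
`to_pydict()`, as the scanner/dataset tests in this diff already do, sidesteps
column order because Python dict equality ignores key order; alternatively,
`Table.select()` can reorder the loaded columns explicitly, assuming the
partition column is read back with the same type it had originally:

```python
load_back_table = load_back.to_table()

# Dict equality ignores key order, so this tolerates the partitioning
# column coming back in a different position:
assert load_back_table.to_pydict() == table.to_pydict()

# Or reorder explicitly: Table.select() returns the named columns in the
# requested order, so equals() then compares like-for-like schemas.
assert load_back_table.select(table.column_names).equals(table)
```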
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]