amol- commented on a change in pull request #11008:
URL: https://github.com/apache/arrow/pull/11008#discussion_r712966902



##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -3231,6 +3232,75 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"], partitioning_flavor="hive")
+
+    load_back = ds.dataset(tempdir, partitioning="hive")
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_scanner(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z'],
+                      'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset.scanner(columns=["b", "c"]), tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == table.drop(["a"]).to_pydict()
+
+
+def test_write_dataset_with_dataset(tempdir):
+    table = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset, tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == table.to_pydict()

Review comment:
       It seems the issue is a bit more complex than that. It also fails using `load_back_table.equals(table)`.
   
   The order of the columns is different when the dataset is loaded back: the partitioning column might not be in the same position it had when the dataset was saved. I guess this is something we want to at least document, because I don't think there is an easy way to solve it without saving some extra metadata, or saving the partitioning column into the data files too, thus duplicating it (which would also solve another issue discussed on Zulip -> https://ursalabs.zulipchat.com/#narrow/stream/180245-dev/topic/Possible.20bug.20in.20dataset/near/251386307 )
   
   Meanwhile I'll ship a workaround for the test.
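   
   For reference, a minimal sketch of the kind of workaround I mean (hypothetical, not the final test code): select the loaded table's columns by name, in the original schema order, before comparing, so the assertion no longer depends on where the partitioning column ends up.
   
```python
import tempfile

import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})

with tempfile.TemporaryDirectory() as tmp:
    ds.write_dataset(table, tmp, format='parquet', partitioning=["b"])
    load_back_table = ds.dataset(tmp, partitioning=["b"]).to_table()

    # The partitioning column "b" may come back in a different position
    # than it had in the original schema, so reorder the columns by name
    # before comparing.
    assert load_back_table.select(table.column_names).equals(table)
```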
