jorisvandenbossche commented on a change in pull request #11008:
URL: https://github.com/apache/arrow/pull/11008#discussion_r712999895



##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -3231,6 +3232,75 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"], partitioning_flavor="hive")
+
+    load_back = ds.dataset(tempdir, partitioning="hive")
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_scanner(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z'],
+                      'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset.scanner(columns=["b", "c"]), tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == table.drop(["a"]).to_pydict()
+
+
+def test_write_dataset_with_dataset(tempdir):
+    table = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset, tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == table.to_pydict()

Review comment:
       That's actually expected behaviour: the partitioning columns are 
currently always appended at the end of the schema. Since the original table 
schema had columns [b, c] and you are partitioning on b, it is expected to 
read back as [c, b]. Because you initially tested against a plain Python dict 
(`assert load_back_table.to_pydict() == {'b': ..., 'c': ...}`), which ignores 
key order when evaluating equality, this passed.
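
To illustrate, a minimal sketch of the difference (not from the PR itself; the tables below just simulate the round trip in memory rather than writing to disk, and it assumes a pyarrow version where `Table.select` is available):

```python
import pyarrow as pa

# Simulated round trip: partitioning on 'b' appends it after the file
# columns, so a table written as [b, c] reads back as [c, b].
original = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})
round_tripped = pa.table({'c': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Comparing to_pydict() against a plain dict ignores key order,
# so this passes despite the different column order.
assert round_tripped.to_pydict() == {'b': ['x', 'y', 'z'], 'c': [1, 2, 3]}

# Table.equals compares schemas including column order, so it fails here.
assert not round_tripped.equals(original)

# Reselecting the columns first gives an order-insensitive content check.
assert round_tripped.select(original.column_names).equals(original)
```

The last assertion shows one way to compare table contents independent of column order: reorder the columns with `select` before calling `equals`.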




