jorisvandenbossche commented on a change in pull request #11008:
URL: https://github.com/apache/arrow/pull/11008#discussion_r709921217



##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -3231,6 +3224,85 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
+def test_dictionary_partitioning_outer_nulls_raises(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+    part = ds.partitioning(
+        pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]))
+    with pytest.raises(pa.ArrowInvalid):
+        ds.write_dataset(table, tempdir, format='parquet', partitioning=part)
+
+
+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"], partitioning_flavor="hive")
+
+    load_back = ds.dataset(tempdir, partitioning="hive")
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_scanner(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z'],
+                      'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset.scanner(columns=["b", "c"]), tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == {'b': ['x', 'y', 'z'],
+                                               'c': [1, 2, 3]}

Review comment:
```suggestion
        assert load_back_table.equals(table)
```
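
       Side note on this suggestion: `Table.equals` compares schema and values together, while comparing `to_pydict()` results checks values only. A minimal sketch of the difference, separate from the PR code and assuming nothing beyond pyarrow itself:

```python
import pyarrow as pa

t1 = pa.table({'b': ['x', 'y'], 'c': [1, 2]})
# Same values, but cast 'c' from the default int64 down to int32.
t2 = t1.cast(pa.schema([('b', pa.string()), ('c', pa.int32())]))

assert t1.to_pydict() == t2.to_pydict()  # values match, so this passes
assert not t1.equals(t2)                 # schemas differ, so equals() is False
```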

##########
File path: python/pyarrow/dataset.py
##########
@@ -714,9 +729,12 @@ def write_dataset(data, base_dir, basename_template=None, format=None,
         and `format` is not specified, it defaults to the same format as the
         specified FileSystemDataset. When writing a Table or RecordBatch, this
         keyword is required.
-    partitioning : Partitioning, optional
+    partitioning : Partitioning or list[str], optional
         The partitioning scheme specified with the ``partitioning()``
-        function.
+        function or a list of field names.

Review comment:
       Maybe mention here that *if* passing a list of field names, you can also specify `partitioning_flavor`.
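
       For illustration, a hedged sketch of the combination being documented, mirroring the new tests in this PR (the output path is hypothetical):

```python
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})

# A list of field names infers the partitioning schema from the data;
# partitioning_flavor="hive" switches the directory layout to "b=x" style.
ds.write_dataset(table, '/tmp/dataset_example', format='parquet',
                 partitioning=['b'], partitioning_flavor='hive')
```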

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1569,14 +1570,6 @@ def test_partitioning_factory_segment_encoding():
         inferred_schema = factory.inspect()
 
 
-def test_dictionary_partitioning_outer_nulls_raises(tempdir):
-    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
-    part = ds.partitioning(
-        pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]))
-    with pytest.raises(pa.ArrowInvalid):
-        ds.write_dataset(table, tempdir, format='parquet', partitioning=part)

Review comment:
       Since the other tests below about how nulls are treated in partition fields are closely related to this one, I would leave this test at its current location.

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -3231,6 +3224,85 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
+def test_dictionary_partitioning_outer_nulls_raises(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+    part = ds.partitioning(
+        pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]))
+    with pytest.raises(pa.ArrowInvalid):
+        ds.write_dataset(table, tempdir, format='parquet', partitioning=part)
+
+
+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"], partitioning_flavor="hive")
+
+    load_back = ds.dataset(tempdir, partitioning="hive")
+    files = load_back.files
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in files
+    }
+    assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.equals(table)
+
+
+def test_write_dataset_with_scanner(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z'],
+                      'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset.scanner(columns=["b", "c"]), tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == {'b': ['x', 'y', 'z'],
+                                               'c': [1, 2, 3]}
+
+
+def test_write_dataset_with_dataset(tempdir):
+    table = pa.table({'b': ['x', 'y', 'z'], 'c': [1, 2, 3]})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    dataset = ds.dataset(tempdir, partitioning=["b"])
+
+    with tempfile.TemporaryDirectory() as tempdir2:
+        ds.write_dataset(dataset, tempdir2,
+                         format='parquet', partitioning=["b"])
+
+        load_back = ds.dataset(tempdir2, partitioning=["b"])
+        load_back_table = load_back.to_table()
+        assert load_back_table.to_pydict() == {'b': ['x', 'y', 'z'],
+                                               'c': [1, 2, 3]}

Review comment:
```suggestion
        assert load_back_table.equals(table)
```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

