jorisvandenbossche commented on a change in pull request #9130:
URL: https://github.com/apache/arrow/pull/9130#discussion_r555275756



##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -288,16 +288,58 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
 /// \brief Assemble lists of indices of identical rows.
 ///
 /// \param[in] by A StructArray whose columns will be used as grouping 
criteria.
-/// \return A StructArray mapping unique rows (in field "values", represented 
as a
-///         StructArray with the same fields as `by`) to lists of indices where
-///         that row appears (in field "groupings").
+///               Top level nulls are invalid, as are empty criteria (no 
grouping
+///               columns).
+/// \return An array of type `struct<values: by.type, groupings: list<int64>>`,
+///         which is a mapping from unique rows (field "values") to lists of
+///         indices into `by` where that row appears (field "groupings").
+///
+/// For example,
+///   MakeGroupings([
+///       {"a": "ex",  "b": 0},
+///       {"a": "ex",  "b": 0},
+///       {"a": "why", "b": 0},
+///       {"a": "why", "b": 0},
+///       {"a": "ex",  "b": 0},
+///       {"a": "why", "b": 1}
+///   ]) == [
+///       {"values": {"a": "ex",  "b": 0}, "groupings": [0, 1, 4]},
+///       {"values": {"a": "why", "b": 0}, "groupings": [2, 3]},
+///       {"values": {"a": "why", "b": 1}, "groupings": [5]}
+///   ]

Review comment:
       Nice docstrings! Those examples help a lot

##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -288,16 +288,58 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
 /// \brief Assemble lists of indices of identical rows.
 ///
 /// \param[in] by A StructArray whose columns will be used as grouping 
criteria.
-/// \return A StructArray mapping unique rows (in field "values", represented 
as a
-///         StructArray with the same fields as `by`) to lists of indices where
-///         that row appears (in field "groupings").
+///               Top level nulls are invalid, as are empty criteria (no 
grouping
+///               columns).
+/// \return An array of type `struct<values: by.type, groupings: list<int64>>`,
+///         which is a mapping from unique rows (field "values") to lists of
+///         indices into `by` where that row appears (field "groupings").
+///
+/// For example,
+///   MakeGroupings([
+///       {"a": "ex",  "b": 0},
+///       {"a": "ex",  "b": 0},
+///       {"a": "why", "b": 0},
+///       {"a": "why", "b": 0},
+///       {"a": "ex",  "b": 0},
+///       {"a": "why", "b": 1}
+///   ]) == [
+///       {"values": {"a": "ex",  "b": 0}, "groupings": [0, 1, 4]},
+///       {"values": {"a": "why", "b": 0}, "groupings": [2, 3]},
+///       {"values": {"a": "why", "b": 1}, "groupings": [5]}
+///   ]
 ARROW_DS_EXPORT
 Result<std::shared_ptr<StructArray>> MakeGroupings(const StructArray& by);
 
-/// \brief Produce slices of an Array which correspond to the provided 
groupings.
+/// \brief Produce a ListArray whose slots are selections of `array` which 
correspond to
+/// the provided groupings.
+///
+/// For example,
+///   ApplyGroupings([[0, 1, 4], [2, 3], [5]], [
+///       {"a": "ex",  "b": 0, "passenger": 0},
+///       {"a": "ex",  "b": 0, "passenger": 1},
+///       {"a": "why", "b": 0, "passenger": 2},
+///       {"a": "why", "b": 0, "passenger": 3},
+///       {"a": "ex",  "b": 0, "passenger": 4},
+///       {"a": "why", "b": 1, "passenger": 5}
+///   ]) == [
+///     [
+///       {"a": "ex",  "b": 0, "passenger": 0},
+///       {"a": "ex",  "b": 0, "passenger": 1},
+///       {"a": "ex",  "b": 0, "passenger": 4},
+///     ],
+///     [
+///       {"a": "why", "b": 0, "passenger": 2},
+///       {"a": "why", "b": 0, "passenger": 3},
+///     ],
+///     [
+///       {"a": "why", "b": 1, "passenger": 5}
+///     ]
+///   ]
 ARROW_DS_EXPORT
 Result<std::shared_ptr<ListArray>> ApplyGroupings(const ListArray& groupings,
                                                   const Array& array);
+
+/// \brief Produce slices of a RecordBatch which correspond to the provided 
groupings.

Review comment:
       Also here, "slices" -> "selections"?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1433,12 +1433,19 @@ cdef class DirectoryPartitioning(Partitioning):
     cdef:
         CDirectoryPartitioning* directory_partitioning
 
-    def __init__(self, Schema schema not None):
-        cdef shared_ptr[CDirectoryPartitioning] partitioning
-        partitioning = make_shared[CDirectoryPartitioning](
-            pyarrow_unwrap_schema(schema)
+    def __init__(self, Schema schema not None, dictionaries=None):

Review comment:
       Can you update the above docstring for this new keyword? (and same for 
HivePartitioning, and partitioning() below) 

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -2315,6 +2315,29 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
[email protected]
[email protected]
+def test_write_dataset_partitioned_dict(tempdir):
+    directory = tempdir / "partitioned"
+    _ = _create_parquet_dataset_partitioned(directory)
+
+    # directory partitioning, dictionary partition columns
+    dataset = ds.dataset(
+        directory,
+        partitioning=ds.HivePartitioning.discover(infer_dictionary=True))
+    target = tempdir / 'partitioned-dir-target'
+    expected_paths = [
+        target / "a", target / "a" / "part-0.feather",
+        target / "b", target / "b" / "part-1.feather"
+    ]
+    partitioning_schema = ds.partitioning(pa.schema([
+        dataset.schema.field('part')]),
+        dictionaries=[pa.array(['a', 'b'])])

Review comment:
       Moreover, the result doesn't seem to be affected if I set different 
values there than the actual values in the partition column.

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -2315,6 +2315,29 @@ def test_write_dataset_partitioned(tempdir):
         partitioning=partitioning_schema)
 
 
[email protected]
[email protected]
+def test_write_dataset_partitioned_dict(tempdir):
+    directory = tempdir / "partitioned"
+    _ = _create_parquet_dataset_partitioned(directory)
+
+    # directory partitioning, dictionary partition columns
+    dataset = ds.dataset(
+        directory,
+        partitioning=ds.HivePartitioning.discover(infer_dictionary=True))
+    target = tempdir / 'partitioned-dir-target'
+    expected_paths = [
+        target / "a", target / "a" / "part-0.feather",
+        target / "b", target / "b" / "part-1.feather"
+    ]
+    partitioning_schema = ds.partitioning(pa.schema([
+        dataset.schema.field('part')]),
+        dictionaries=[pa.array(['a', 'b'])])

Review comment:
       Are you required to pass those dictionaries for writing? 
   Based on a quick test locally, it seems to work without them as well.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to