This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 519b582ca0e [SPARK-45679][CONNECT] Add proto for clusterBy
519b582ca0e is described below

commit 519b582ca0e82f5766dcb92e2ea66be27cb76f9c
Author: Zhen Li <[email protected]>
AuthorDate: Fri Oct 27 10:47:17 2023 +0900

    [SPARK-45679][CONNECT] Add proto for clusterBy
    
    ### What changes were proposed in this pull request?
    Adding the proto change needed for clusterBy.
    Example of the clusterBy API:
    DataFrameWriterV1
    ```
    df.write
      .format(...)
      .clusterBy("clusteringColumn1", "clusteringColumn2")
      .save(...) or saveAsTable(...)
    ```
    
    DataFrameWriterV2
    ```
    df.writeTo(...).using(...)
      .clusterBy("clusteringColumn1", "clusteringColumn2")
      .create() or replace() or createOrReplace()
    ```
    
    ### Why are the changes needed?
    The first PR to introduce the proto change for the feature.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Existing tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #43544 from zhenlineo/clusterby.
    
    Authored-by: Zhen Li <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../src/main/protobuf/spark/connect/commands.proto |   6 +
 python/pyspark/sql/connect/proto/commands_pb2.py   | 144 ++++++++++-----------
 python/pyspark/sql/connect/proto/commands_pb2.pyi  |  18 +++
 3 files changed, 96 insertions(+), 72 deletions(-)

diff --git 
a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto 
b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
index 89a82c2c7e8..d120fb7b530 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto
@@ -127,6 +127,9 @@ message WriteOperation {
   // (Optional) A list of configuration options.
   map<string, string> options = 9;
 
+  // (Optional) Columns used for clustering the table.
+  repeated string clustering_columns = 10;
+
   message SaveTable {
     // (Required) The table name.
     string table_name = 1;
@@ -191,6 +194,9 @@ message WriteOperationV2 {
 
   // (Optional) A condition for overwrite saving mode
   Expression overwrite_condition = 8;
+
+  // (Optional) Columns used for clustering the table.
+  repeated string clustering_columns = 9;
 }
 
 // Starts write stream operation as streaming query. Query ID and Run ID of 
the streaming
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py 
b/python/pyspark/sql/connect/proto/commands_pb2.py
index a1cbea09b3c..309d34852d2 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.py
+++ b/python/pyspark/sql/connect/proto/commands_pb2.py
@@ -35,7 +35,7 @@ from pyspark.sql.connect.proto import relations_pb2 as 
spark_dot_connect_dot_rel
 
 
 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-    
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xf5\x07\n\x07\x43ommand\x12]\n\x11register_function\x18\x01
 
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02
 
\x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x
 [...]
+    
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xf5\x07\n\x07\x43ommand\x12]\n\x11register_function\x18\x01
 
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02
 
\x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x
 [...]
 )
 
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -74,75 +74,75 @@ if _descriptor._USE_C_DESCRIPTORS == False:
     _CREATEDATAFRAMEVIEWCOMMAND._serialized_start = 1681
     _CREATEDATAFRAMEVIEWCOMMAND._serialized_end = 1831
     _WRITEOPERATION._serialized_start = 1834
-    _WRITEOPERATION._serialized_end = 2885
-    _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2309
-    _WRITEOPERATION_OPTIONSENTRY._serialized_end = 2367
-    _WRITEOPERATION_SAVETABLE._serialized_start = 2370
-    _WRITEOPERATION_SAVETABLE._serialized_end = 2628
-    _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 2504
-    _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 2628
-    _WRITEOPERATION_BUCKETBY._serialized_start = 2630
-    _WRITEOPERATION_BUCKETBY._serialized_end = 2721
-    _WRITEOPERATION_SAVEMODE._serialized_start = 2724
-    _WRITEOPERATION_SAVEMODE._serialized_end = 2861
-    _WRITEOPERATIONV2._serialized_start = 2888
-    _WRITEOPERATIONV2._serialized_end = 3701
-    _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2309
-    _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 2367
-    _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 3460
-    _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 3526
-    _WRITEOPERATIONV2_MODE._serialized_start = 3529
-    _WRITEOPERATIONV2_MODE._serialized_end = 3688
-    _WRITESTREAMOPERATIONSTART._serialized_start = 3704
-    _WRITESTREAMOPERATIONSTART._serialized_end = 4504
-    _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2309
-    _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 2367
-    _STREAMINGFOREACHFUNCTION._serialized_start = 4507
-    _STREAMINGFOREACHFUNCTION._serialized_end = 4686
-    _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 4688
-    _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 4809
-    _STREAMINGQUERYINSTANCEID._serialized_start = 4811
-    _STREAMINGQUERYINSTANCEID._serialized_end = 4876
-    _STREAMINGQUERYCOMMAND._serialized_start = 4879
-    _STREAMINGQUERYCOMMAND._serialized_end = 5511
-    _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 5378
-    _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 5422
-    _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 5424
-    _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 5500
-    _STREAMINGQUERYCOMMANDRESULT._serialized_start = 5514
-    _STREAMINGQUERYCOMMANDRESULT._serialized_end = 6655
-    _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6097
-    _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 6267
-    _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 6269
-    _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 6341
-    _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 6343
-    _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 6382
-    _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 6385
-    _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 6582
-    _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 
6584
-    _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 6640
-    _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 6658
-    _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 7487
-    _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start 
= 7189
-    _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end = 
7268
-    
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start = 
7271
-    
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end = 
7476
-    _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 7490
-    _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 8566
-    _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8098
-    _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 8225
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start = 
8227
-    _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end 
= 8342
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start 
= 8344
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end = 
8403
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start
 = 8405
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end
 = 8480
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start
 = 8482
-    
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end
 = 8551
-    _GETRESOURCESCOMMAND._serialized_start = 8568
-    _GETRESOURCESCOMMAND._serialized_end = 8589
-    _GETRESOURCESCOMMANDRESULT._serialized_start = 8592
-    _GETRESOURCESCOMMANDRESULT._serialized_end = 8804
-    _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 8708
-    _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 8804
+    _WRITEOPERATION._serialized_end = 2932
+    _WRITEOPERATION_OPTIONSENTRY._serialized_start = 2356
+    _WRITEOPERATION_OPTIONSENTRY._serialized_end = 2414
+    _WRITEOPERATION_SAVETABLE._serialized_start = 2417
+    _WRITEOPERATION_SAVETABLE._serialized_end = 2675
+    _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_start = 2551
+    _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD._serialized_end = 2675
+    _WRITEOPERATION_BUCKETBY._serialized_start = 2677
+    _WRITEOPERATION_BUCKETBY._serialized_end = 2768
+    _WRITEOPERATION_SAVEMODE._serialized_start = 2771
+    _WRITEOPERATION_SAVEMODE._serialized_end = 2908
+    _WRITEOPERATIONV2._serialized_start = 2935
+    _WRITEOPERATIONV2._serialized_end = 3795
+    _WRITEOPERATIONV2_OPTIONSENTRY._serialized_start = 2356
+    _WRITEOPERATIONV2_OPTIONSENTRY._serialized_end = 2414
+    _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_start = 3554
+    _WRITEOPERATIONV2_TABLEPROPERTIESENTRY._serialized_end = 3620
+    _WRITEOPERATIONV2_MODE._serialized_start = 3623
+    _WRITEOPERATIONV2_MODE._serialized_end = 3782
+    _WRITESTREAMOPERATIONSTART._serialized_start = 3798
+    _WRITESTREAMOPERATIONSTART._serialized_end = 4598
+    _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_start = 2356
+    _WRITESTREAMOPERATIONSTART_OPTIONSENTRY._serialized_end = 2414
+    _STREAMINGFOREACHFUNCTION._serialized_start = 4601
+    _STREAMINGFOREACHFUNCTION._serialized_end = 4780
+    _WRITESTREAMOPERATIONSTARTRESULT._serialized_start = 4782
+    _WRITESTREAMOPERATIONSTARTRESULT._serialized_end = 4903
+    _STREAMINGQUERYINSTANCEID._serialized_start = 4905
+    _STREAMINGQUERYINSTANCEID._serialized_end = 4970
+    _STREAMINGQUERYCOMMAND._serialized_start = 4973
+    _STREAMINGQUERYCOMMAND._serialized_end = 5605
+    _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 5472
+    _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 5516
+    _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 5518
+    _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 5594
+    _STREAMINGQUERYCOMMANDRESULT._serialized_start = 5608
+    _STREAMINGQUERYCOMMANDRESULT._serialized_end = 6749
+    _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 6191
+    _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 6361
+    _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 6363
+    _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 6435
+    _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 6437
+    _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 6476
+    _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 6479
+    _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 6676
+    _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 
6678
+    _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 6734
+    _STREAMINGQUERYMANAGERCOMMAND._serialized_start = 6752
+    _STREAMINGQUERYMANAGERCOMMAND._serialized_end = 7581
+    _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_start 
= 7283
+    _STREAMINGQUERYMANAGERCOMMAND_AWAITANYTERMINATIONCOMMAND._serialized_end = 
7362
+    
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_start = 
7365
+    
_STREAMINGQUERYMANAGERCOMMAND_STREAMINGQUERYLISTENERCOMMAND._serialized_end = 
7570
+    _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_start = 7584
+    _STREAMINGQUERYMANAGERCOMMANDRESULT._serialized_end = 8660
+    _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_start = 8192
+    _STREAMINGQUERYMANAGERCOMMANDRESULT_ACTIVERESULT._serialized_end = 8319
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_start = 
8321
+    _STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYINSTANCE._serialized_end 
= 8436
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_start 
= 8438
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_AWAITANYTERMINATIONRESULT._serialized_end = 
8497
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_start
 = 8499
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_STREAMINGQUERYLISTENERINSTANCE._serialized_end
 = 8574
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_start
 = 8576
+    
_STREAMINGQUERYMANAGERCOMMANDRESULT_LISTSTREAMINGQUERYLISTENERRESULT._serialized_end
 = 8645
+    _GETRESOURCESCOMMAND._serialized_start = 8662
+    _GETRESOURCESCOMMAND._serialized_end = 8683
+    _GETRESOURCESCOMMANDRESULT._serialized_start = 8686
+    _GETRESOURCESCOMMANDRESULT._serialized_end = 8898
+    _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 8802
+    _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 8898
 # @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi 
b/python/pyspark/sql/connect/proto/commands_pb2.pyi
index 7855f403e9c..ce7f3e5abfd 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi
@@ -489,6 +489,7 @@ class WriteOperation(google.protobuf.message.Message):
     PARTITIONING_COLUMNS_FIELD_NUMBER: builtins.int
     BUCKET_BY_FIELD_NUMBER: builtins.int
     OPTIONS_FIELD_NUMBER: builtins.int
+    CLUSTERING_COLUMNS_FIELD_NUMBER: builtins.int
     @property
     def input(self) -> pyspark.sql.connect.proto.relations_pb2.Relation:
         """(Required) The output of the `input` relation will be persisted 
according to the options."""
@@ -517,6 +518,11 @@ class WriteOperation(google.protobuf.message.Message):
     @property
     def options(self) -> 
google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]:
         """(Optional) A list of configuration options."""
+    @property
+    def clustering_columns(
+        self,
+    ) -> 
google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """(Optional) Columns used for clustering the table."""
     def __init__(
         self,
         *,
@@ -529,6 +535,7 @@ class WriteOperation(google.protobuf.message.Message):
         partitioning_columns: collections.abc.Iterable[builtins.str] | None = 
...,
         bucket_by: global___WriteOperation.BucketBy | None = ...,
         options: collections.abc.Mapping[builtins.str, builtins.str] | None = 
...,
+        clustering_columns: collections.abc.Iterable[builtins.str] | None = 
...,
     ) -> None: ...
     def HasField(
         self,
@@ -556,6 +563,8 @@ class WriteOperation(google.protobuf.message.Message):
             b"_source",
             "bucket_by",
             b"bucket_by",
+            "clustering_columns",
+            b"clustering_columns",
             "input",
             b"input",
             "mode",
@@ -662,6 +671,7 @@ class WriteOperationV2(google.protobuf.message.Message):
     TABLE_PROPERTIES_FIELD_NUMBER: builtins.int
     MODE_FIELD_NUMBER: builtins.int
     OVERWRITE_CONDITION_FIELD_NUMBER: builtins.int
+    CLUSTERING_COLUMNS_FIELD_NUMBER: builtins.int
     @property
     def input(self) -> pyspark.sql.connect.proto.relations_pb2.Relation:
         """(Required) The output of the `input` relation will be persisted 
according to the options."""
@@ -693,6 +703,11 @@ class WriteOperationV2(google.protobuf.message.Message):
     @property
     def overwrite_condition(self) -> 
pyspark.sql.connect.proto.expressions_pb2.Expression:
         """(Optional) A condition for overwrite saving mode"""
+    @property
+    def clustering_columns(
+        self,
+    ) -> 
google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+        """(Optional) Columns used for clustering the table."""
     def __init__(
         self,
         *,
@@ -707,6 +722,7 @@ class WriteOperationV2(google.protobuf.message.Message):
         table_properties: collections.abc.Mapping[builtins.str, builtins.str] 
| None = ...,
         mode: global___WriteOperationV2.Mode.ValueType = ...,
         overwrite_condition: 
pyspark.sql.connect.proto.expressions_pb2.Expression | None = ...,
+        clustering_columns: collections.abc.Iterable[builtins.str] | None = 
...,
     ) -> None: ...
     def HasField(
         self,
@@ -726,6 +742,8 @@ class WriteOperationV2(google.protobuf.message.Message):
         field_name: typing_extensions.Literal[
             "_provider",
             b"_provider",
+            "clustering_columns",
+            b"clustering_columns",
             "input",
             b"input",
             "mode",


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to