This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new f01a8db4bcf [SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide f01a8db4bcf is described below commit f01a8db4bcf09c4975029e85722053ff82f8a355 Author: Rui Wang <rui.w...@databricks.com> AuthorDate: Thu Nov 17 20:00:52 2022 +0900 [SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide ### What changes were proposed in this pull request? As we have a guidance for Connect proto ([adding proto messages](https://github.com/apache/spark/blob/master/connector/connect/docs/adding-proto-messages.md)), this PR updates `relations.proto` to follow the development guide. This PR also adds some missing documentation for the proto. ### Why are the changes needed? 1. Follow development guide. 2. Improve proto Documentation. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? Existing UT Closes #38678 from amaliujia/improve_relation_proto_to_follow_proto_rules. Authored-by: Rui Wang <rui.w...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../main/protobuf/spark/connect/relations.proto | 128 +++++++-- python/pyspark/sql/connect/proto/relations_pb2.py | 114 ++++---- python/pyspark/sql/connect/proto/relations_pb2.pyi | 302 +++++++++++++++------ 3 files changed, 372 insertions(+), 172 deletions(-) diff --git a/connector/connect/src/main/protobuf/spark/connect/relations.proto b/connector/connect/src/main/protobuf/spark/connect/relations.proto index 8fa10c4e093..52ff780d093 100644 --- a/connector/connect/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/src/main/protobuf/spark/connect/relations.proto @@ -67,11 +67,13 @@ message Unknown {} // Common metadata of all relations. message RelationCommon { + // (Required) Shared relation metadata. string source_info = 1; } // Relation that uses a SQL query to generate the output. message SQL { + // (Required) The SQL query. string query = 1; } @@ -84,15 +86,20 @@ message Read { } message NamedTable { + // (Required) Unparsed identifier for the table. string unparsed_identifier = 1; } message DataSource { - // Required. Supported formats include: parquet, orc, text, json, parquet, csv, avro. + // (Required) Supported formats include: parquet, orc, text, json, parquet, csv, avro. string format = 1; - // Optional. If not set, Spark will infer the schema. - string schema = 2; - // The key is case insensitive. + + // (Optional) If not set, Spark will infer the schema. + optional string schema = 2; + + // Options for the data source. The context of this map varies based on the + // data source format. This options could be empty for valid data source format. + // The map key is case insensitive. map<string, string> options = 3; } } @@ -106,13 +113,18 @@ message Project { // // For example, `SELECT ABS(-1)` is valid plan without an input plan. Relation input = 1; + + // (Required) A Project requires at least one expression. repeated Expression expressions = 3; } // Relation that applies a boolean expression `condition` on each row of `input` to produce // the output result. message Filter { + // (Required) Input relation for a Filter. Relation input = 1; + + // (Required) A Filter must have a condition expression. Expression condition = 2; } @@ -120,10 +132,20 @@ message Filter { // // `left` and `right` must be present. message Join { + // (Required) Left input relation for a Join. Relation left = 1; + + // (Required) Right input relation for a Join. Relation right = 2; + + // (Optional) The join condition. Could be unset when `using_columns` is utilized. + // + // This field does not co-exist with using_columns. Expression join_condition = 3; + + // (Required) The join type. JoinType join_type = 4; + // Optional. using_columns provides a list of columns that should present on both sides of // the join inputs that this Join will join on. For example A JOIN B USING col_name is // equivalent to A JOIN B on A.col_name = B.col_name. @@ -144,11 +166,25 @@ message Join { // Relation of type [[SetOperation]] message SetOperation { + // (Required) Left input relation for a Set operation. Relation left_input = 1; + + // (Required) Right input relation for a Set operation. Relation right_input = 2; + + // (Required) The Set operation type. SetOpType set_op_type = 3; - bool is_all = 4; - bool by_name = 5; + + // (Optional) If to remove duplicate rows. + // + // True to preserve all results. + // False to remove duplicate rows. + optional bool is_all = 4; + + // (Optional) If to perform the Set operation based on name resolution. + // + // Only UNION supports this option. + optional bool by_name = 5; enum SetOpType { SET_OP_TYPE_UNSPECIFIED = 0; @@ -160,29 +196,42 @@ message SetOperation { // Relation of type [[Limit]] that is used to `limit` rows from the input relation. message Limit { + // (Required) Input relation for a Limit. Relation input = 1; + + // (Required) the limit. int32 limit = 2; } // Relation of type [[Offset]] that is used to read rows staring from the `offset` on // the input relation. message Offset { + // (Required) Input relation for an Offset. Relation input = 1; + + // (Required) the limit. int32 offset = 2; } // Relation of type [[Aggregate]]. message Aggregate { + // (Required) Input relation for a Aggregate. Relation input = 1; + repeated Expression grouping_expressions = 2; repeated Expression result_expressions = 3; } // Relation of type [[Sort]]. message Sort { + // (Required) Input relation for a Sort. Relation input = 1; + + // (Required) Sort fields. repeated SortField sort_fields = 2; - bool is_global = 3; + + // (Optional) if this is a global sort. + optional bool is_global = 3; message SortField { Expression expression = 1; @@ -206,33 +255,56 @@ message Sort { // Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only // the subset of columns or all the columns. message Deduplicate { + // (Required) Input relation for a Deduplicate. Relation input = 1; + + // (Optional) Deduplicate based on a list of column names. + // + // This field does not co-use with `all_columns_as_keys`. repeated string column_names = 2; - bool all_columns_as_keys = 3; + + // (Optional) Deduplicate based on all the columns of the input relation. + // + // This field does not co-use with `column_names`. + optional bool all_columns_as_keys = 3; } +// A relation that does not need to be qualified by name. message LocalRelation { + // (Optional) A list qualified attributes. repeated Expression.QualifiedAttribute attributes = 1; // TODO: support local data. } // Relation of type [[Sample]] that samples a fraction of the dataset. message Sample { + // (Required) Input relation for a Sample. Relation input = 1; + + // (Required) lower bound. double lower_bound = 2; + + // (Required) upper bound. double upper_bound = 3; - bool with_replacement = 4; + + // (Optional) Whether to sample with replacement. + optional bool with_replacement = 4; + + // (Optional) The random seed. optional int64 seed = 5; } // Relation of type [[Range]] that generates a sequence of integers. message Range { - // Optional. Default value = 0 - int64 start = 1; - // Required. + // (Optional) Default value = 0 + optional int64 start = 1; + + // (Required) int64 end = 2; - // Required. + + // (Required) int64 step = 3; + // Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if // it is set, or 2) spark default parallelism. optional int32 num_partitions = 4; @@ -240,24 +312,26 @@ message Range { // Relation alias. message SubqueryAlias { - // Required. The input relation. + // (Required) The input relation of SubqueryAlias. Relation input = 1; - // Required. The alias. + + // (Required) The alias. string alias = 2; - // Optional. Qualifier of the alias. + + // (Optional) Qualifier of the alias. repeated string qualifier = 3; } // Relation repartition. message Repartition { - // Required. The input relation. + // (Required) The input relation of Repartition. Relation input = 1; - // Required. Must be positive. + // (Required) Must be positive. int32 num_partitions = 2; - // Optional. Default value is false. - bool shuffle = 3; + // (Optional) Default value is false. + optional bool shuffle = 3; } // Compose the string representing rows for output. @@ -267,14 +341,14 @@ message ShowString { Relation input = 1; // (Required) Number of rows to show. - optional int32 numRows = 2; + int32 numRows = 2; // (Required) If set to more than 0, truncates strings to // `truncate` characters and all cells will be aligned right. - optional int32 truncate = 3; + int32 truncate = 3; // (Required) If set to true, prints output rows vertically (one line per column value). - optional bool vertical = 4; + bool vertical = 4; } // Computes specified statistics for numeric and string columns. @@ -344,10 +418,10 @@ message NAFill { // Rename columns on the input relation by the same length of names. message RenameColumnsBySameLengthNames { - // Required. The input relation. + // (Required) The input relation of RenameColumnsBySameLengthNames. Relation input = 1; - // Required. + // (Required) // // The number of columns of the input relation must be equal to the length // of this field. If this is not true, an exception will be returned. @@ -357,11 +431,11 @@ message RenameColumnsBySameLengthNames { // Rename columns on the input relation by a map with name to name mapping. message RenameColumnsByNameToNameMap { - // Required. The input relation. + // (Required) The input relation. Relation input = 1; - // Required. + // (Required) // // Renaming column names of input relation from A to B where A is the map key // and B is the map value. This is a no-op if schema doesn't contain any A. It diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 7e77d56eaa6..9bc5e75ea64 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -32,7 +32,7 @@ from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_e DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa6\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0 [...] + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa6\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0 [...] ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -54,63 +54,63 @@ if _descriptor._USE_C_DESCRIPTORS == False: _SQL._serialized_start = 1592 _SQL._serialized_end = 1619 _READ._serialized_start = 1622 - _READ._serialized_end = 2032 + _READ._serialized_end = 2048 _READ_NAMEDTABLE._serialized_start = 1764 _READ_NAMEDTABLE._serialized_end = 1825 _READ_DATASOURCE._serialized_start = 1828 - _READ_DATASOURCE._serialized_end = 2019 - _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 1961 - _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2019 - _PROJECT._serialized_start = 2034 - _PROJECT._serialized_end = 2151 - _FILTER._serialized_start = 2153 - _FILTER._serialized_end = 2265 - _JOIN._serialized_start = 2268 - _JOIN._serialized_end = 2718 - _JOIN_JOINTYPE._serialized_start = 2531 - _JOIN_JOINTYPE._serialized_end = 2718 - _SETOPERATION._serialized_start = 2721 - _SETOPERATION._serialized_end = 3084 - _SETOPERATION_SETOPTYPE._serialized_start = 2970 - _SETOPERATION_SETOPTYPE._serialized_end = 3084 - _LIMIT._serialized_start = 3086 - _LIMIT._serialized_end = 3162 - _OFFSET._serialized_start = 3164 - _OFFSET._serialized_end = 3243 - _AGGREGATE._serialized_start = 3246 - _AGGREGATE._serialized_end = 3456 - _SORT._serialized_start = 3459 - _SORT._serialized_end = 3990 - _SORT_SORTFIELD._serialized_start = 3608 - _SORT_SORTFIELD._serialized_end = 3796 - _SORT_SORTDIRECTION._serialized_start = 3798 - _SORT_SORTDIRECTION._serialized_end = 3906 - _SORT_SORTNULLS._serialized_start = 3908 - _SORT_SORTNULLS._serialized_end = 3990 - _DEDUPLICATE._serialized_start = 3993 - _DEDUPLICATE._serialized_end = 4135 - _LOCALRELATION._serialized_start = 4137 - _LOCALRELATION._serialized_end = 4230 - _SAMPLE._serialized_start = 4233 - _SAMPLE._serialized_end = 4431 - _RANGE._serialized_start = 4434 - _RANGE._serialized_end = 4564 - _SUBQUERYALIAS._serialized_start = 4566 - _SUBQUERYALIAS._serialized_end = 4680 - _REPARTITION._serialized_start = 4682 - _REPARTITION._serialized_end = 4807 - _SHOWSTRING._serialized_start = 4810 - _SHOWSTRING._serialized_end = 5004 - _STATSUMMARY._serialized_start = 5006 - _STATSUMMARY._serialized_end = 5098 - _STATCROSSTAB._serialized_start = 5100 - _STATCROSSTAB._serialized_end = 5201 - _NAFILL._serialized_start = 5204 - _NAFILL._serialized_end = 5338 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5340 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5454 - _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5457 - _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5716 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5649 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5716 + _READ_DATASOURCE._serialized_end = 2035 + _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 1966 + _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2024 + _PROJECT._serialized_start = 2050 + _PROJECT._serialized_end = 2167 + _FILTER._serialized_start = 2169 + _FILTER._serialized_end = 2281 + _JOIN._serialized_start = 2284 + _JOIN._serialized_end = 2734 + _JOIN_JOINTYPE._serialized_start = 2547 + _JOIN_JOINTYPE._serialized_end = 2734 + _SETOPERATION._serialized_start = 2737 + _SETOPERATION._serialized_end = 3133 + _SETOPERATION_SETOPTYPE._serialized_start = 2996 + _SETOPERATION_SETOPTYPE._serialized_end = 3110 + _LIMIT._serialized_start = 3135 + _LIMIT._serialized_end = 3211 + _OFFSET._serialized_start = 3213 + _OFFSET._serialized_end = 3292 + _AGGREGATE._serialized_start = 3295 + _AGGREGATE._serialized_end = 3505 + _SORT._serialized_start = 3508 + _SORT._serialized_end = 4058 + _SORT_SORTFIELD._serialized_start = 3662 + _SORT_SORTFIELD._serialized_end = 3850 + _SORT_SORTDIRECTION._serialized_start = 3852 + _SORT_SORTDIRECTION._serialized_end = 3960 + _SORT_SORTNULLS._serialized_start = 3962 + _SORT_SORTNULLS._serialized_end = 4044 + _DEDUPLICATE._serialized_start = 4061 + _DEDUPLICATE._serialized_end = 4232 + _LOCALRELATION._serialized_start = 4234 + _LOCALRELATION._serialized_end = 4327 + _SAMPLE._serialized_start = 4330 + _SAMPLE._serialized_end = 4554 + _RANGE._serialized_start = 4557 + _RANGE._serialized_end = 4702 + _SUBQUERYALIAS._serialized_start = 4704 + _SUBQUERYALIAS._serialized_end = 4818 + _REPARTITION._serialized_start = 4821 + _REPARTITION._serialized_end = 4963 + _SHOWSTRING._serialized_start = 4966 + _SHOWSTRING._serialized_end = 5107 + _STATSUMMARY._serialized_start = 5109 + _STATSUMMARY._serialized_end = 5201 + _STATCROSSTAB._serialized_start = 5203 + _STATCROSSTAB._serialized_end = 5304 + _NAFILL._serialized_start = 5307 + _NAFILL._serialized_end = 5441 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5443 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5557 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5560 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5819 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5752 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5819 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 762ccd8c0e8..27c6db4e748 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -319,6 +319,7 @@ class RelationCommon(google.protobuf.message.Message): SOURCE_INFO_FIELD_NUMBER: builtins.int source_info: builtins.str + """(Required) Shared relation metadata.""" def __init__( self, *, @@ -337,6 +338,7 @@ class SQL(google.protobuf.message.Message): QUERY_FIELD_NUMBER: builtins.int query: builtins.str + """(Required) The SQL query.""" def __init__( self, *, @@ -358,6 +360,7 @@ class Read(google.protobuf.message.Message): UNPARSED_IDENTIFIER_FIELD_NUMBER: builtins.int unparsed_identifier: builtins.str + """(Required) Unparsed identifier for the table.""" def __init__( self, *, @@ -392,27 +395,43 @@ class Read(google.protobuf.message.Message): SCHEMA_FIELD_NUMBER: builtins.int OPTIONS_FIELD_NUMBER: builtins.int format: builtins.str - """Required. Supported formats include: parquet, orc, text, json, parquet, csv, avro.""" + """(Required) Supported formats include: parquet, orc, text, json, parquet, csv, avro.""" schema: builtins.str - """Optional. If not set, Spark will infer the schema.""" + """(Optional) If not set, Spark will infer the schema.""" @property def options( self, ) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]: - """The key is case insensitive.""" + """Options for the data source. The context of this map varies based on the + data source format. This options could be empty for valid data source format. + The map key is case insensitive. + """ def __init__( self, *, format: builtins.str = ..., - schema: builtins.str = ..., + schema: builtins.str | None = ..., options: collections.abc.Mapping[builtins.str, builtins.str] | None = ..., ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["_schema", b"_schema", "schema", b"schema"] + ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "format", b"format", "options", b"options", "schema", b"schema" + "_schema", + b"_schema", + "format", + b"format", + "options", + b"options", + "schema", + b"schema", ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_schema", b"_schema"] + ) -> typing_extensions.Literal["schema"] | None: ... NAMED_TABLE_FIELD_NUMBER: builtins.int DATA_SOURCE_FIELD_NUMBER: builtins.int @@ -466,7 +485,8 @@ class Project(google.protobuf.message.Message): self, ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ pyspark.sql.connect.proto.expressions_pb2.Expression - ]: ... + ]: + """(Required) A Project requires at least one expression.""" def __init__( self, *, @@ -494,9 +514,11 @@ class Filter(google.protobuf.message.Message): INPUT_FIELD_NUMBER: builtins.int CONDITION_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Filter.""" @property - def condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: ... + def condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: + """(Required) A Filter must have a condition expression.""" def __init__( self, *, @@ -552,12 +574,19 @@ class Join(google.protobuf.message.Message): JOIN_TYPE_FIELD_NUMBER: builtins.int USING_COLUMNS_FIELD_NUMBER: builtins.int @property - def left(self) -> global___Relation: ... + def left(self) -> global___Relation: + """(Required) Left input relation for a Join.""" @property - def right(self) -> global___Relation: ... + def right(self) -> global___Relation: + """(Required) Right input relation for a Join.""" @property - def join_condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: ... + def join_condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: + """(Optional) The join condition. Could be unset when `using_columns` is utilized. + + This field does not co-exist with using_columns. + """ join_type: global___Join.JoinType.ValueType + """(Required) The join type.""" @property def using_columns( self, @@ -634,30 +663,57 @@ class SetOperation(google.protobuf.message.Message): IS_ALL_FIELD_NUMBER: builtins.int BY_NAME_FIELD_NUMBER: builtins.int @property - def left_input(self) -> global___Relation: ... + def left_input(self) -> global___Relation: + """(Required) Left input relation for a Set operation.""" @property - def right_input(self) -> global___Relation: ... + def right_input(self) -> global___Relation: + """(Required) Right input relation for a Set operation.""" set_op_type: global___SetOperation.SetOpType.ValueType + """(Required) The Set operation type.""" is_all: builtins.bool + """(Optional) If to remove duplicate rows. + + True to preserve all results. + False to remove duplicate rows. + """ by_name: builtins.bool + """(Optional) If to perform the Set operation based on name resolution. + + Only UNION supports this option. + """ def __init__( self, *, left_input: global___Relation | None = ..., right_input: global___Relation | None = ..., set_op_type: global___SetOperation.SetOpType.ValueType = ..., - is_all: builtins.bool = ..., - by_name: builtins.bool = ..., + is_all: builtins.bool | None = ..., + by_name: builtins.bool | None = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ - "left_input", b"left_input", "right_input", b"right_input" + "_by_name", + b"_by_name", + "_is_all", + b"_is_all", + "by_name", + b"by_name", + "is_all", + b"is_all", + "left_input", + b"left_input", + "right_input", + b"right_input", ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ + "_by_name", + b"_by_name", + "_is_all", + b"_is_all", "by_name", b"by_name", "is_all", @@ -670,6 +726,14 @@ class SetOperation(google.protobuf.message.Message): b"set_op_type", ], ) -> None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_by_name", b"_by_name"] + ) -> typing_extensions.Literal["by_name"] | None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_is_all", b"_is_all"] + ) -> typing_extensions.Literal["is_all"] | None: ... global___SetOperation = SetOperation @@ -681,8 +745,10 @@ class Limit(google.protobuf.message.Message): INPUT_FIELD_NUMBER: builtins.int LIMIT_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Limit.""" limit: builtins.int + """(Required) the limit.""" def __init__( self, *, @@ -708,8 +774,10 @@ class Offset(google.protobuf.message.Message): INPUT_FIELD_NUMBER: builtins.int OFFSET_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for an Offset.""" offset: builtins.int + """(Required) the limit.""" def __init__( self, *, @@ -734,7 +802,8 @@ class Aggregate(google.protobuf.message.Message): GROUPING_EXPRESSIONS_FIELD_NUMBER: builtins.int RESULT_EXPRESSIONS_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Aggregate.""" @property def grouping_expressions( self, @@ -849,30 +918,46 @@ class Sort(google.protobuf.message.Message): SORT_FIELDS_FIELD_NUMBER: builtins.int IS_GLOBAL_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Sort.""" @property def sort_fields( self, ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ global___Sort.SortField - ]: ... + ]: + """(Required) Sort fields.""" is_global: builtins.bool + """(Optional) if this is a global sort.""" def __init__( self, *, input: global___Relation | None = ..., sort_fields: collections.abc.Iterable[global___Sort.SortField] | None = ..., - is_global: builtins.bool = ..., + is_global: builtins.bool | None = ..., ) -> None: ... def HasField( - self, field_name: typing_extensions.Literal["input", b"input"] + self, + field_name: typing_extensions.Literal[ + "_is_global", b"_is_global", "input", b"input", "is_global", b"is_global" + ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "input", b"input", "is_global", b"is_global", "sort_fields", b"sort_fields" + "_is_global", + b"_is_global", + "input", + b"input", + "is_global", + b"is_global", + "sort_fields", + b"sort_fields", ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_is_global", b"_is_global"] + ) -> typing_extensions.Literal["is_global"] | None: ... global___Sort = Sort @@ -887,25 +972,44 @@ class Deduplicate(google.protobuf.message.Message): COLUMN_NAMES_FIELD_NUMBER: builtins.int ALL_COLUMNS_AS_KEYS_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Deduplicate.""" @property def column_names( self, - ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: + """(Optional) Deduplicate based on a list of column names. + + This field does not co-use with `all_columns_as_keys`. + """ all_columns_as_keys: builtins.bool + """(Optional) Deduplicate based on all the columns of the input relation. + + This field does not co-use with `column_names`. + """ def __init__( self, *, input: global___Relation | None = ..., column_names: collections.abc.Iterable[builtins.str] | None = ..., - all_columns_as_keys: builtins.bool = ..., + all_columns_as_keys: builtins.bool | None = ..., ) -> None: ... def HasField( - self, field_name: typing_extensions.Literal["input", b"input"] + self, + field_name: typing_extensions.Literal[ + "_all_columns_as_keys", + b"_all_columns_as_keys", + "all_columns_as_keys", + b"all_columns_as_keys", + "input", + b"input", + ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ + "_all_columns_as_keys", + b"_all_columns_as_keys", "all_columns_as_keys", b"all_columns_as_keys", "column_names", @@ -914,10 +1018,16 @@ class Deduplicate(google.protobuf.message.Message): b"input", ], ) -> None: ... + def WhichOneof( + self, + oneof_group: typing_extensions.Literal["_all_columns_as_keys", b"_all_columns_as_keys"], + ) -> typing_extensions.Literal["all_columns_as_keys"] | None: ... global___Deduplicate = Deduplicate class LocalRelation(google.protobuf.message.Message): + """A relation that does not need to be qualified by name.""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor ATTRIBUTES_FIELD_NUMBER: builtins.int @@ -927,7 +1037,9 @@ class LocalRelation(google.protobuf.message.Message): ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ pyspark.sql.connect.proto.expressions_pb2.Expression.QualifiedAttribute ]: - """TODO: support local data.""" + """(Optional) A list qualified attributes. + TODO: support local data. + """ def __init__( self, *, @@ -953,24 +1065,38 @@ class Sample(google.protobuf.message.Message): WITH_REPLACEMENT_FIELD_NUMBER: builtins.int SEED_FIELD_NUMBER: builtins.int @property - def input(self) -> global___Relation: ... + def input(self) -> global___Relation: + """(Required) Input relation for a Sample.""" lower_bound: builtins.float + """(Required) lower bound.""" upper_bound: builtins.float + """(Required) upper bound.""" with_replacement: builtins.bool + """(Optional) Whether to sample with replacement.""" seed: builtins.int + """(Optional) The random seed.""" def __init__( self, *, input: global___Relation | None = ..., lower_bound: builtins.float = ..., upper_bound: builtins.float = ..., - with_replacement: builtins.bool = ..., + with_replacement: builtins.bool | None = ..., seed: builtins.int | None = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ - "_seed", b"_seed", "input", b"input", "seed", b"seed" + "_seed", + b"_seed", + "_with_replacement", + b"_with_replacement", + "input", + b"input", + "seed", + b"seed", + "with_replacement", + b"with_replacement", ], ) -> builtins.bool: ... def ClearField( @@ -978,6 +1104,8 @@ class Sample(google.protobuf.message.Message): field_name: typing_extensions.Literal[ "_seed", b"_seed", + "_with_replacement", + b"_with_replacement", "input", b"input", "lower_bound", @@ -990,9 +1118,14 @@ class Sample(google.protobuf.message.Message): b"with_replacement", ], ) -> None: ... + @typing.overload def WhichOneof( self, oneof_group: typing_extensions.Literal["_seed", b"_seed"] ) -> typing_extensions.Literal["seed"] | None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_with_replacement", b"_with_replacement"] + ) -> typing_extensions.Literal["with_replacement"] | None: ... global___Sample = Sample @@ -1006,11 +1139,11 @@ class Range(google.protobuf.message.Message): STEP_FIELD_NUMBER: builtins.int NUM_PARTITIONS_FIELD_NUMBER: builtins.int start: builtins.int - """Optional. Default value = 0""" + """(Optional) Default value = 0""" end: builtins.int - """Required.""" + """(Required)""" step: builtins.int - """Required.""" + """(Required)""" num_partitions: builtins.int """Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if it is set, or 2) spark default parallelism. @@ -1018,7 +1151,7 @@ class Range(google.protobuf.message.Message): def __init__( self, *, - start: builtins.int = ..., + start: builtins.int | None = ..., end: builtins.int = ..., step: builtins.int = ..., num_partitions: builtins.int | None = ..., @@ -1026,7 +1159,14 @@ class Range(google.protobuf.message.Message): def HasField( self, field_name: typing_extensions.Literal[ - "_num_partitions", b"_num_partitions", "num_partitions", b"num_partitions" + "_num_partitions", + b"_num_partitions", + "_start", + b"_start", + "num_partitions", + b"num_partitions", + "start", + b"start", ], ) -> builtins.bool: ... def ClearField( @@ -1034,6 +1174,8 @@ class Range(google.protobuf.message.Message): field_name: typing_extensions.Literal[ "_num_partitions", b"_num_partitions", + "_start", + b"_start", "end", b"end", "num_partitions", @@ -1044,9 +1186,14 @@ class Range(google.protobuf.message.Message): b"step", ], ) -> None: ... + @typing.overload def WhichOneof( self, oneof_group: typing_extensions.Literal["_num_partitions", b"_num_partitions"] ) -> typing_extensions.Literal["num_partitions"] | None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_start", b"_start"] + ) -> typing_extensions.Literal["start"] | None: ... global___Range = Range @@ -1060,14 +1207,14 @@ class SubqueryAlias(google.protobuf.message.Message): QUALIFIER_FIELD_NUMBER: builtins.int @property def input(self) -> global___Relation: - """Required. The input relation.""" + """(Required) The input relation of SubqueryAlias.""" alias: builtins.str - """Required. The alias.""" + """(Required) The alias.""" @property def qualifier( self, ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: - """Optional. Qualifier of the alias.""" + """(Optional) Qualifier of the alias.""" def __init__( self, *, @@ -1097,27 +1244,40 @@ class Repartition(google.protobuf.message.Message): SHUFFLE_FIELD_NUMBER: builtins.int @property def input(self) -> global___Relation: - """Required. The input relation.""" + """(Required) The input relation of Repartition.""" num_partitions: builtins.int - """Required. Must be positive.""" + """(Required) Must be positive.""" shuffle: builtins.bool - """Optional. Default value is false.""" + """(Optional) Default value is false.""" def __init__( self, *, input: global___Relation | None = ..., num_partitions: builtins.int = ..., - shuffle: builtins.bool = ..., + shuffle: builtins.bool | None = ..., ) -> None: ... def HasField( - self, field_name: typing_extensions.Literal["input", b"input"] + self, + field_name: typing_extensions.Literal[ + "_shuffle", b"_shuffle", "input", b"input", "shuffle", b"shuffle" + ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "input", b"input", "num_partitions", b"num_partitions", "shuffle", b"shuffle" + "_shuffle", + b"_shuffle", + "input", + b"input", + "num_partitions", + b"num_partitions", + "shuffle", + b"shuffle", ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_shuffle", b"_shuffle"] + ) -> typing_extensions.Literal["shuffle"] | None: ... global___Repartition = Repartition @@ -1147,38 +1307,16 @@ class ShowString(google.protobuf.message.Message): self, *, input: global___Relation | None = ..., - numRows: builtins.int | None = ..., - truncate: builtins.int | None = ..., - vertical: builtins.bool | None = ..., + numRows: builtins.int = ..., + truncate: builtins.int = ..., + vertical: builtins.bool = ..., ) -> None: ... def HasField( - self, - field_name: typing_extensions.Literal[ - "_numRows", - b"_numRows", - "_truncate", - b"_truncate", - "_vertical", - b"_vertical", - "input", - b"input", - "numRows", - b"numRows", - "truncate", - b"truncate", - "vertical", - b"vertical", - ], + self, field_name: typing_extensions.Literal["input", b"input"] ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "_numRows", - b"_numRows", - "_truncate", - b"_truncate", - "_vertical", - b"_vertical", "input", b"input", "numRows", @@ -1189,18 +1327,6 @@ class ShowString(google.protobuf.message.Message): b"vertical", ], ) -> None: ... - @typing.overload - def WhichOneof( - self, oneof_group: typing_extensions.Literal["_numRows", b"_numRows"] - ) -> typing_extensions.Literal["numRows"] | None: ... - @typing.overload - def WhichOneof( - self, oneof_group: typing_extensions.Literal["_truncate", b"_truncate"] - ) -> typing_extensions.Literal["truncate"] | None: ... - @typing.overload - def WhichOneof( - self, oneof_group: typing_extensions.Literal["_vertical", b"_vertical"] - ) -> typing_extensions.Literal["vertical"] | None: ... global___ShowString = ShowString @@ -1359,12 +1485,12 @@ class RenameColumnsBySameLengthNames(google.protobuf.message.Message): COLUMN_NAMES_FIELD_NUMBER: builtins.int @property def input(self) -> global___Relation: - """Required. The input relation.""" + """(Required) The input relation of RenameColumnsBySameLengthNames.""" @property def column_names( self, ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: - """Required. + """(Required) The number of columns of the input relation must be equal to the length of this field. If this is not true, an exception will be returned. @@ -1411,12 +1537,12 @@ class RenameColumnsByNameToNameMap(google.protobuf.message.Message): RENAME_COLUMNS_MAP_FIELD_NUMBER: builtins.int @property def input(self) -> global___Relation: - """Required. The input relation.""" + """(Required) The input relation.""" @property def rename_columns_map( self, ) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]: - """Required. + """(Required) Renaming column names of input relation from A to B where A is the map key and B is the map value. This is a no-op if schema doesn't contain any A. It --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org