This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d9cf6de0a02 [SPARK-41403][CONNECT][PYTHON] Implement `DataFrame.describe`
d9cf6de0a02 is described below
commit d9cf6de0a02e7b22b4ef961e1787878f960e0a64
Author: Jiaan Geng <[email protected]>
AuthorDate: Wed Dec 7 16:24:21 2022 +0800
[SPARK-41403][CONNECT][PYTHON] Implement `DataFrame.describe`
### What changes were proposed in this pull request?
Implement `DataFrame.describe` backed by a new proto message:
1. Implement `DataFrame.describe` for the Scala API
2. Implement `DataFrame.describe` for the Python API (see the usage sketch below)
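As a usage sketch (hedged: `spark` stands for an already-connected Spark Connect session, and the table and column names are hypothetical):

    # `spark` is assumed to be a Spark Connect session; "users" is a
    # hypothetical table. describe() returns a new DataFrame of statistics.
    df = spark.read.table("users")
    df.describe().show()                 # stats for all numeric/string columns
    df.describe("age", "name").show()    # stats for the named columns only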
### Why are the changes needed?
For Spark Connect API coverage.
### Does this PR introduce _any_ user-facing change?
'No'. This is a new API.
### How was this patch tested?
New test cases.
Closes #38938 from beliefer/SPARK-41403.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../main/protobuf/spark/connect/relations.proto | 12 ++
.../org/apache/spark/sql/connect/dsl/package.scala | 12 ++
.../sql/connect/planner/SparkConnectPlanner.scala | 8 +
.../connect/planner/SparkConnectProtoSuite.scala | 6 +
python/pyspark/sql/connect/dataframe.py | 37 ++++-
python/pyspark/sql/connect/plan.py | 28 ++++
python/pyspark/sql/connect/proto/relations_pb2.py | 178 +++++++++++----------
python/pyspark/sql/connect/proto/relations_pb2.pyi | 42 +++++
.../sql/tests/connect/test_connect_plan_only.py | 11 ++
9 files changed, 249 insertions(+), 85 deletions(-)
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
index bb891d63670..ece8767c06c 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -63,6 +63,7 @@ message Relation {
// stat functions
StatSummary summary = 100;
StatCrosstab crosstab = 101;
+ StatDescribe describe = 102;
Unknown unknown = 999;
}
@@ -404,6 +405,17 @@ message StatSummary {
repeated string statistics = 2;
}
+// Computes basic statistics for numeric and string columns, including count, mean, stddev, min,
+// and max. If no columns are given, this function computes statistics for all numerical or
+// string columns.
+message StatDescribe {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Optional) Columns to compute statistics on.
+ repeated string cols = 2;
+}
+
// Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
// It will invoke 'Dataset.stat.crosstab' (same as 'StatFunctions.crossTabulate')
// to compute the results.
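To sanity-check the new message from the generated Python bindings, a minimal sketch (the empty input Relation is only a placeholder for a real child plan, and the column names are illustrative):

    from pyspark.sql.connect.proto import relations_pb2 as proto

    rel = proto.Relation()
    rel.describe.input.CopyFrom(proto.Relation())  # (Required) input; empty placeholder here
    rel.describe.cols.extend(["col_a", "col_b"])   # (Optional) columns to describe
    assert rel.WhichOneof("rel_type") == "describe"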
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
index ec2d0cad95b..44baf407816 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
@@ -651,6 +651,18 @@ package object dsl {
.build()
}
+ def describe(cols: String*): Relation = {
+ Relation
+ .newBuilder()
+ .setDescribe(
+ proto.StatDescribe
+ .newBuilder()
+ .setInput(logicalPlan)
+ .addAllCols(cols.toSeq.asJava)
+ .build())
+ .build()
+ }
+
def toDF(columnNames: String*): Relation =
Relation
.newBuilder()
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
index d8b7843fbe7..af5d9abc515 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -86,6 +86,7 @@ class SparkConnectPlanner(session: SparkSession) {
case proto.Relation.RelTypeCase.DROP_NA => transformNADrop(rel.getDropNa)
case proto.Relation.RelTypeCase.REPLACE => transformReplace(rel.getReplace)
case proto.Relation.RelTypeCase.SUMMARY => transformStatSummary(rel.getSummary)
+ case proto.Relation.RelTypeCase.DESCRIBE => transformStatDescribe(rel.getDescribe)
case proto.Relation.RelTypeCase.CROSSTAB => transformStatCrosstab(rel.getCrosstab)
case proto.Relation.RelTypeCase.RENAME_COLUMNS_BY_SAME_LENGTH_NAMES =>
@@ -253,6 +254,13 @@ class SparkConnectPlanner(session: SparkSession) {
.logicalPlan
}
+ private def transformStatDescribe(rel: proto.StatDescribe): LogicalPlan = {
+ Dataset
+ .ofRows(session, transformRelation(rel.getInput))
+ .describe(rel.getColsList.asScala.toSeq: _*)
+ .logicalPlan
+ }
+
private def transformStatCrosstab(rel: proto.StatCrosstab): LogicalPlan = {
Dataset
.ofRows(session, transformRelation(rel.getInput))
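Server-side, the new case just delegates to the existing `Dataset.describe`, so Connect results should match classic Spark. For intuition, a plain (non-Connect) PySpark equivalent with illustrative data:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
    # Produces one row per statistic: count, mean, stddev, min, max.
    df.describe("id", "name").show()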
diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
index 074372b6c8d..6d36ea9a630 100644
--- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
+++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
@@ -375,6 +375,12 @@ class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest {
sparkTestRelation.summary("count", "mean", "stddev"))
}
+ test("Test describe") {
+ comparePlans(
+ connectTestRelation.describe("id", "name"),
+ sparkTestRelation.describe("id", "name"))
+ }
+
test("Test crosstab") {
comparePlans(
connectTestRelation.stat.crosstab("id", "name"),
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 482a4e3d5b5..1e1b5dbff21 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -392,9 +392,6 @@ class DataFrame(object):
self._session,
)
- def describe(self, cols: List[Column]) -> Any:
- ...
-
def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
"""Return a new :class:`DataFrame` with duplicate rows removed,
optionally only deduplicating based on certain columns.
@@ -1328,6 +1325,40 @@ class DataFrame(object):
session=self._session,
)
+ def describe(self, *cols: str) -> "DataFrame":
+ """Computes basic statistics for numeric and string columns.
+
+ .. versionadded:: 3.4.0
+
+ This includes count, mean, stddev, min, and max. If no columns are
+ given, this function computes statistics for all numerical or string columns.
+
+ Notes
+ -----
+ This function is meant for exploratory data analysis, as we make no
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.
+ Use summary for expanded statistics and control over which statistics to compute.
+
+ Parameters
+ ----------
+ cols : str, list, optional
+ Column name or list of column names to describe by (default: all columns).
+
+ Returns
+ -------
+ :class:`DataFrame`
+ A new DataFrame that describes (provides statistics for) the given DataFrame.
+ """
+ _cols: List[str] = list(cols)
+ for s in _cols:
+ if not isinstance(s, str):
+ raise TypeError(f"'cols' must be list[str], but got {type(s).__name__}")
+ return DataFrame.withPlan(
+ plan.StatDescribe(child=self._plan, cols=_cols),
+ session=self._session,
+ )
+
def crosstab(self, col1: str, col2: str) -> "DataFrame":
"""
Computes a pair-wise frequency table of the given columns. Also known as a contingency
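Note that the argument check in the new `describe` runs on the client before any plan is sent, so a wrong type fails fast; a hedged sketch, where `df` stands for any Connect DataFrame:

    df.describe("col_a", "col_b")   # plan with explicit columns
    df.describe()                   # plan covering all columns
    try:
        df.describe(123)            # not a str: raises client-side
    except TypeError as e:
        print(e)                    # 'cols' must be list[str], but got int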
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 6c0e559158f..2de0dbb40c3 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -1169,6 +1169,34 @@ class StatSummary(LogicalPlan):
"""
+class StatDescribe(LogicalPlan):
+ def __init__(self, child: Optional["LogicalPlan"], cols: List[str]) -> None:
+ super().__init__(child)
+ self.cols = cols
+
+ def plan(self, session: "SparkConnectClient") -> proto.Relation:
+ assert self._child is not None
+ plan = proto.Relation()
+ plan.describe.input.CopyFrom(self._child.plan(session))
+ plan.describe.cols.extend(self.cols)
+ return plan
+
+ def print(self, indent: int = 0) -> str:
+ i = " " * indent
+ return f"""{i}<Describe cols='{self.cols}'>"""
+
+ def _repr_html_(self) -> str:
+ return f"""
+ <ul>
+ <li>
+ <b>Describe</b><br />
+ Cols: {self.cols} <br />
+ {self._child_repr_()}
+ </li>
+ </ul>
+ """
+
+
class StatCrosstab(LogicalPlan):
def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str) -> None:
super().__init__(child)
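Like the other stat nodes, `StatDescribe.plan` fills the `describe` oneof on a fresh Relation. A minimal sketch of driving it directly, where `child` and `client` are placeholders for a parent LogicalPlan and a SparkConnectClient (compare the plan-only test below):

    from pyspark.sql.connect.plan import StatDescribe

    node = StatDescribe(child=child, cols=["col_a", "col_b"])
    rel = node.plan(client)            # a proto.Relation with `describe` set
    assert list(rel.describe.cols) == ["col_a", "col_b"]
    print(node.print())                # <Describe cols='['col_a', 'col_b']'>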
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py
index 47be485b546..06cf18417d2 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.py
+++ b/python/pyspark/sql/connect/proto/relations_pb2.py
@@ -33,7 +33,7 @@ from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_e
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
- b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xd0\r\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\ [...]
+ b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\x8b\x0e\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0 [...]
)
@@ -64,6 +64,7 @@ _SUBQUERYALIAS = DESCRIPTOR.message_types_by_name["SubqueryAlias"]
_REPARTITION = DESCRIPTOR.message_types_by_name["Repartition"]
_SHOWSTRING = DESCRIPTOR.message_types_by_name["ShowString"]
_STATSUMMARY = DESCRIPTOR.message_types_by_name["StatSummary"]
+_STATDESCRIBE = DESCRIPTOR.message_types_by_name["StatDescribe"]
_STATCROSSTAB = DESCRIPTOR.message_types_by_name["StatCrosstab"]
_NAFILL = DESCRIPTOR.message_types_by_name["NAFill"]
_NADROP = DESCRIPTOR.message_types_by_name["NADrop"]
@@ -373,6 +374,17 @@ StatSummary = _reflection.GeneratedProtocolMessageType(
)
_sym_db.RegisterMessage(StatSummary)
+StatDescribe = _reflection.GeneratedProtocolMessageType(
+ "StatDescribe",
+ (_message.Message,),
+ {
+ "DESCRIPTOR": _STATDESCRIBE,
+ "__module__": "spark.connect.relations_pb2"
+ # @@protoc_insertion_point(class_scope:spark.connect.StatDescribe)
+ },
+)
+_sym_db.RegisterMessage(StatDescribe)
+
StatCrosstab = _reflection.GeneratedProtocolMessageType(
"StatCrosstab",
(_message.Message,),
@@ -490,85 +502,87 @@ if _descriptor._USE_C_DESCRIPTORS == False:
_RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._options = None
_RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_options = b"8\001"
_RELATION._serialized_start = 82
- _RELATION._serialized_end = 1826
- _UNKNOWN._serialized_start = 1828
- _UNKNOWN._serialized_end = 1837
- _RELATIONCOMMON._serialized_start = 1839
- _RELATIONCOMMON._serialized_end = 1888
- _SQL._serialized_start = 1890
- _SQL._serialized_end = 1917
- _READ._serialized_start = 1920
- _READ._serialized_end = 2346
- _READ_NAMEDTABLE._serialized_start = 2062
- _READ_NAMEDTABLE._serialized_end = 2123
- _READ_DATASOURCE._serialized_start = 2126
- _READ_DATASOURCE._serialized_end = 2333
- _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 2264
- _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2322
- _PROJECT._serialized_start = 2348
- _PROJECT._serialized_end = 2465
- _FILTER._serialized_start = 2467
- _FILTER._serialized_end = 2579
- _JOIN._serialized_start = 2582
- _JOIN._serialized_end = 3053
- _JOIN_JOINTYPE._serialized_start = 2845
- _JOIN_JOINTYPE._serialized_end = 3053
- _SETOPERATION._serialized_start = 3056
- _SETOPERATION._serialized_end = 3452
- _SETOPERATION_SETOPTYPE._serialized_start = 3315
- _SETOPERATION_SETOPTYPE._serialized_end = 3429
- _LIMIT._serialized_start = 3454
- _LIMIT._serialized_end = 3530
- _OFFSET._serialized_start = 3532
- _OFFSET._serialized_end = 3611
- _TAIL._serialized_start = 3613
- _TAIL._serialized_end = 3688
- _AGGREGATE._serialized_start = 3691
- _AGGREGATE._serialized_end = 3901
- _SORT._serialized_start = 3904
- _SORT._serialized_end = 4454
- _SORT_SORTFIELD._serialized_start = 4058
- _SORT_SORTFIELD._serialized_end = 4246
- _SORT_SORTDIRECTION._serialized_start = 4248
- _SORT_SORTDIRECTION._serialized_end = 4356
- _SORT_SORTNULLS._serialized_start = 4358
- _SORT_SORTNULLS._serialized_end = 4440
- _DROP._serialized_start = 4456
- _DROP._serialized_end = 4556
- _DEDUPLICATE._serialized_start = 4559
- _DEDUPLICATE._serialized_end = 4730
- _LOCALRELATION._serialized_start = 4732
- _LOCALRELATION._serialized_end = 4767
- _SAMPLE._serialized_start = 4770
- _SAMPLE._serialized_end = 4994
- _RANGE._serialized_start = 4997
- _RANGE._serialized_end = 5142
- _SUBQUERYALIAS._serialized_start = 5144
- _SUBQUERYALIAS._serialized_end = 5258
- _REPARTITION._serialized_start = 5261
- _REPARTITION._serialized_end = 5403
- _SHOWSTRING._serialized_start = 5406
- _SHOWSTRING._serialized_end = 5547
- _STATSUMMARY._serialized_start = 5549
- _STATSUMMARY._serialized_end = 5641
- _STATCROSSTAB._serialized_start = 5643
- _STATCROSSTAB._serialized_end = 5744
- _NAFILL._serialized_start = 5747
- _NAFILL._serialized_end = 5881
- _NADROP._serialized_start = 5884
- _NADROP._serialized_end = 6018
- _NAREPLACE._serialized_start = 6021
- _NAREPLACE._serialized_end = 6317
- _NAREPLACE_REPLACEMENT._serialized_start = 6176
- _NAREPLACE_REPLACEMENT._serialized_end = 6317
- _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 6319
- _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 6433
- _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 6436
- _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 6695
- _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 6628
- _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 6695
- _WITHCOLUMNS._serialized_start = 6698
- _WITHCOLUMNS._serialized_end = 6829
- _HINT._serialized_start = 6832
- _HINT._serialized_end = 6972
+ _RELATION._serialized_end = 1885
+ _UNKNOWN._serialized_start = 1887
+ _UNKNOWN._serialized_end = 1896
+ _RELATIONCOMMON._serialized_start = 1898
+ _RELATIONCOMMON._serialized_end = 1947
+ _SQL._serialized_start = 1949
+ _SQL._serialized_end = 1976
+ _READ._serialized_start = 1979
+ _READ._serialized_end = 2405
+ _READ_NAMEDTABLE._serialized_start = 2121
+ _READ_NAMEDTABLE._serialized_end = 2182
+ _READ_DATASOURCE._serialized_start = 2185
+ _READ_DATASOURCE._serialized_end = 2392
+ _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 2323
+ _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2381
+ _PROJECT._serialized_start = 2407
+ _PROJECT._serialized_end = 2524
+ _FILTER._serialized_start = 2526
+ _FILTER._serialized_end = 2638
+ _JOIN._serialized_start = 2641
+ _JOIN._serialized_end = 3112
+ _JOIN_JOINTYPE._serialized_start = 2904
+ _JOIN_JOINTYPE._serialized_end = 3112
+ _SETOPERATION._serialized_start = 3115
+ _SETOPERATION._serialized_end = 3511
+ _SETOPERATION_SETOPTYPE._serialized_start = 3374
+ _SETOPERATION_SETOPTYPE._serialized_end = 3488
+ _LIMIT._serialized_start = 3513
+ _LIMIT._serialized_end = 3589
+ _OFFSET._serialized_start = 3591
+ _OFFSET._serialized_end = 3670
+ _TAIL._serialized_start = 3672
+ _TAIL._serialized_end = 3747
+ _AGGREGATE._serialized_start = 3750
+ _AGGREGATE._serialized_end = 3960
+ _SORT._serialized_start = 3963
+ _SORT._serialized_end = 4513
+ _SORT_SORTFIELD._serialized_start = 4117
+ _SORT_SORTFIELD._serialized_end = 4305
+ _SORT_SORTDIRECTION._serialized_start = 4307
+ _SORT_SORTDIRECTION._serialized_end = 4415
+ _SORT_SORTNULLS._serialized_start = 4417
+ _SORT_SORTNULLS._serialized_end = 4499
+ _DROP._serialized_start = 4515
+ _DROP._serialized_end = 4615
+ _DEDUPLICATE._serialized_start = 4618
+ _DEDUPLICATE._serialized_end = 4789
+ _LOCALRELATION._serialized_start = 4791
+ _LOCALRELATION._serialized_end = 4826
+ _SAMPLE._serialized_start = 4829
+ _SAMPLE._serialized_end = 5053
+ _RANGE._serialized_start = 5056
+ _RANGE._serialized_end = 5201
+ _SUBQUERYALIAS._serialized_start = 5203
+ _SUBQUERYALIAS._serialized_end = 5317
+ _REPARTITION._serialized_start = 5320
+ _REPARTITION._serialized_end = 5462
+ _SHOWSTRING._serialized_start = 5465
+ _SHOWSTRING._serialized_end = 5606
+ _STATSUMMARY._serialized_start = 5608
+ _STATSUMMARY._serialized_end = 5700
+ _STATDESCRIBE._serialized_start = 5702
+ _STATDESCRIBE._serialized_end = 5783
+ _STATCROSSTAB._serialized_start = 5785
+ _STATCROSSTAB._serialized_end = 5886
+ _NAFILL._serialized_start = 5889
+ _NAFILL._serialized_end = 6023
+ _NADROP._serialized_start = 6026
+ _NADROP._serialized_end = 6160
+ _NAREPLACE._serialized_start = 6163
+ _NAREPLACE._serialized_end = 6459
+ _NAREPLACE_REPLACEMENT._serialized_start = 6318
+ _NAREPLACE_REPLACEMENT._serialized_end = 6459
+ _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 6461
+ _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 6575
+ _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 6578
+ _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 6837
+ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 6770
+ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 6837
+ _WITHCOLUMNS._serialized_start = 6840
+ _WITHCOLUMNS._serialized_end = 6971
+ _HINT._serialized_start = 6974
+ _HINT._serialized_end = 7114
# @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi
index 996afbedcdc..f1336613687 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -88,6 +88,7 @@ class Relation(google.protobuf.message.Message):
REPLACE_FIELD_NUMBER: builtins.int
SUMMARY_FIELD_NUMBER: builtins.int
CROSSTAB_FIELD_NUMBER: builtins.int
+ DESCRIBE_FIELD_NUMBER: builtins.int
UNKNOWN_FIELD_NUMBER: builtins.int
@property
def common(self) -> global___RelationCommon: ...
@@ -150,6 +151,8 @@ class Relation(google.protobuf.message.Message):
@property
def crosstab(self) -> global___StatCrosstab: ...
@property
+ def describe(self) -> global___StatDescribe: ...
+ @property
def unknown(self) -> global___Unknown: ...
def __init__(
self,
@@ -183,6 +186,7 @@ class Relation(google.protobuf.message.Message):
replace: global___NAReplace | None = ...,
summary: global___StatSummary | None = ...,
crosstab: global___StatCrosstab | None = ...,
+ describe: global___StatDescribe | None = ...,
unknown: global___Unknown | None = ...,
) -> None: ...
def HasField(
@@ -196,6 +200,8 @@ class Relation(google.protobuf.message.Message):
b"crosstab",
"deduplicate",
b"deduplicate",
+ "describe",
+ b"describe",
"drop",
b"drop",
"drop_na",
@@ -263,6 +269,8 @@ class Relation(google.protobuf.message.Message):
b"crosstab",
"deduplicate",
b"deduplicate",
+ "describe",
+ b"describe",
"drop",
b"drop",
"drop_na",
@@ -350,6 +358,7 @@ class Relation(google.protobuf.message.Message):
"replace",
"summary",
"crosstab",
+ "describe",
"unknown",
] | None: ...
@@ -1486,6 +1495,39 @@ class StatSummary(google.protobuf.message.Message):
global___StatSummary = StatSummary
+class StatDescribe(google.protobuf.message.Message):
+ """Computes basic statistics for numeric and string columns, including
count, mean, stddev, min,
+ and max. If no columns are given, this function computes statistics for
all numerical or
+ string columns.
+ """
+
+ DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+ INPUT_FIELD_NUMBER: builtins.int
+ COLS_FIELD_NUMBER: builtins.int
+ @property
+ def input(self) -> global___Relation:
+ """(Required) The input relation."""
+ @property
+ def cols(
+ self,
+ ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+ """(Optional) Columns to compute statistics on."""
+ def __init__(
+ self,
+ *,
+ input: global___Relation | None = ...,
+ cols: collections.abc.Iterable[builtins.str] | None = ...,
+ ) -> None: ...
+ def HasField(
+ self, field_name: typing_extensions.Literal["input", b"input"]
+ ) -> builtins.bool: ...
+ def ClearField(
+ self, field_name: typing_extensions.Literal["cols", b"cols", "input", b"input"]
+ ) -> None: ...
+
+global___StatDescribe = StatDescribe
+
class StatCrosstab(google.protobuf.message.Message):
"""Computes a pair-wise frequency table of the given columns. Also known
as a contingency table.
It will invoke 'Dataset.stat.crosstab' (same as
'StatFunctions.crossTabulate')
diff --git a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py
index 0d52e66afd0..b9695eea785 100644
--- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py
+++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py
@@ -183,6 +183,17 @@ class SparkConnectTestsPlanOnly(PlanOnlyTestFixture):
["count", "mean", "stddev", "min", "25%"],
)
+ def test_describe(self):
+ df = self.connect.readTable(table_name=self.tbl_name)
+ plan = df.filter(df.col_name > 3).describe()._plan.to_proto(self.connect)
+ self.assertEqual(plan.root.describe.cols, [])
+
+ plan = df.filter(df.col_name > 3).describe("col_a", "col_b")._plan.to_proto(self.connect)
+ self.assertEqual(
+ plan.root.describe.cols,
+ ["col_a", "col_b"],
+ )
+
def test_crosstab(self):
df = self.connect.readTable(table_name=self.tbl_name)
plan = df.filter(df.col_name > 3).crosstab("col_a", "col_b")._plan.to_proto(self.connect)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]