grundprinzip commented on code in PR #38686:
URL: https://github.com/apache/spark/pull/38686#discussion_r1027094335
##########
python/pyspark/sql/connect/dataframe.py:
##########
@@ -255,10 +255,21 @@ def distinct(self) -> "DataFrame":
)
def drop(self, *cols: "ColumnOrString") -> "DataFrame":
Review Comment:
This is an interesting case where one could argue for implementing the
behavior on the client side instead of the server.
##########
connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala:
##########
@@ -523,6 +524,19 @@ class SparkConnectPlanner(session: SparkSession) {
sameOrderExpressions = Seq.empty)
}
+ private def transformDrop(rel: proto.Drop): LogicalPlan = {
+ assert(rel.getColsCount > 0, s"cols must contains at least 1 item!")
+
+ val cols = rel.getColsList.asScala.toArray.map { expr =>
+ Column(transformExpression(expr))
Review Comment:
This should verify supported types.
##########
connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala:
##########
@@ -148,6 +148,23 @@ class SparkConnectProtoSuite extends PlanTest with
SparkConnectPlanTest {
comparePlans(connectPlan2, sparkPlan2)
}
+ test("SPARK-41169: Test drop") {
+ // single column
+ val connectPlan = connectTestRelation.drop("id")
+ val sparkPlan = sparkTestRelation.drop("id")
+ comparePlans(connectPlan, sparkPlan)
+
+ // all columns
+ val connectPlan2 = connectTestRelation.drop("id", "name")
+ val sparkPlan2 = sparkTestRelation.drop("id", "name")
+ comparePlans(connectPlan2, sparkPlan2)
+
+ // non-existing column
Review Comment:
If you treat the dropped columns as expressions we need to add a negative
test for unsupported expressions
##########
connector/connect/src/main/protobuf/spark/connect/relations.proto:
##########
@@ -203,6 +204,19 @@ message Sort {
}
}
+
+// Drop specified columns.
+message Drop {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) columns to drop.
+ //
+ // Should contain at least 1 item.
+ repeated Expression cols = 2;
Review Comment:
Wondering if the name should be more explicit like "dropped"?
##########
connector/connect/src/main/protobuf/spark/connect/relations.proto:
##########
@@ -203,6 +204,19 @@ message Sort {
}
}
+
+// Drop specified columns.
+message Drop {
+ // (Required) The input relation.
+ Relation input = 1;
+
+ // (Required) columns to drop.
+ //
+ // Should contain at least 1 item.
+ repeated Expression cols = 2;
Review Comment:
Does drop actually support arbitrary expressions? Shouldn't this be a
repeated unresolved attribute?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]