This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f4c88ca597ba [SPARK-50558][SQL] Add configurable logging limits for the number of elements in InSet and In
f4c88ca597ba is described below
commit f4c88ca597bacb14a6d3cb08b1690ea4ec5632ea
Author: Ole Sasse <[email protected]>
AuthorDate: Thu Dec 19 19:21:13 2024 +0800
[SPARK-50558][SQL] Add configurable logging limits for the number of elements in InSet and In
### What changes were proposed in this pull request?
Introduce a new SQL conf `OPTIMIZER_INSET_MEMBER_LOGGING_LIMIT` that allows
limiting the number of elements logged for In and InSet in toString. The
current implementation logs arbitrarily many elements and, in the case of
InSet, even sorts them first, which can produce a large amount of unwanted
log volume and have a significant performance impact.
If clients require all elements, they can set the conf to a very high value
or, preferably, use the `sql` function.
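For illustration, a minimal sketch of the truncated rendering, with the
expected strings taken from the tests added in this patch (the limit of 1 is
passed explicitly here; in practice it would come from the conf):

```scala
import org.apache.spark.sql.catalyst.expressions.{In, InSet, Literal}

// In delegates to truncatedString, which appends a "... N more fields" marker:
In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(1)
// => "1 IN (1,... 1 more fields)"

// InSet skips the sort once the set is larger than the limit and truncates:
InSet(Literal(1), Set(1, 2)).simpleString(1)
// => "1 INSET 1, ... 1 more fields"
```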
### Why are the changes needed?
I have observed a performance impact from this logging.
### Does this PR introduce _any_ user-facing change?
The logging string changes. Given that there is also the `sql` function,
clients should not depend on the output of the logging function.
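For example (a sketch with an assumed limit of 2; note that the truncated
form prints elements in the set's iteration order rather than sorted):

```scala
// Previously all four elements would be logged, sorted: "1 INSET 1, 2, 3, 4"
InSet(Literal(1), Set(1, 2, 3, 4)).simpleString(2)
// => "1 INSET 1, 2, ... 2 more fields"
```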
### How was this patch tested?
New tests have been added
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49177 from olaky/spark-50558-do-not-log-all-values-in-in-set-expressions.
Authored-by: Ole Sasse <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/expressions/predicates.scala | 31 ++++++++++++++++------
.../sql/catalyst/expressions/PredicateSuite.scala | 10 +++++++
2 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 5e402fa2b6ca..d8d81a9cc12f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project, Union}
import org.apache.spark.sql.catalyst.trees.TreePattern._
import org.apache.spark.sql.catalyst.util.{CollationFactory, TypeUtils}
+import org.apache.spark.sql.catalyst.util.SparkStringUtils.truncatedString
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
@@ -487,7 +488,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
}
}
- override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}"
+ override def simpleString(maxFields: Int): String =
+ s"$value IN ${truncatedString(list, "(", ",", ")", maxFields)}"
+
+ override def toString: String = simpleString(Int.MaxValue)
override def eval(input: InternalRow): Any = {
if (list.isEmpty && !legacyNullInEmptyBehavior) {
@@ -608,18 +612,29 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
require(hset != null, "hset could not be null")
- override def toString: String = {
+ override def simpleString(maxFields: Int): String = {
if (!child.resolved) {
return s"$child INSET (values with unresolved data types)"
}
- val listString = hset.toSeq
- .map(elem => Literal(elem, child.dataType).toString)
- // Sort elements for deterministic behaviours
- .sorted
- .mkString(", ")
- s"$child INSET $listString"
+ if (hset.size <= maxFields) {
+ val listString = hset.toSeq
+ .map(elem => Literal(elem, child.dataType).toString)
+ // Sort elements for deterministic behaviours
+ .sorted
+ .mkString(", ")
+ s"$child INSET $listString"
+ } else {
+ // Skip sorting if there are many elements. Do not use truncatedString
+ // because we would have to convert elements we do not print to Literals.
+ val listString = hset.take(maxFields).toSeq
+ .map(elem => Literal(elem, child.dataType).toString)
+ .mkString(", ")
+ s"$child INSET $listString, ... ${hset.size - maxFields} more fields"
+ }
}
+ override def toString: String = simpleString(Int.MaxValue)
+
@transient private[this] lazy val hasNull: Boolean = hset.contains(null)
@transient private[this] lazy val isNaN: Any => Boolean = child.dataType match {
case DoubleType => (value: Any) => java.lang.Double.isNaN(value.asInstanceOf[java.lang.Double])
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index a0c75b703ade..4a7bf807d1de 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -674,4 +674,14 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
checkInAndInSet(In(Literal(Double.NaN),
Seq(Literal(Double.NaN), Literal(2d), Literal.create(null, DoubleType))), true)
}
+
+ test("In and InSet logging limits") {
+ assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(1)
+ === "1 IN (1,... 1 more fields)")
+ assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(2) === "1 IN (1,2)")
+ assert(In(Literal(1), Seq(Literal(1))).simpleString(1) === "1 IN (1)")
+ assert(InSet(Literal(1), Set(1, 2)).simpleString(1) === "1 INSET 1, ... 1 more fields")
+ assert(InSet(Literal(1), Set(1, 2)).simpleString(2) === "1 INSET 1, 2")
+ assert(InSet(Literal(1), Set(1)).simpleString(1) === "1 INSET 1")
+ }
}