This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f4c88ca597ba [SPARK-50558][SQL] Add configurable logging limits for the number of elements in InSet and In
f4c88ca597ba is described below
commit f4c88ca597bacb14a6d3cb08b1690ea4ec5632ea
Author: Ole Sasse <[email protected]>
AuthorDate: Thu Dec 19 19:21:13 2024 +0800
[SPARK-50558][SQL] Add configurable logging limits for the number of elements in InSet and In
### What changes were proposed in this pull request?
Introduce a new SQL conf `OPTIMIZER_INSET_MEMBER_LOGGING_LIMIT` that allows
limiting the number of elements logged for In and InSet in toString. The
current implementation logs arbitrarily many elements and, in the case of
InSet, even sorts them first, which can produce a large amount of unwanted
log volume and have a significant performance impact.
If clients require all elements, they can set the conf to a very high value
or, preferably, use the `sql` function.
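For illustration, a minimal sketch of the truncated rendering, with the
expected strings taken from the tests added in this patch (the limit of 1 is
passed explicitly here; in practice it would come from the conf):

```scala
import org.apache.spark.sql.catalyst.expressions.{In, InSet, Literal}

// In delegates to truncatedString, which appends a "... N more fields" marker:
In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(1)
// => "1 IN (1,... 1 more fields)"

// InSet skips the sort once the set is larger than the limit and truncates:
InSet(Literal(1), Set(1, 2)).simpleString(1)
// => "1 INSET 1, ... 1 more fields"
```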
### Why are the changes needed?
I have observed a performance impact from this logging.
### Does this PR introduce _any_ user-facing change?
The logging string changes. Given that there is also the `sql` function,
clients should not depend on the output of the logging function.
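For example (a sketch with an assumed limit of 2; note that the truncated
form prints elements in the set's iteration order rather than sorted):

```scala
// Previously all four elements would be logged, sorted: "1 INSET 1, 2, 3, 4"
InSet(Literal(1), Set(1, 2, 3, 4)).simpleString(2)
// => "1 INSET 1, 2, ... 2 more fields"
```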
### How was this patch tested?
New tests have been added
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49177 from olaky/spark-50558-do-not-log-all-values-in-in-set-expressions.
Authored-by: Ole Sasse <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/expressions/predicates.scala | 31 ++++++++++++++++------
.../sql/catalyst/expressions/PredicateSuite.scala | 10 +++++++
2 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 5e402fa2b6ca..d8d81a9cc12f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project, Union}
import org.apache.spark.sql.catalyst.trees.TreePattern._
import org.apache.spark.sql.catalyst.util.{CollationFactory, TypeUtils}
+import org.apache.spark.sql.catalyst.util.SparkStringUtils.truncatedString
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
@@ -487,7 +488,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
}
}
- override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}"
+ override def simpleString(maxFields: Int): String =
+ s"$value IN ${truncatedString(list, "(", ",", ")", maxFields)}"
+
+ override def toString: String = simpleString(Int.MaxValue)
override def eval(input: InternalRow): Any = {
if (list.isEmpty && !legacyNullInEmptyBehavior) {
@@ -608,18 +612,29 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
require(hset != null, "hset could not be null")
- override def toString: String = {
+ override def simpleString(maxFields: Int): String = {
if (!child.resolved) {
return s"$child INSET (values with unresolved data types)"
}
- val listString = hset.toSeq
- .map(elem => Literal(elem, child.dataType).toString)
- // Sort elements for deterministic behaviours
- .sorted
- .mkString(", ")
- s"$child INSET $listString"
+ if (hset.size <= maxFields) {
+ val listString = hset.toSeq
+ .map(elem => Literal(elem, child.dataType).toString)
+ // Sort elements for deterministic behaviours
+ .sorted
+ .mkString(", ")
+ s"$child INSET $listString"
+ } else {
+ // Skip sorting if there are many elements. Do not use truncatedString
+ // because we would have to convert elements we do not print to Literals.
+ val listString = hset.take(maxFields).toSeq
+ .map(elem => Literal(elem, child.dataType).toString)
+ .mkString(", ")
+ s"$child INSET $listString, ... ${hset.size - maxFields} more fields"
+ }
}
+ override def toString: String = simpleString(Int.MaxValue)
+
@transient private[this] lazy val hasNull: Boolean = hset.contains(null)
@transient private[this] lazy val isNaN: Any => Boolean = child.dataType match {
case DoubleType => (value: Any) => java.lang.Double.isNaN(value.asInstanceOf[java.lang.Double])
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index a0c75b703ade..4a7bf807d1de 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -674,4 +674,14 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
checkInAndInSet(In(Literal(Double.NaN),
Seq(Literal(Double.NaN), Literal(2d), Literal.create(null, DoubleType))), true)
}
+
+ test("In and InSet logging limits") {
+ assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(1)
+ === "1 IN (1,... 1 more fields)")
+ assert(In(Literal(1), Seq(Literal(1), Literal(2))).simpleString(2) === "1 IN (1,2)")
+ assert(In(Literal(1), Seq(Literal(1))).simpleString(1) === "1 IN (1)")
+ assert(InSet(Literal(1), Set(1, 2)).simpleString(1) === "1 INSET 1, ... 1 more fields")
+ assert(InSet(Literal(1), Set(1, 2)).simpleString(2) === "1 INSET 1, 2")
+ assert(InSet(Literal(1), Set(1)).simpleString(1) === "1 INSET 1")
+ }
}