This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new a668a865 feat: Add specific fuzz tests for cast and try_cast and fix NPE found during fuzz testing (#514)
a668a865 is described below
commit a668a8657a16496781075a014c6009d038c3fa1b
Author: Andy Grove <[email protected]>
AuthorDate: Tue Jun 4 11:09:32 2024 -0600
feat: Add specific fuzz tests for cast and try_cast and fix NPE found during fuzz testing (#514)
* Various improvements to fuzz testing tool
* Fix NPE in QueryPlanSerde handling of trim expression
* format
---
fuzz-testing/README.md | 5 +++--
.../main/scala/org/apache/comet/fuzz/DataGen.scala | 3 ++-
.../main/scala/org/apache/comet/fuzz/QueryGen.scala | 20 ++++++++++++++++++--
.../scala/org/apache/comet/fuzz/QueryRunner.scala | 10 ++++++++--
.../src/main/scala/org/apache/comet/fuzz/Utils.scala | 4 ++--
.../org/apache/comet/serde/QueryPlanSerde.scala | 4 ++--
6 files changed, 35 insertions(+), 11 deletions(-)
diff --git a/fuzz-testing/README.md b/fuzz-testing/README.md
index 56af359f..076ff6ae 100644
--- a/fuzz-testing/README.md
+++ b/fuzz-testing/README.md
@@ -30,8 +30,8 @@ Comet Fuzz is inspired by the [SparkFuzz](https://ir.cwi.nl/pub/30222) paper from Databricks and CWI.
Planned areas of improvement:
+- ANSI mode
- Support for all data types, expressions, and operators supported by Comet
-- Explicit casts
- Unary and binary arithmetic expressions
- IF and CASE WHEN expressions
- Complex (nested) expressions
@@ -91,7 +91,8 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.comet.exec.shuffle.enabled=true \
--conf spark.comet.exec.shuffle.mode=auto \
--jars $COMET_JAR \
- --driver-class-path $COMET_JAR \
+ --conf spark.driver.extraClassPath=$COMET_JAR \
+ --conf spark.executor.extraClassPath=$COMET_JAR \
--class org.apache.comet.fuzz.Main \
target/comet-fuzz-spark3.4_2.12-0.1.0-SNAPSHOT-jar-with-dependencies.jar \
run --num-files=2 --filename=queries.sql
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
index 47a6bd87..9f9f772b 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
@@ -50,7 +50,8 @@ object DataGen {
// generate schema using random data types
val fields = Range(0, numColumns)
- .map(i => StructField(s"c$i",
Utils.randomWeightedChoice(Meta.dataTypes), nullable = true))
+ .map(i =>
+ StructField(s"c$i", Utils.randomWeightedChoice(Meta.dataTypes, r),
nullable = true))
val schema = StructType(fields)
// generate columnar data
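The DataGen change threads the caller's `Random` into the type choice so a fixed seed reproduces the whole schema. A minimal sketch of what this schema generation does, with a hypothetical `randomType` standing in for `Utils.randomWeightedChoice(Meta.dataTypes, r)` (the real type pool and weights are not shown in this diff):

```scala
import scala.util.Random

import org.apache.spark.sql.types._

// Hypothetical stand-in for Utils.randomWeightedChoice(Meta.dataTypes, r);
// the real pool of types and weights lives in Meta.dataTypes.
def randomType(r: Random): DataType =
  Seq(IntegerType, DoubleType, StringType, DateType)(r.nextInt(4))

// Mirrors the change above: one nullable column per index, a fresh random
// type for each, all driven by the caller's Random for reproducibility.
val r = new Random(42)
val fields = Range(0, 4).map(i => StructField(s"c$i", randomType(r), nullable = true))
val schema = StructType(fields)
```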
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
index 1daa2620..7584e76c 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
@@ -42,10 +42,11 @@ object QueryGen {
val uniqueQueries = mutable.HashSet[String]()
for (_ <- 0 until numQueries) {
- val sql = r.nextInt().abs % 3 match {
+ val sql = r.nextInt().abs % 4 match {
case 0 => generateJoin(r, spark, numFiles)
case 1 => generateAggregate(r, spark, numFiles)
case 2 => generateScalar(r, spark, numFiles)
+ case 3 => generateCast(r, spark, numFiles)
}
if (!uniqueQueries.contains(sql)) {
uniqueQueries += sql
@@ -91,6 +92,21 @@ object QueryGen {
s"ORDER BY ${args.mkString(", ")};"
}
+  private def generateCast(r: Random, spark: SparkSession, numFiles: Int): String = {
+ val tableName = s"test${r.nextInt(numFiles)}"
+ val table = spark.table(tableName)
+
+ val toType = Utils.randomWeightedChoice(Meta.dataTypes, r).sql
+ val arg = Utils.randomChoice(table.columns, r)
+
+    // We test both `cast` and `try_cast` to cover LEGACY and TRY eval modes. It is not
+    // recommended to run Comet Fuzz with ANSI enabled currently.
+ // Example SELECT c0, cast(c0 as float), try_cast(c0 as float) FROM test0
+ s"SELECT $arg, cast($arg as $toType), try_cast($arg as $toType) " +
+ s"FROM $tableName " +
+ s"ORDER BY $arg;"
+ }
+
  private def generateJoin(r: Random, spark: SparkSession, numFiles: Int): String = {
val leftTableName = s"test${r.nextInt(numFiles)}"
val rightTableName = s"test${r.nextInt(numFiles)}"
@@ -101,7 +117,7 @@ object QueryGen {
val rightCol = Utils.randomChoice(rightTable.columns, r)
val joinTypes = Seq(("INNER", 0.4), ("LEFT", 0.3), ("RIGHT", 0.3))
- val joinType = Utils.randomWeightedChoice(joinTypes)
+ val joinType = Utils.randomWeightedChoice(joinTypes, r)
val leftColProjection = leftTable.columns.map(c => s"l.$c").mkString(", ")
    val rightColProjection = rightTable.columns.map(c => s"r.$c").mkString(", ")
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
index 49f9fc3b..b2ceae9d 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
@@ -19,7 +19,7 @@
package org.apache.comet.fuzz
-import java.io.{BufferedWriter, FileWriter}
+import java.io.{BufferedWriter, FileWriter, PrintWriter}
import scala.io.Source
@@ -109,7 +109,12 @@ object QueryRunner {
case e: Exception =>
          // the query worked in Spark but failed in Comet, so this is likely a bug in Comet
showSQL(w, sql)
- w.write(s"[ERROR] Query failed in Comet: ${e.getMessage}\n")
+ w.write(s"[ERROR] Query failed in Comet: ${e.getMessage}:\n")
+ w.write("```\n")
+ val p = new PrintWriter(w)
+ e.printStackTrace(p)
+ p.close()
+ w.write("```\n")
}
        // flush after every query so that results are saved in the event of the driver crashing
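On the stack-trace capture above: `printStackTrace` needs a `PrintWriter` (or `PrintStream`), so the `BufferedWriter` is wrapped to route the trace into the same report, fenced as a code block since the report is read as markdown. A minimal sketch under those assumptions; note it flushes the wrapper rather than closing it, because closing a `PrintWriter` also closes the underlying writer:

````scala
import java.io.{BufferedWriter, FileWriter, PrintWriter}

val w = new BufferedWriter(new FileWriter("report.md")) // hypothetical report file
try {
  throw new RuntimeException("boom") // stand-in for a query failing in Comet
} catch {
  case e: Exception =>
    w.write(s"[ERROR] Query failed in Comet: ${e.getMessage}:\n")
    w.write("```\n")
    val p = new PrintWriter(w) // adapt the BufferedWriter for printStackTrace
    e.printStackTrace(p)
    p.flush()                  // flush, not close: closing p would close w too
    w.write("```\n")
}
w.close()
````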
@@ -134,6 +139,7 @@ object QueryRunner {
private def formatRow(row: Row): String = {
row.toSeq
.map {
+ case null => "NULL"
case v: Array[Byte] => v.mkString
case other => other.toString
}
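The `formatRow` fix handles NULL column values explicitly: the old catch-all `case other => other.toString` also matched null and then threw the NPE. A minimal sketch of the pattern, using a plain `Seq[Any]` in place of a Spark `Row` (the separator is an assumption; the diff does not show how the mapped values are joined):

```scala
// Minimal sketch of the fix, with Seq[Any] standing in for Row.toSeq.
def formatValues(values: Seq[Any]): String =
  values.map {
    case null           => "NULL"       // without this case, null.toString throws NPE
    case v: Array[Byte] => v.mkString
    case other          => other.toString
  }.mkString(",")

assert(formatValues(Seq(1, null, "x")) == "1,NULL,x")
```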
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Utils.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Utils.scala
index 19f9695a..4d51c60e 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Utils.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Utils.scala
@@ -27,9 +27,9 @@ object Utils {
list(r.nextInt(list.length))
}
-  def randomWeightedChoice[T](valuesWithWeights: Seq[(T, Double)]): T = {
+  def randomWeightedChoice[T](valuesWithWeights: Seq[(T, Double)], r: Random): T = {
val totalWeight = valuesWithWeights.map(_._2).sum
- val randomValue = Random.nextDouble() * totalWeight
+ val randomValue = r.nextDouble() * totalWeight
var cumulativeWeight = 0.0
for ((value, weight) <- valuesWithWeights) {
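Threading an explicit `Random` through `randomWeightedChoice` (rather than the global `scala.util.Random` singleton) is what makes a seeded fuzz run reproducible end to end. A minimal sketch of the whole method; the loop body is an assumption, since the diff cuts off after the `for` line:

```scala
import scala.util.Random

def randomWeightedChoice[T](valuesWithWeights: Seq[(T, Double)], r: Random): T = {
  val totalWeight = valuesWithWeights.map(_._2).sum
  val randomValue = r.nextDouble() * totalWeight
  var cumulativeWeight = 0.0
  for ((value, weight) <- valuesWithWeights) {
    cumulativeWeight += weight
    if (randomValue < cumulativeWeight) {
      return value
    }
  }
  valuesWithWeights.last._1 // floating-point guard: fall back to the last entry
}

// Same seed, same choices: this is what makes fuzz runs repeatable.
val r = new Random(42)
randomWeightedChoice(Seq(("INNER", 0.4), ("LEFT", 0.3), ("RIGHT", 0.3)), r)
```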
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 439ec4eb..8d81b57c 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -2169,10 +2169,10 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
val trimCast = Cast(trimStr.get, StringType)
val trimExpr = exprToProtoInternal(trimCast, inputs)
val optExpr = scalarExprToProto(trimType, srcExpr, trimExpr)
- optExprWithInfo(optExpr, expr, null, srcCast, trimCast)
+ optExprWithInfo(optExpr, expr, srcCast, trimCast)
} else {
val optExpr = scalarExprToProto(trimType, srcExpr)
- optExprWithInfo(optExpr, expr, null, srcCast)
+ optExprWithInfo(optExpr, expr, srcCast)
}
}
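The `QueryPlanSerde` fix drops a literal `null` that was being passed where child expressions are expected; iterating over the arguments then dereferenced the null, which was the NPE the fuzzer found in the trim handling. A minimal sketch of the failure mode with hypothetical names (the real `optExprWithInfo` signature is not shown in this diff):

```scala
// Hypothetical illustration only; not the actual QueryPlanSerde API.
def withChildInfo(children: String*): Unit =
  children.foreach(c => println(c.length)) // dereferences each child: null => NPE

withChildInfo("srcCast", "trimCast")       // fine
withChildInfo(null, "srcCast", "trimCast") // NullPointerException, as in the old code
```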
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]