This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new fcf7d5b4 tests: Move random data generation methods from
CometCastSuite to new DataGenerator class (#426)
fcf7d5b4 is described below
commit fcf7d5b4e03d0b272194e4269b509d1c47854c9b
Author: Andy Grove <[email protected]>
AuthorDate: Wed May 15 12:54:30 2024 -0600
tests: Move random data generation methods from CometCastSuite to new
DataGenerator class (#426)
* add new DataGenerator class
* move more data gen methods
* ignore some failing tests, update compat docs
* address feedback
* fix regression
---
docs/source/user-guide/compatibility.md | 8 +-
.../org/apache/comet/expressions/CometCast.scala | 2 +-
.../scala/org/apache/comet/CometCastSuite.scala | 122 +++++----------------
.../org/apache/comet/CometExpressionSuite.scala | 4 +-
.../scala/org/apache/comet/DataGenerator.scala | 98 +++++++++++++++++
5 files changed, 136 insertions(+), 98 deletions(-)
diff --git a/docs/source/user-guide/compatibility.md
b/docs/source/user-guide/compatibility.md
index a4ed9289..278edb84 100644
--- a/docs/source/user-guide/compatibility.md
+++ b/docs/source/user-guide/compatibility.md
@@ -110,10 +110,6 @@ The following cast operations are generally compatible
with Spark except for the
| decimal | float | |
| decimal | double | |
| string | boolean | |
-| string | byte | |
-| string | short | |
-| string | integer | |
-| string | long | |
| string | binary | |
| date | string | |
| timestamp | long | |
@@ -129,6 +125,10 @@ The following cast operations are not compatible with
Spark for all inputs and a
|-|-|-|
| integer | decimal | No overflow check |
| long | decimal | No overflow check |
+| string | byte | Not all invalid inputs are detected |
+| string | short | Not all invalid inputs are detected |
+| string | integer | Not all invalid inputs are detected |
+| string | long | Not all invalid inputs are detected |
| string | timestamp | Not all valid formats are supported |
| binary | string | Only works for binary data representing valid UTF-8
strings |
diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
index 795bdb42..9c3695ba 100644
--- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
+++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
@@ -108,7 +108,7 @@ object CometCast {
Compatible()
case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType |
DataTypes.LongType =>
- Compatible()
+ Incompatible(Some("Not all invalid inputs are detected"))
case DataTypes.BinaryType =>
Compatible()
case DataTypes.FloatType | DataTypes.DoubleType =>
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
index 1881c561..50311a9b 100644
--- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -35,7 +35,11 @@ import org.apache.comet.expressions.{CometCast, Compatible}
class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
import testImplicits._
- private val dataSize = 1000
+ /** Create a data generator using a fixed seed so that tests are
reproducible */
+ private val gen = DataGenerator.DEFAULT
+
+ /** Number of random data items to generate in each test */
+ private val dataSize = 10000
// we should eventually add more whitespace chars here as documented in
//
https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#isWhitespace-char-
@@ -478,7 +482,7 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
test("cast StringType to BooleanType") {
val testValues =
(Seq("TRUE", "True", "true", "FALSE", "False", "false", "1", "0", "",
null) ++
- generateStrings("truefalseTRUEFALSEyesno10" + whitespaceChars,
8)).toDF("a")
+ gen.generateStrings(dataSize, "truefalseTRUEFALSEyesno10" +
whitespaceChars, 8)).toDF("a")
castTest(testValues, DataTypes.BooleanType)
}
@@ -515,57 +519,57 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
"9223372036854775808" // Long.MaxValue + 1
)
- test("cast StringType to ByteType") {
+ ignore("cast StringType to ByteType") {
// test with hand-picked values
castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ByteType)
// fuzz test
- castTest(generateStrings(numericPattern, 4).toDF("a"), DataTypes.ByteType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 4).toDF("a"),
DataTypes.ByteType)
}
- test("cast StringType to ShortType") {
+ ignore("cast StringType to ShortType") {
// test with hand-picked values
castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ShortType)
// fuzz test
- castTest(generateStrings(numericPattern, 5).toDF("a"), DataTypes.ShortType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 5).toDF("a"),
DataTypes.ShortType)
}
- test("cast StringType to IntegerType") {
+ ignore("cast StringType to IntegerType") {
// test with hand-picked values
castTest(castStringToIntegralInputs.toDF("a"), DataTypes.IntegerType)
// fuzz test
- castTest(generateStrings(numericPattern, 8).toDF("a"),
DataTypes.IntegerType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.IntegerType)
}
- test("cast StringType to LongType") {
+ ignore("cast StringType to LongType") {
// test with hand-picked values
castTest(castStringToIntegralInputs.toDF("a"), DataTypes.LongType)
// fuzz test
- castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.LongType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.LongType)
}
ignore("cast StringType to FloatType") {
// https://github.com/apache/datafusion-comet/issues/326
- castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.FloatType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.FloatType)
}
ignore("cast StringType to DoubleType") {
// https://github.com/apache/datafusion-comet/issues/326
- castTest(generateStrings(numericPattern, 8).toDF("a"),
DataTypes.DoubleType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.DoubleType)
}
ignore("cast StringType to DecimalType(10,2)") {
// https://github.com/apache/datafusion-comet/issues/325
- val values = generateStrings(numericPattern, 8).toDF("a")
+ val values = gen.generateStrings(dataSize, numericPattern, 8).toDF("a")
castTest(values, DataTypes.createDecimalType(10, 2))
}
test("cast StringType to BinaryType") {
- castTest(generateStrings(numericPattern, 8).toDF("a"),
DataTypes.BinaryType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.BinaryType)
}
ignore("cast StringType to DateType") {
// https://github.com/apache/datafusion-comet/issues/327
- castTest(generateStrings(datePattern, 8).toDF("a"), DataTypes.DateType)
+ castTest(gen.generateStrings(dataSize, datePattern, 8).toDF("a"),
DataTypes.DateType)
}
test("cast StringType to TimestampType disabled by default") {
@@ -581,7 +585,10 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
ignore("cast StringType to TimestampType") {
// https://github.com/apache/datafusion-comet/issues/328
withSQLConf((CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true")) {
- val values = Seq("2020-01-01T12:34:56.123456", "T2") ++
generateStrings(timestampPattern, 8)
+ val values = Seq("2020-01-01T12:34:56.123456", "T2") ++
gen.generateStrings(
+ dataSize,
+ timestampPattern,
+ 8)
castTest(values.toDF("a"), DataTypes.TimestampType)
}
}
@@ -630,7 +637,7 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
}
test("cast BinaryType to StringType - valid UTF-8 inputs") {
- castTest(generateStrings(numericPattern, 8).toDF("a"),
DataTypes.StringType)
+ castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"),
DataTypes.StringType)
}
// CAST from DateType
@@ -739,35 +746,11 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
}
private def generateFloats(): DataFrame = {
- val r = new Random(0)
- val values = Seq(
- Float.MaxValue,
- Float.MinPositiveValue,
- Float.MinValue,
- Float.NaN,
- Float.PositiveInfinity,
- Float.NegativeInfinity,
- 1.0f,
- -1.0f,
- Short.MinValue.toFloat,
- Short.MaxValue.toFloat,
- 0.0f) ++
- Range(0, dataSize).map(_ => r.nextFloat())
- withNulls(values).toDF("a")
+ withNulls(gen.generateFloats(dataSize)).toDF("a")
}
private def generateDoubles(): DataFrame = {
- val r = new Random(0)
- val values = Seq(
- Double.MaxValue,
- Double.MinPositiveValue,
- Double.MinValue,
- Double.NaN,
- Double.PositiveInfinity,
- Double.NegativeInfinity,
- 0.0d) ++
- Range(0, dataSize).map(_ => r.nextDouble())
- withNulls(values).toDF("a")
+ withNulls(gen.generateDoubles(dataSize)).toDF("a")
}
private def generateBools(): DataFrame = {
@@ -775,31 +758,19 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
}
private def generateBytes(): DataFrame = {
- val r = new Random(0)
- val values = Seq(Byte.MinValue, Byte.MaxValue) ++
- Range(0, dataSize).map(_ => r.nextInt().toByte)
- withNulls(values).toDF("a")
+ withNulls(gen.generateBytes(dataSize)).toDF("a")
}
private def generateShorts(): DataFrame = {
- val r = new Random(0)
- val values = Seq(Short.MinValue, Short.MaxValue) ++
- Range(0, dataSize).map(_ => r.nextInt().toShort)
- withNulls(values).toDF("a")
+ withNulls(gen.generateShorts(dataSize)).toDF("a")
}
private def generateInts(): DataFrame = {
- val r = new Random(0)
- val values = Seq(Int.MinValue, Int.MaxValue) ++
- Range(0, dataSize).map(_ => r.nextInt())
- withNulls(values).toDF("a")
+ withNulls(gen.generateInts(dataSize)).toDF("a")
}
private def generateLongs(): DataFrame = {
- val r = new Random(0)
- val values = Seq(Long.MinValue, Long.MaxValue) ++
- Range(0, dataSize).map(_ => r.nextLong())
- withNulls(values).toDF("a")
+ withNulls(gen.generateLongs(dataSize)).toDF("a")
}
private def generateDecimalsPrecision10Scale2(): DataFrame = {
@@ -864,17 +835,6 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
.drop("str")
}
- private def generateString(r: Random, chars: String, maxLen: Int): String = {
- val len = r.nextInt(maxLen)
- Range(0, len).map(_ => chars.charAt(r.nextInt(chars.length))).mkString
- }
-
- // TODO return DataFrame for consistency with other generators and include
null values
- private def generateStrings(chars: String, maxLen: Int): Seq[String] = {
- val r = new Random(0)
- Range(0, dataSize).map(_ => generateString(r, chars, maxLen))
- }
-
private def generateBinary(): DataFrame = {
val r = new Random(0)
val bytes = new Array[Byte](8)
@@ -907,28 +867,6 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
}
}
- // TODO Commented out to work around scalafix since this is currently unused.
- // private def castFallbackTestTimezone(
- // input: DataFrame,
- // toType: DataType,
- // expectedMessage: String): Unit = {
- // withTempPath { dir =>
- // val data = roundtripParquet(input, dir).coalesce(1)
- // data.createOrReplaceTempView("t")
- //
- // withSQLConf(
- // (SQLConf.ANSI_ENABLED.key, "false"),
- // (CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true"),
- // (SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Los_Angeles")) {
- // val df = data.withColumn("converted", col("a").cast(toType))
- // df.collect()
- // val str =
- // new
ExtendedExplainInfo().generateExtendedInfo(df.queryExecution.executedPlan)
- // assert(str.contains(expectedMessage))
- // }
- // }
- // }
-
private def castTimestampTest(input: DataFrame, toType: DataType) = {
withTempPath { dir =>
val data = roundtripParquet(input, dir).coalesce(1)
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 43014f63..dbb46e07 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -942,7 +942,9 @@ class CometExpressionSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
test("Chr") {
Seq(false, true).foreach { dictionary =>
- withSQLConf("parquet.enable.dictionary" -> dictionary.toString) {
+ withSQLConf(
+ "parquet.enable.dictionary" -> dictionary.toString,
+ CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key -> "true") {
val table = "test"
withTable(table) {
sql(s"create table $table(col varchar(20)) using parquet")
diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala
b/spark/src/test/scala/org/apache/comet/DataGenerator.scala
new file mode 100644
index 00000000..691a371b
--- /dev/null
+++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import scala.util.Random
+
+object DataGenerator {
+ // note that we use `def` rather than `val` intentionally here so that
+ // each test suite starts with a fresh data generator to help ensure
+ // that tests are deterministic
+ def DEFAULT = new DataGenerator(new Random(42))
+}
+
+class DataGenerator(r: Random) {
+
+ /** Generate a random string using the specified characters */
+ def generateString(chars: String, maxLen: Int): String = {
+ val len = r.nextInt(maxLen)
+ Range(0, len).map(_ => chars.charAt(r.nextInt(chars.length))).mkString
+ }
+
+ /** Generate random strings */
+ def generateStrings(n: Int, maxLen: Int): Seq[String] = {
+ Range(0, n).map(_ => r.nextString(maxLen))
+ }
+
+ /** Generate random strings using the specified characters */
+ def generateStrings(n: Int, chars: String, maxLen: Int): Seq[String] = {
+ Range(0, n).map(_ => generateString(chars, maxLen))
+ }
+
+ def generateFloats(n: Int): Seq[Float] = {
+ Seq(
+ Float.MaxValue,
+ Float.MinPositiveValue,
+ Float.MinValue,
+ Float.NaN,
+ Float.PositiveInfinity,
+ Float.NegativeInfinity,
+ 1.0f,
+ -1.0f,
+ Short.MinValue.toFloat,
+ Short.MaxValue.toFloat,
+ 0.0f) ++
+ Range(0, n).map(_ => r.nextFloat())
+ }
+
+ def generateDoubles(n: Int): Seq[Double] = {
+ Seq(
+ Double.MaxValue,
+ Double.MinPositiveValue,
+ Double.MinValue,
+ Double.NaN,
+ Double.PositiveInfinity,
+ Double.NegativeInfinity,
+ 0.0d) ++
+ Range(0, n).map(_ => r.nextDouble())
+ }
+
+ def generateBytes(n: Int): Seq[Byte] = {
+ Seq(Byte.MinValue, Byte.MaxValue) ++
+ Range(0, n).map(_ => r.nextInt().toByte)
+ }
+
+ def generateShorts(n: Int): Seq[Short] = {
+ val r = new Random(0)
+ Seq(Short.MinValue, Short.MaxValue) ++
+ Range(0, n).map(_ => r.nextInt().toShort)
+ }
+
+ def generateInts(n: Int): Seq[Int] = {
+ Seq(Int.MinValue, Int.MaxValue) ++
+ Range(0, n).map(_ => r.nextInt())
+ }
+
+ def generateLongs(n: Int): Seq[Long] = {
+ Seq(Long.MinValue, Long.MaxValue) ++
+ Range(0, n).map(_ => r.nextLong())
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]