This is an automated email from the ASF dual-hosted git repository.
rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
     new 41bd1ce668 [GLUTEN-9106][VL] Add support for staticInvoke CharVarcharCodegenUtils. #9107 (#9107)
41bd1ce668 is described below
commit 41bd1ce668b416087ff7997eed7450af4e5af3bc
Author: iiFeung <[email protected]>
AuthorDate: Fri Oct 24 19:31:55 2025 +0800
    [GLUTEN-9106][VL] Add support for staticInvoke CharVarcharCodegenUtils. #9107 (#9107)
---
.../org/apache/gluten/utils/CHExpressionUtil.scala | 5 +-
.../functions/ScalarFunctionsValidateSuite.scala | 41 +++++
.../gluten/expression/ExpressionConverter.scala | 83 ++++++---
.../utils/clickhouse/ClickHouseTestSettings.scala | 10 ++
.../gluten/utils/velox/VeloxTestSettings.scala | 10 ++
.../spark/sql/GlutenCharVarcharTestSuite.scala | 69 +++++++-
.../utils/clickhouse/ClickHouseTestSettings.scala | 8 +
.../gluten/utils/velox/VeloxTestSettings.scala | 8 +
.../spark/sql/GlutenCharVarcharTestSuite.scala | 44 ++++-
.../utils/clickhouse/ClickHouseTestSettings.scala | 22 +++
.../gluten/utils/velox/VeloxTestSettings.scala | 21 +++
.../spark/sql/GlutenCharVarcharTestSuite.scala | 192 ++++++++++++++++++++-
.../utils/clickhouse/ClickHouseTestSettings.scala | 22 +++
.../gluten/utils/velox/VeloxTestSettings.scala | 19 ++
.../spark/sql/GlutenCharVarcharTestSuite.scala | 190 +++++++++++++++++++-
.../apache/gluten/expression/ExpressionNames.scala | 5 +
16 files changed, 712 insertions(+), 37 deletions(-)
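
Background for reviewers (not part of the patch itself): for CHAR/VARCHAR columns, Spark plans the write-side length check and the read-side padding as StaticInvoke calls against org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, and this change maps those calls to Gluten expression transformers instead of falling back to vanilla Spark. A rough, illustrative Scala sketch of the expression shape being matched (the column name and length below are made up):

    // Illustrative only: roughly what Spark builds for an INSERT into a CHAR(3) column.
    import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
    import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Literal}
    import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils
    import org.apache.spark.sql.types.StringType

    val id = AttributeReference("id", StringType)() // hypothetical input column
    val writeSideCheck = StaticInvoke(
      classOf[CharVarcharCodegenUtils],
      StringType,
      "charTypeWriteSideCheck", // the function name matched in ExpressionConverter below
      Seq(id, Literal(3)), // value to check plus the declared char length
      returnNullable = false)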
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
index 8365245b84..37308f8c0b 100644
--- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
+++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
@@ -207,6 +207,9 @@ object CHExpressionUtil {
MAKE_DATE -> DefaultValidator(),
ARRAY_APPEND -> DefaultValidator(),
JSON_OBJECT_KEYS -> DefaultValidator(),
- LUHN_CHECK -> DefaultValidator()
+ LUHN_CHECK -> DefaultValidator(),
+ VARCHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
+ CHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
+ READ_SIDE_PADDING -> DefaultValidator()
)
}
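
A note on the ClickHouse change above (my reading, not stated in the patch): listing a function here with DefaultValidator() marks it as not offloadable on the ClickHouse backend, so these three expressions keep falling back to vanilla Spark there, which is why the ClickHouse test settings further down exclude the rewritten char/varchar tests. A generic, self-contained sketch of how such a validator map gates offload (names below are illustrative, not the real CHExpressionUtil API):

    // Illustrative only; not the actual Gluten validator API.
    trait FunctionValidator { def doValidate(): Boolean }
    case class DefaultValidator() extends FunctionValidator {
      override def doValidate(): Boolean = false // listed but never validates => fall back
    }

    val blacklist: Map[String, FunctionValidator] = Map("read_side_padding" -> DefaultValidator())
    def offloadable(name: String): Boolean = blacklist.get(name).forall(_.doValidate())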
diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
index 097e6fc68e..f6be606131 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
@@ -622,6 +622,47 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite {
}
}
+  // Test suite for the CharVarcharCodegenUtils functions.
+  // A ProjectExecTransformer is expected to be constructed once the expressions are supported.
+  // The functions below are currently tested with Spark 3.4+.
+ testWithMinSparkVersion("charTypeWriteSideCheck", "3.4") {
+ withTable("src", "dest") {
+
+ sql("create table src(id string) USING PARQUET")
+ sql("insert into src values('s')")
+ sql("create table dest(id char(3)) USING PARQUET")
+      // check whether the executed plan of a dataframe contains the expected plan.
+ runQueryAndCompare("insert into dest select id from src") {
+ checkGlutenOperatorMatch[ProjectExecTransformer]
+ }
+ }
+ }
+
+ testWithMinSparkVersion("varcharTypeWriteSideCheck", "3.4") {
+ withTable("src", "dest") {
+
+ sql("create table src(id string) USING PARQUET")
+ sql("insert into src values('abc')")
+ sql("create table dest(id varchar(10)) USING PARQUET")
+      // check whether the executed plan of a dataframe contains the expected plan.
+ runQueryAndCompare("insert into dest select id from src") {
+ checkGlutenOperatorMatch[ProjectExecTransformer]
+ }
+ }
+ }
+
+  testWithMinSparkVersion("readSidePadding", "3.4") {
+    withTable("tgt") {
+
+      sql("create table tgt(id char(3)) USING PARQUET")
+      sql("insert into tgt values('p')")
+      // check whether the executed plan of a dataframe contains the expected plan.
+      runQueryAndCompare("select id from tgt") {
+        checkGlutenOperatorMatch[ProjectExecTransformer]
+      }
+    }
+  }
+
test("soundex") {
runQueryAndCompare("select soundex(c_comment) from customer limit 50") {
checkGlutenOperatorMatch[ProjectExecTransformer]
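
For anyone verifying the change by hand, a quick spark-shell sketch along the lines of the readSidePadding test above (assumes a session with the Gluten Velox backend enabled; this is illustrative, not part of the patch):

    // Hypothetical manual check: the char padding projection should now be offloaded.
    spark.sql("CREATE TABLE tgt(id CHAR(3)) USING PARQUET")
    spark.sql("INSERT INTO tgt VALUES ('p')")
    spark.sql("SELECT id FROM tgt").explain() // expect a ProjectExecTransformer in the plan
    spark.sql("DROP TABLE tgt")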
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala
index 61f5c1be8d..41a2a2ff82 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala
@@ -144,6 +144,66 @@ object ExpressionConverter extends SQLConfHelper with Logging {
       DecimalArithmeticExpressionTransformer(substraitName, leftChild, rightChild, resultType, b)
   }
+  private def replaceStaticInvokeWithExpressionTransformer(
+      i: StaticInvoke,
+      attributeSeq: Seq[Attribute],
+      expressionsMap: Map[Class[_], String]): ExpressionTransformer = {
+    def validateAndTransform(
+        exprName: String,
+        childTransformers: => Seq[ExpressionTransformer]): ExpressionTransformer = {
+      if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(exprName, i)) {
+        throw new GlutenNotSupportException(
+          s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.")
+      }
+      GenericExpressionTransformer(exprName, childTransformers, i)
+    }
+
+    i.functionName match {
+      case "encode" | "decode" if i.objectName.endsWith("UrlCodec") =>
+        validateAndTransform(
+          "url_" + i.functionName,
+          Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap))
+        )
+
+      case "isLuhnNumber" =>
+        validateAndTransform(
+          ExpressionNames.LUHN_CHECK,
+          Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap))
+        )
+
+      case "encode" | "decode" if i.objectName.endsWith("Base64") =>
+        if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(ExpressionNames.BASE64, i)) {
+          throw new GlutenNotSupportException(
+            s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.")
+        }
+        BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer(
+          ExpressionNames.BASE64,
+          replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
+          i
+        )
+
+      case fn
+          if i.objectName.endsWith("CharVarcharCodegenUtils") && Set(
+            "varcharTypeWriteSideCheck",
+            "charTypeWriteSideCheck",
+            "readSidePadding").contains(fn) =>
+        val exprName = fn match {
+          case "varcharTypeWriteSideCheck" => ExpressionNames.VARCHAR_TYPE_WRITE_SIDE_CHECK
+          case "charTypeWriteSideCheck" => ExpressionNames.CHAR_TYPE_WRITE_SIDE_CHECK
+          case "readSidePadding" => ExpressionNames.READ_SIDE_PADDING
+        }
+        validateAndTransform(
+          exprName,
+          i.arguments.map(replaceWithExpressionTransformer0(_, attributeSeq, expressionsMap))
+        )
+
+      case _ =>
+        throw new GlutenNotSupportException(
+          s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " +
+            s"function: ${i.functionName}")
+    }
+  }
+
private def replaceIcebergStaticInvoke(
s: StaticInvoke,
attributeSeq: Seq[Attribute],
@@ -186,33 +246,12 @@ object ExpressionConverter extends SQLConfHelper with Logging {
         return BackendsApiManager.getSparkPlanExecApiInstance.genHiveUDFTransformer(
           expr,
           attributeSeq)
-      case i: StaticInvoke
-          if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith(
-            "UrlCodec") =>
-        return GenericExpressionTransformer(
-          "url_" + i.functionName,
-          replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
-          i)
-      case i: StaticInvoke if i.functionName.equals("isLuhnNumber") =>
-        return GenericExpressionTransformer(
-          ExpressionNames.LUHN_CHECK,
-          replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
-          i)
-      case i: StaticInvoke
-          if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith("Base64") =>
-        return BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer(
-          ExpressionNames.BASE64,
-          replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
-          i
-        )
       case i: StaticInvoke
           if i.functionName == "invoke" && i.staticObject.getName.startsWith(
             "org.apache.iceberg.spark.functions.") =>
         return replaceIcebergStaticInvoke(i, attributeSeq, expressionsMap)
       case i: StaticInvoke =>
-        throw new GlutenNotSupportException(
-          s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " +
-            s"function: ${i.functionName}")
+        return replaceStaticInvokeWithExpressionTransformer(i, attributeSeq, expressionsMap)
case _ =>
}
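
Reading the new dispatch end to end: the Catalyst helper name selects the Substrait function name, each StaticInvoke argument becomes a child transformer, and anything unmatched still raises GlutenNotSupportException so the whole operator falls back. A self-contained sketch of just the name mapping (the string values mirror the ExpressionNames constants added at the end of this commit; the helper function itself is made up for illustration):

    // Illustrative helper, not part of the patch.
    def charVarcharSubstraitName(catalystName: String): Option[String] = catalystName match {
      case "varcharTypeWriteSideCheck" => Some("varchar_type_write_side_check")
      case "charTypeWriteSideCheck" => Some("char_type_write_side_check")
      case "readSidePadding" => Some("read_side_padding")
      case _ => None // unmatched StaticInvoke => GlutenNotSupportException / fallback
    }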
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 7f9bdba52b..dc44facd7e 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -135,6 +135,11 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenDSV2CharVarcharTestSuite]
       // failed on spark32 UT, see https://github.com/oap-project/gluten/issues/4043
       .exclude("SPARK-34833: right-padding applied correctly for correlated subqueries - other preds")
+      // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
+      // ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
+      // causing mismatches in error messages.
+      .excludeGlutenTest("length check for input string values: nested in struct")
+      .excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
.exclude("average")
@@ -354,6 +359,11 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("varchar type values length check and trim: partitioned columns")
.exclude("char/varchar type values length check: partitioned columns of
other types")
.exclude("char type comparison: partitioned columns")
+ // Excluded. The Gluten tests for char/varchar validation were rewritten
for Velox.
+ // ClickHouse backend doesn't support this feature and falls back to
vanilla Spark,
+ // causing mismatches in error messages.
+ .excludeGlutenTest("length check for input string values: nested in
struct")
+ .excludeGlutenTest("length check for input string values: nested in struct
of array")
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.exclude("SPARK-33474: Support typed literals as partition spec values")
.exclude(
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 43a3250fc9..8929dfbefa 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -422,7 +422,17 @@ class VeloxTestSettings extends BackendTestSettings {
.excludeByPrefix("SPARK-24705")
.excludeByPrefix("determining the number of reducers")
enableSuite[GlutenFileSourceCharVarcharTestSuite]
+      // The following tests are excluded because they are overridden in the Gluten test suite.
+      // The overridden tests assert against Velox-specific error messages for char/varchar
+      // length validation, which differ from the original vanilla Spark tests.
+      .exclude("length check for input string values: nested in struct")
+      .exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2CharVarcharTestSuite]
+      // The following tests are excluded because they are overridden in the Gluten test suite.
+      // The overridden tests assert against Velox-specific error messages for char/varchar
+      // length validation, which differ from the original vanilla Spark tests.
+      .exclude("length check for input string values: nested in struct")
+      .exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenFileScanSuite]
enableSuite[GlutenNestedDataSourceV1Suite]
enableSuite[GlutenNestedDataSourceV2Suite]
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
index 84502ace51..8dcb7bbfd8 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
@@ -16,8 +16,73 @@
*/
package org.apache.spark.sql
+import org.apache.spark.SparkException
+
class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
- with GlutenSQLTestsTrait {}
+ with GlutenSQLTestsTrait {
+
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
+
+ private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"
+
+ testGluten("length check for input string values: nested in struct") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: $typeName(5)>) USING $format")
+ sql("INSERT INTO t SELECT struct(null)")
+ checkAnswer(spark.table("t"), Row(Row(null)))
+ val e = intercept[SparkException] {
+ sql("INSERT INTO t SELECT struct('123456')")
+ }
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in struct of array") {
+    testTableWrite {
+      typeName =>
+        sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+        sql("INSERT INTO t SELECT struct(array(null))")
+        checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+}
+
+class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
+
+ private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"
+
+ testGluten("length check for input string values: nested in struct") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: $typeName(5)>) USING $format")
+ sql("INSERT INTO t SELECT struct(null)")
+ checkAnswer(spark.table("t"), Row(Row(null)))
+ val e = intercept[SparkException] {
+ sql("INSERT INTO t SELECT struct('123456')")
+ }
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
-class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
+  testGluten("length check for input string values: nested in struct of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t SELECT struct(array(null))")
+ checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+}
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 6ce7d1e325..2fcd692598 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -152,6 +152,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenCountMinSketchAggQuerySuite]
enableSuite[GlutenCsvFunctionsSuite]
enableSuite[GlutenDSV2CharVarcharTestSuite]
+      // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
+      // ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
+      // causing mismatches in error messages.
+      .excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
.exclude("average")
@@ -367,6 +371,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("varchar type values length check and trim: partitioned columns")
.exclude("char/varchar type values length check: partitioned columns of
other types")
.exclude("char type comparison: partitioned columns")
+ // Excluded. The Gluten tests for char/varchar validation were rewritten
for Velox.
+ // ClickHouse backend doesn't support this feature and falls back to
vanilla Spark,
+ // causing mismatches in error messages.
+ .excludeGlutenTest("length check for input string values: nested in struct
of array")
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.exclude("SPARK-33474: Support typed literals as partition spec values")
.exclude(
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 52ce14bda3..3249017c3b 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -616,7 +616,15 @@ class VeloxTestSettings extends BackendTestSettings {
       // Extra ColumnarToRow is needed to transform vanilla columnar data to gluten columnar data.
       .exclude("SPARK-37369: Avoid redundant ColumnarToRow transition on InMemoryTableScan")
     enableSuite[GlutenFileSourceCharVarcharTestSuite]
+      // The following test is excluded because it is overridden in the Gluten test suite.
+      // The overridden test asserts against a Velox-specific error message for char/varchar
+      // length validation, which differs from the original vanilla Spark test.
+      .exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2CharVarcharTestSuite]
+      // The following test is excluded because it is overridden in the Gluten test suite.
+      // The overridden test asserts against a Velox-specific error message for char/varchar
+      // length validation, which differs from the original vanilla Spark test.
+      .exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenColumnExpressionSuite]
       // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'.
       // The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
index 84502ace51..8c59c323ee 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
@@ -16,8 +16,48 @@
*/
package org.apache.spark.sql
+import org.apache.spark.SparkException
+
class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
- with GlutenSQLTestsTrait {}
+ with GlutenSQLTestsTrait {
+
+ private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"
+
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
+
+  testGluten("length check for input string values: nested in struct of array") {
+    testTableWrite {
+      typeName =>
+        sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+        sql("INSERT INTO t SELECT struct(array(null))")
+        checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+}
+
+class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {
+
+ private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"
+
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
-class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
+  testGluten("length check for input string values: nested in struct of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t SELECT struct(array(null))")
+ checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+}
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 21cf94a61e..71b6210817 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -154,6 +154,17 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenCountMinSketchAggQuerySuite]
enableSuite[GlutenCsvFunctionsSuite]
enableSuite[GlutenDSV2CharVarcharTestSuite]
+      // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
+      // ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
+      // causing mismatches in error messages.
+      .excludeGlutenTest("length check for input string values: top-level columns")
+      .excludeGlutenTest("length check for input string values: nested in array")
+      .excludeGlutenTest("length check for input string values: nested in struct of array")
+      .excludeGlutenTest("length check for input string values: nested in array of struct")
+      .excludeGlutenTest("length check for input string values: nested in array of array")
+      .excludeGlutenTest("length check for input string values: with implicit cast")
+      .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types")
+      .excludeGlutenTest("SPARK-42611: check char/varchar length in reordered structs within arrays")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
.exclude("average")
@@ -367,6 +378,17 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("varchar type values length check and trim: partitioned columns")
.exclude("char/varchar type values length check: partitioned columns of
other types")
.exclude("char type comparison: partitioned columns")
+ // Excluded. The Gluten tests for char/varchar validation were rewritten
for Velox.
+ // ClickHouse backend doesn't support this feature and falls back to
vanilla Spark,
+ // causing mismatches in error messages.
+ .excludeGlutenTest("length check for input string values: top-level
columns")
+ .excludeGlutenTest("length check for input string values: partitioned
columns")
+ .excludeGlutenTest("length check for input string values: nested in struct
of array")
+ .excludeGlutenTest("length check for input string values: with implicit
cast")
+ .excludeGlutenTest("char/varchar type values length check: partitioned
columns of other types")
+ .excludeGlutenTest("length check for input string values: nested in array
of array")
+ .excludeGlutenTest("length check for input string values: nested in array
of struct")
+ .excludeGlutenTest("length check for input string values: nested in array")
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.exclude("SPARK-33474: Support typed literals as partition spec values")
.exclude(
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 519cef5b76..2fa7c7a77e 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -654,7 +654,28 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("length check for input string values: nested in both map key and
value")
.exclude("length check for input string values: nested in array of struct")
.exclude("length check for input string values: nested in array of array")
+ // Following tests are excluded as these are overridden in Gluten test
suite..
+ // The overridden tests assert against Velox-specific error messages for
char/varchar
+ // length validation, which differ from the original vanilla Spark tests.
+ .exclude("length check for input string values: top-level columns")
+ .exclude("length check for input string values: partitioned columns")
+ .exclude("length check for input string values: nested in struct of array")
+ .exclude("length check for input string values: with implicit cast")
+ .exclude("char/varchar type values length check: partitioned columns of
other types")
+
enableSuite[GlutenDSV2CharVarcharTestSuite]
+ // Following tests are excluded as these are overridden in Gluten test
suite..
+ // The overridden tests assert against Velox-specific error messages for
char/varchar
+ // length validation, which differ from the original vanilla Spark tests.
+ .exclude("SPARK-42611: check char/varchar length in reordered structs
within arrays")
+ .exclude("char/varchar type values length check: partitioned columns of
other types")
+ .exclude("length check for input string values: top-level columns")
+ .exclude("length check for input string values: nested in array")
+ .exclude("length check for input string values: nested in struct of array")
+ .exclude("length check for input string values: nested in array of struct")
+ .exclude("length check for input string values: nested in array of array")
+ .exclude("length check for input string values: with implicit cast")
+
enableSuite[GlutenColumnExpressionSuite]
       // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'.
       // The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
index 89d9114870..4f0d826f57 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.sql
import org.apache.spark.SparkException
+import org.apache.spark.sql.internal.SQLConf
+
class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
with GlutenSQLTestsTrait {
@@ -27,6 +29,8 @@ class GlutenFileSourceCharVarcharTestSuite
private val ERROR_MESSAGE =
"Exceeds char/varchar type length limitation: 5"
+ private val VELOX_ERROR_MESSAGE =
+ "Exceeds allowed length limitation: 5"
testGluten("length check for input string values: nested in struct") {
testTableWrite {
@@ -50,7 +54,7 @@ class GlutenFileSourceCharVarcharTestSuite
val e = intercept[SparkException] {
sql("INSERT INTO t VALUES (array('a', '123456'))")
}
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -92,7 +96,7 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t SELECT struct(array(null))")
checkAnswer(spark.table("t"), Row(Row(Seq(null))))
        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -103,7 +107,7 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t VALUES (array(struct(null)))")
checkAnswer(spark.table("t"), Row(Seq(Row(null))))
        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -114,9 +118,187 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t VALUES (array(array(null)))")
checkAnswer(spark.table("t"), Row(Seq(Seq(null))))
        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: top-level columns") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c $typeName(5)) USING $format")
+ sql("INSERT INTO t VALUES (null)")
+ checkAnswer(spark.table("t"), Row(null))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
+
+ testGluten("length check for input string values: partitioned columns") {
+ // DS V2 doesn't support partitioned table.
+ if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) {
+ val tableName = "t"
+ testTableWrite {
+ typeName =>
+          sql(s"CREATE TABLE $tableName(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)")
+          sql(s"INSERT INTO $tableName VALUES (1, null)")
+          checkAnswer(spark.table(tableName), Row(1, null))
+          val e = intercept[SparkException](sql(s"INSERT INTO $tableName VALUES (1, '123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+ }
+
+ testGluten("length check for input string values: with implicit cast") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+ sql("INSERT INTO t VALUES (1234, 1234)")
+ checkAnswer(spark.table("t"), Row("1234 ", "1234"))
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)"))
+      assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)"))
+ assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("char/varchar type values length check: partitioned columns of other types") {
+    val tableName = "t"
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable(tableName) {
+          sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+          Seq(1, 10, 100, 1000, 10000).foreach {
+            v =>
+              sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)")
+              checkPlainResult(spark.table(tableName), typ, v.toString)
+              sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)")
+              checkAnswer(spark.table(tableName), Nil)
+          }
+
+          val e1 =
+            intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)"))
+          assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+
+          val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)"))
+          assert(e2.getMessage.contains(ERROR_MESSAGE))
+ }
+ }
+ }
+
}
-class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
+class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {
+ private val ERROR_MESSAGE =
+ "Exceeds char/varchar type length limitation: 5"
+ private val VELOX_ERROR_MESSAGE =
+ "Exceeds allowed length limitation: 5"
+
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
+
+ testGluten("length check for input string values: top-level columns") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c $typeName(5)) USING $format")
+ sql("INSERT INTO t VALUES (null)")
+ checkAnswer(spark.table("t"), Row(null))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: nested in array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format")
+ sql("INSERT INTO t VALUES (array(null))")
+ checkAnswer(spark.table("t"), Row(Seq(null)))
+ val e = intercept[SparkException] {
+ sql("INSERT INTO t VALUES (array('a', '123456'))")
+ }
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in struct of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t SELECT struct(array(null))")
+ checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in array of struct") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<STRUCT<c: $typeName(5)>>) USING $format")
+ sql("INSERT INTO t VALUES (array(struct(null)))")
+ checkAnswer(spark.table("t"), Row(Seq(Row(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in array of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t VALUES (array(array(null)))")
+ checkAnswer(spark.table("t"), Row(Seq(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: with implicit cast") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+ sql("INSERT INTO t VALUES (1234, 1234)")
+ checkAnswer(spark.table("t"), Row("1234 ", "1234"))
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)"))
+      assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)"))
+ assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("char/varchar type values length check: partitioned columns of other types") {
+    val tableName = "t"
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable(tableName) {
+          sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+          Seq(1, 10, 100, 1000, 10000).foreach {
+            v =>
+              sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)")
+              checkPlainResult(spark.table(tableName), typ, v.toString)
+              sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)")
+              checkAnswer(spark.table(tableName), Nil)
+          }
+
+          val e1 =
+            intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)"))
+          assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+
+          val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)"))
+          assert(e2.getMessage.contains(ERROR_MESSAGE))
+ }
+ }
+ }
+
+  testGluten("SPARK-42611: check char/varchar length in reordered structs within arrays") {
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable("t") {
+          sql(s"CREATE TABLE t(a ARRAY<STRUCT<n_c: $typ, n_i: INT>>) USING $format")
+          val inputDF = sql("SELECT array(named_struct('n_i', 1, 'n_c', '123456')) AS a")
+ val e = intercept[SparkException](inputDF.writeTo("t").append())
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+ }
+}
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index e7e3ddf8a0..d325d8a6b9 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -445,6 +445,17 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenCustomerExtensionSuite]
enableSuite[GlutenDDLSourceLoadSuite]
enableSuite[GlutenDSV2CharVarcharTestSuite]
+      // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
+      // ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
+      // causing mismatches in error messages.
+      .excludeGlutenTest("length check for input string values: top-level columns")
+      .excludeGlutenTest("length check for input string values: nested in array")
+      .excludeGlutenTest("length check for input string values: nested in struct of array")
+      .excludeGlutenTest("length check for input string values: nested in array of struct")
+      .excludeGlutenTest("length check for input string values: nested in array of array")
+      .excludeGlutenTest("length check for input string values: with implicit cast")
+      .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types")
+      .excludeGlutenTest("SPARK-42611: check char/varchar length in reordered structs within arrays")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
// Test for vanilla spark codegen, not apply for Gluten
@@ -848,6 +859,17 @@ class ClickHouseTestSettings extends BackendTestSettings {
       .includeCH("length check for input string values: nested in both map key and value")
       .includeCH("length check for input string values: nested in array of struct")
       .includeCH("length check for input string values: nested in array of array")
+      // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
+      // ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
+      // causing mismatches in error messages.
+      .excludeGlutenTest("length check for input string values: top-level columns")
+      .excludeGlutenTest("length check for input string values: partitioned columns")
+      .excludeGlutenTest("length check for input string values: nested in struct of array")
+      .excludeGlutenTest("length check for input string values: with implicit cast")
+      .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types")
+      .excludeGlutenTest("length check for input string values: nested in array")
+      .excludeGlutenTest("length check for input string values: nested in array of struct")
+      .excludeGlutenTest("length check for input string values: nested in array of array")
enableSuite[GlutenFileSourceCustomMetadataStructSuite]
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.excludeCH("SPARK-33474: Support typed literals as partition spec values")
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 27af909029..66a0583450 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -676,7 +676,26 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("length check for input string values: nested in both map key and
value")
.exclude("length check for input string values: nested in array of struct")
.exclude("length check for input string values: nested in array of array")
+ // Following tests are excluded as these are overridden in Gluten test
suite..
+ // The overridden tests assert against Velox-specific error messages for
char/varchar
+ // length validation, which differ from the original vanilla Spark tests.
+ .exclude("length check for input string values: top-level columns")
+ .exclude("length check for input string values: partitioned columns")
+ .exclude("length check for input string values: nested in struct of array")
+ .exclude("length check for input string values: with implicit cast")
+ .exclude("char/varchar type values length check: partitioned columns of
other types")
enableSuite[GlutenDSV2CharVarcharTestSuite]
+ // Following tests are excluded as these are overridden in Gluten test
suite..
+ // The overridden tests assert against Velox-specific error messages for
char/varchar
+ // length validation, which differ from the original vanilla Spark tests.
+ .exclude("length check for input string values: top-level columns")
+ .exclude("length check for input string values: nested in array")
+ .exclude("length check for input string values: nested in struct of array")
+ .exclude("length check for input string values: nested in array of struct")
+ .exclude("length check for input string values: nested in array of array")
+ .exclude("length check for input string values: with implicit cast")
+ .exclude("char/varchar type values length check: partitioned columns of
other types")
+ .exclude("SPARK-42611: check char/varchar length in reordered structs
within arrays")
enableSuite[GlutenColumnExpressionSuite]
       // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'.
       // The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
index ce2f1b465e..689946547d 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala
@@ -17,6 +17,7 @@
package org.apache.spark.sql
import org.apache.spark.SparkException
+import org.apache.spark.sql.internal.SQLConf
class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
@@ -28,6 +29,8 @@ class GlutenFileSourceCharVarcharTestSuite
private val ERROR_MESSAGE =
"Exceeds char/varchar type length limitation: 5"
+ private val VELOX_ERROR_MESSAGE =
+ "Exceeds allowed length limitation: 5"
testGluten("length check for input string values: nested in struct") {
testTableWrite {
@@ -51,7 +54,7 @@ class GlutenFileSourceCharVarcharTestSuite
val e = intercept[SparkException] {
sql("INSERT INTO t VALUES (array('a', '123456'))")
}
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -93,7 +96,7 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t SELECT struct(array(null))")
checkAnswer(spark.table("t"), Row(Row(Seq(null))))
        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -104,7 +107,7 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t VALUES (array(struct(null)))")
checkAnswer(spark.table("t"), Row(Seq(Row(null))))
        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
}
}
@@ -115,9 +118,186 @@ class GlutenFileSourceCharVarcharTestSuite
sql("INSERT INTO t VALUES (array(array(null)))")
checkAnswer(spark.table("t"), Row(Seq(Seq(null))))
        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))"))
- assert(e.getMessage.contains(ERROR_MESSAGE))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: top-level columns") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c $typeName(5)) USING $format")
+ sql("INSERT INTO t VALUES (null)")
+ checkAnswer(spark.table("t"), Row(null))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: partitioned columns") {
+ // DS V2 doesn't support partitioned table.
+ if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) {
+ val tableName = "t"
+ testTableWrite {
+ typeName =>
+          sql(s"CREATE TABLE $tableName(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)")
+          sql(s"INSERT INTO $tableName VALUES (1, null)")
+          checkAnswer(spark.table(tableName), Row(1, null))
+          val e = intercept[SparkException](sql(s"INSERT INTO $tableName VALUES (1, '123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+ }
+
+ testGluten("length check for input string values: with implicit cast") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+ sql("INSERT INTO t VALUES (1234, 1234)")
+ checkAnswer(spark.table("t"), Row("1234 ", "1234"))
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)"))
+      assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)"))
+ assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("char/varchar type values length check: partitioned columns of other types") {
+    val tableName = "t"
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable(tableName) {
+          sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+          Seq(1, 10, 100, 1000, 10000).foreach {
+            v =>
+              sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)")
+              checkPlainResult(spark.table(tableName), typ, v.toString)
+              sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)")
+              checkAnswer(spark.table(tableName), Nil)
+          }
+
+          val e1 =
+            intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)"))
+          assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+
+          val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)"))
+          assert(e2.getMessage.contains(ERROR_MESSAGE))
+ }
}
}
}
-class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
+class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {
+ private val ERROR_MESSAGE =
+ "Exceeds char/varchar type length limitation: 5"
+ private val VELOX_ERROR_MESSAGE =
+ "Exceeds allowed length limitation: 5"
+
+ private def testTableWrite(f: String => Unit): Unit = {
+ withTable("t")(f("char"))
+ withTable("t")(f("varchar"))
+ }
+
+ testGluten("length check for input string values: top-level columns") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c $typeName(5)) USING $format")
+ sql("INSERT INTO t VALUES (null)")
+ checkAnswer(spark.table("t"), Row(null))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: nested in array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format")
+ sql("INSERT INTO t VALUES (array(null))")
+ checkAnswer(spark.table("t"), Row(Seq(null)))
+ val e = intercept[SparkException] {
+ sql("INSERT INTO t VALUES (array('a', '123456'))")
+ }
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in struct of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t SELECT struct(array(null))")
+ checkAnswer(spark.table("t"), Row(Row(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))"))
+ assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in array of struct") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<STRUCT<c: $typeName(5)>>) USING $format")
+ sql("INSERT INTO t VALUES (array(struct(null)))")
+ checkAnswer(spark.table("t"), Row(Seq(Row(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("length check for input string values: nested in array of array") {
+ testTableWrite {
+ typeName =>
+ sql(s"CREATE TABLE t(c ARRAY<ARRAY<$typeName(5)>>) USING $format")
+ sql("INSERT INTO t VALUES (array(array(null)))")
+ checkAnswer(spark.table("t"), Row(Seq(Seq(null))))
+        val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))"))
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+ testGluten("length check for input string values: with implicit cast") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format")
+ sql("INSERT INTO t VALUES (1234, 1234)")
+ checkAnswer(spark.table("t"), Row("1234 ", "1234"))
+      val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)"))
+      assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+      val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)"))
+ assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+
+  testGluten("char/varchar type values length check: partitioned columns of other types") {
+    val tableName = "t"
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable(tableName) {
+          sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+          Seq(1, 10, 100, 1000, 10000).foreach {
+            v =>
+              sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)")
+              checkPlainResult(spark.table(tableName), typ, v.toString)
+              sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)")
+              checkAnswer(spark.table(tableName), Nil)
+          }
+
+          val e1 =
+            intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)"))
+          assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE))
+
+          val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)"))
+          assert(e2.getMessage.contains(ERROR_MESSAGE))
+ }
+ }
+ }
+
+  testGluten("SPARK-42611: check char/varchar length in reordered structs within arrays") {
+    Seq("CHAR(5)", "VARCHAR(5)").foreach {
+      typ =>
+        withTable("t") {
+          sql(s"CREATE TABLE t(a ARRAY<STRUCT<n_c: $typ, n_i: INT>>) USING $format")
+          val inputDF = sql("SELECT array(named_struct('n_i', 1, 'n_c', '123456')) AS a")
+ val e = intercept[SparkException](inputDF.writeTo("t").append())
+ assert(e.getMessage.contains(VELOX_ERROR_MESSAGE))
+ }
+ }
+ }
+}
diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
index 8fcdc01e5e..329ac65e76 100644
--- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
@@ -372,6 +372,11 @@ object ExpressionNames {
final val UDF_PLACEHOLDER = "udf_placeholder"
final val UDAF_PLACEHOLDER = "udaf_placeholder"
+ // Spark StaticInvoke Catalyst util functions
+ final val VARCHAR_TYPE_WRITE_SIDE_CHECK = "varchar_type_write_side_check"
+ final val CHAR_TYPE_WRITE_SIDE_CHECK = "char_type_write_side_check"
+ final val READ_SIDE_PADDING = "read_side_padding"
+
// Iceberg function names
final val YEARS = "years"
final val MONTHS = "months"