AngersZhuuuu commented on a change in pull request #29085:
URL: https://github.com/apache/spark/pull/29085#discussion_r458688383
##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
##########
@@ -206,75 +169,83 @@ class HiveScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with
val query = sql(
s"""
- |SELECT
- |TRANSFORM(a, b, c, d, e)
- |USING 'python $scriptFilePath' AS (a, b, c, d, e)
- |FROM v
+ |SELECT TRANSFORM(a, b, c, d, e)
+ |USING 'python ${scriptFilePath}'
+ |FROM v
""".stripMargin)
- // In Hive 1.2, the string representation of a decimal omits trailing zeroes.
- // But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary.
- val decimalToString: Column => Column = if (HiveUtils.isHive23) {
- c => c.cast("string")
- } else {
- c => c.cast("decimal(1, 0)").cast("string")
- }
- checkAnswer(query, identity, df.select(
- 'a.cast("string"),
- 'b.cast("string"),
- 'c.cast("string"),
- decimalToString('d),
- 'e.cast("string")).collect())
+ // In Hive default serde mode, if we don't define an output schema, it will choose the first
+ // two columns as the output schema (key: String, value: String)
+ checkAnswer(
+ query,
+ identity,
+ df.select(
+ 'a.cast("string").as("key"),
+ 'b.cast("string").as("value")).collect())
}
}
- test("SPARK-30973: TRANSFORM should wait for the termination of the script
(no serde)") {
+ test("SPARK-32106: TRANSFORM support complex data types as input and ouput
type (hive serde)") {
assume(TestUtils.testCommandAvailable("/bin/bash"))
+ withTempView("v") {
+ val df = Seq(
+ (1, "1", Array(0, 1, 2), Map("a" -> 1)),
+ (2, "2", Array(3, 4, 5), Map("b" -> 2)))
+ .toDF("a", "b", "c", "d")
+ .select('a, 'b, 'c, 'd, struct('a, 'b).as("e"))
+ df.createTempView("v")
- val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
- val e = intercept[SparkException] {
- val plan =
- new HiveScriptTransformationExec(
- input = Seq(rowsDf.col("a").expr),
- script = "some_non_existent_command",
- output = Seq(AttributeReference("a", StringType)()),
- child = rowsDf.queryExecution.sparkPlan,
- ioschema = noSerdeIOSchema)
- SparkPlanTest.executePlan(plan, hiveContext)
+ // Hive serde supports ArrayType/MapType/StructType as input and output data types
+ checkAnswer(
+ df,
+ (child: SparkPlan) => createScriptTransformationExec(
+ input = Seq(
+ df.col("c").expr,
+ df.col("d").expr,
+ df.col("e").expr),
+ script = "cat",
+ output = Seq(
+ AttributeReference("c", ArrayType(IntegerType))(),
+ AttributeReference("d", MapType(StringType, IntegerType))(),
+ AttributeReference("e", StructType(
+ Seq(
+ StructField("col1", IntegerType, false),
+ StructField("col2", StringType, true))))()),
+ child = child,
+ ioschema = serdeIOSchema
+ ),
+ df.select('c, 'd, 'e).collect())
}
- assert(e.getMessage.contains("Subprocess exited with status"))
- assert(uncaughtExceptionHandler.exception.isEmpty)
}
- test("SPARK-30973: TRANSFORM should wait for the termination of the script
(with serde)") {
+ test("SPARK-32106: TRANSFORM don't support
CalenderIntervalType/UserDefinedType (hive serde)") {
assume(TestUtils.testCommandAvailable("/bin/bash"))
+ withTempView("v") {
+ val df = Seq(
+ (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3))),
+ (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3))))
+ .toDF("a", "b", "c")
+ df.createTempView("v")
- val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
- val e = intercept[SparkException] {
- val plan =
- new HiveScriptTransformationExec(
- input = Seq(rowsDf.col("a").expr),
- script = "some_non_existent_command",
- output = Seq(AttributeReference("a", StringType)()),
- child = rowsDf.queryExecution.sparkPlan,
- ioschema = serdeIOSchema)
- SparkPlanTest.executePlan(plan, hiveContext)
+ val e1 = intercept[Exception] {
+ sql(
+ """
+ |SELECT TRANSFORM(a, b) USING 'cat' AS (a, b)
+ |FROM v
+ """.stripMargin).collect()
+ }
+ assert(e1.getMessage.contains("scala.MatchError: CalendarIntervalType"))
Review comment:
> Ah, I see. which code throws this match error?
It's thrown by `HiveInspectors$typeInfoConversions.toTypeInfo`:
```
def toTypeInfo: TypeInfo = dt match {
  case ArrayType(elemType, _) =>
    getListTypeInfo(elemType.toTypeInfo)
  case StructType(fields) =>
    getStructTypeInfo(
      java.util.Arrays.asList(fields.map(_.name) : _*),
      java.util.Arrays.asList(fields.map(_.dataType.toTypeInfo) : _*))
  case MapType(keyType, valueType, _) =>
    getMapTypeInfo(keyType.toTypeInfo, valueType.toTypeInfo)
  case BinaryType => binaryTypeInfo
  case BooleanType => booleanTypeInfo
  case ByteType => byteTypeInfo
  case DoubleType => doubleTypeInfo
  case FloatType => floatTypeInfo
  case IntegerType => intTypeInfo
  case LongType => longTypeInfo
  case ShortType => shortTypeInfo
  case StringType => stringTypeInfo
  case d: DecimalType => decimalTypeInfo(d)
  case DateType => dateTypeInfo
  case TimestampType => timestampTypeInfo
  case NullType => voidTypeInfo
}
```
Since Hive doesn't have a corresponding data type, maybe we can convert it to String, and raise a PR to fix this.
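
Here's a minimal sketch of what that fallback could look like (the helper name `typeInfoWithStringFallback` and the string fallback are my assumption, not an actual fix in this PR):
```
import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, TypeInfoFactory}
import org.apache.spark.sql.types._

// Hypothetical helper: handle the catalyst types the match above does not
// cover, instead of letting them hit a scala.MatchError.
def typeInfoWithStringFallback(dt: DataType): TypeInfo = dt match {
  // Hive has no interval TypeInfo, so ship intervals to the script as strings.
  case CalendarIntervalType => TypeInfoFactory.stringTypeInfo
  case udt: UserDefinedType[_] =>
    // A UDT is stored as its underlying sqlType; convert that instead.
    typeInfoWithStringFallback(udt.sqlType)
  case StringType => TypeInfoFactory.stringTypeInfo
  case IntegerType => TypeInfoFactory.intTypeInfo
  // ... the remaining supported cases would delegate to toTypeInfo above ...
  case other =>
    throw new UnsupportedOperationException(
      s"Hive serde TRANSFORM does not support type: ${other.catalogString}")
}
```
Falling back to string would at least let TRANSFORM feed such columns to the script as text; even just failing with an explicit message (as in the default case) would be clearer than the current `scala.MatchError`.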
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]