GitHub user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/19492#discussion_r144753534
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala ---
@@ -536,26 +536,31 @@ case class JsonToStructs(
       timeZoneId = None)
 
   override def checkInputDataTypes(): TypeCheckResult = schema match {
-    case _: StructType | ArrayType(_: StructType, _) =>
+    case _: StructType | ArrayType(_: StructType | _: AtomicType, _) =>
       super.checkInputDataTypes()
     case _ => TypeCheckResult.TypeCheckFailure(
-      s"Input schema ${schema.simpleString} must be a struct or an array of structs.")
+      s"Input schema ${schema.simpleString} must be a struct or " +
+        s"an array of structs or primitive types.")
   }
 
   @transient
-  lazy val rowSchema = schema match {
+  lazy val rowSchema: DataType = schema match {
     case st: StructType => st
     case ArrayType(st: StructType, _) => st
+    case ArrayType(at: AtomicType, _) => ArrayType(at)
   }
 
   // This converts parsed rows to the desired output by the given schema.
   @transient
-  lazy val converter = schema match {
-    case _: StructType =>
-      (rows: Seq[InternalRow]) => if (rows.length == 1) rows.head else null
-    case ArrayType(_: StructType, _) =>
-      (rows: Seq[InternalRow]) => new GenericArrayData(rows)
-  }
+  lazy val converter = (rows: Seq[Any]) =>
--- End diff ---
This brings extra pattern-matching cost at runtime: the match on `schema` now runs inside the closure for every input row instead of once at initialization. Can we move the matching outside the closure?
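
For illustration, a minimal sketch of hoisting the match, written as a member of `JsonToStructs` (my sketch, not necessarily the final shape; it assumes the converter only needs the struct and array cases shown above):

    import org.apache.spark.sql.catalyst.util.GenericArrayData
    import org.apache.spark.sql.types.{ArrayType, StructType}

    // Match on `schema` once, when the lazy val is first forced; the
    // closure invoked per input row then does no pattern matching.
    @transient
    lazy val converter: Seq[Any] => Any = schema match {
      case _: StructType =>
        (rows: Seq[Any]) => if (rows.length == 1) rows.head else null
      case _: ArrayType =>
        (rows: Seq[Any]) => new GenericArrayData(rows)
    }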
---
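
For context, `JsonToStructs` backs the `from_json` function, so the relaxed `checkInputDataTypes` lets callers pass an array-of-primitive schema. A hypothetical usage sketch, assuming a `SparkSession` named `spark` is in scope and using the existing `from_json(Column, DataType)` overload:

    import org.apache.spark.sql.functions.from_json
    import org.apache.spark.sql.types.{ArrayType, IntegerType}
    import spark.implicits._

    // Previously rejected by checkInputDataTypes; accepted with this patch.
    val df = Seq("[1, 2, 3]").toDF("json")
    df.select(from_json($"json", ArrayType(IntegerType)).as("ints")).show()
    // Expected: a single array<int> column containing [1, 2, 3]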
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]