Re: [PR] [SPARK-49966][SQL] Codegen Support for JsonToStructs(`from_json`) [spark]

via GitHub Tue, 15 Oct 2024 19:02:14 -0700


panbingkun commented on code in PR #48466:
URL: https://github.com/apache/spark/pull/48466#discussion_r1802200454



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala:
##########
@@ -680,53 +678,36 @@ case class JsonToStructs(
         messageParameters = Map("schema" -> toSQLType(nullableSchema)))
   }
 
-  // This converts parsed rows to the desired output by the given schema.
-  @transient
-  lazy val converter = nullableSchema match {
-    case _: StructType =>
-      (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next() else null
-    case _: ArrayType =>
-      (rows: Iterator[InternalRow]) => if (rows.hasNext) 
rows.next().getArray(0) else null
-    case _: MapType =>
-      (rows: Iterator[InternalRow]) => if (rows.hasNext) rows.next().getMap(0) 
else null
-  }
-
-  val nameOfCorruptRecord = 
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
-  @transient lazy val parser = {
-    val parsedOptions = new JSONOptions(options, timeZoneId.get, 
nameOfCorruptRecord)
-    val mode = parsedOptions.parseMode
-    if (mode != PermissiveMode && mode != FailFastMode) {
-      throw QueryCompilationErrors.parseModeUnsupportedError("from_json", mode)
-    }
-    val (parserSchema, actualSchema) = nullableSchema match {
-      case s: StructType =>
-        ExprUtils.verifyColumnNameOfCorruptRecord(s, 
parsedOptions.columnNameOfCorruptRecord)
-        (s, StructType(s.filterNot(_.name == 
parsedOptions.columnNameOfCorruptRecord)))
-      case other =>
-        (StructType(Array(StructField("value", other))), other)
-    }
-
-    val rawParser = new JacksonParser(actualSchema, parsedOptions, 
allowArrayAsStructs = false)
-    val createParser = CreateJacksonParser.utf8String _
-
-    new FailureSafeParser[UTF8String](
-      input => rawParser.parse(input, createParser, identity[UTF8String]),
-      mode,
-      parserSchema,
-      parsedOptions.columnNameOfCorruptRecord)
-  }
-
   override def dataType: DataType = nullableSchema
 
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
 
-  override def nullSafeEval(json: Any): Any = nullableSchema match {
-    case _: VariantType =>
-      VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String],
-        allowDuplicateKeys = variantAllowDuplicateKeys)
-    case _ =>
-      converter(parser.parse(json.asInstanceOf[UTF8String]))
+  @transient
+  private val nameOfCorruptRecord = 
SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
+
+  @transient
+  private lazy val evaluator = new JsonToStructsEvaluator(
+    options, nullableSchema, nameOfCorruptRecord, timeZoneId, 
variantAllowDuplicateKeys)
+
+  override def nullSafeEval(json: Any): Any = 
evaluator.evaluate(json.asInstanceOf[UTF8String])
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val refEvaluator = ctx.addReferenceObj("evaluator", evaluator)
+    val eval = child.genCode(ctx)
+    val resultType = CodeGenerator.boxedType(dataType)
+    val resultTerm = ctx.freshName("result")
+    ev.copy(code =
+      code"""
+         |${eval.code}
+         |$resultType $resultTerm = ($resultType) $refEvaluator.evaluate(
+         |  ${eval.isNull} ? null : ${eval.value});

Review Comment:
   Yes, it's redundant. I have already removed it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-49966][SQL] Codegen Support for JsonToStructs(`from_json`) [spark]

Reply via email to