sunchao commented on code in PR #56685:
URL: https://github.com/apache/spark/pull/56685#discussion_r3461227819


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala:
##########
@@ -155,28 +161,24 @@ object GetJsonObject {
 }
 
 /**
- * Extracts multiple simple top-level fields from a JSON string in one parse. 
This is an internal
- * expression used to share sibling [[GetJsonObject]] expressions; unsupported 
JSON paths remain
- * as independent GetJsonObject expressions.
+ * Extracts multiple simple named paths from a JSON string in one parse. This 
is an internal
+ * expression used to share sibling [[GetJsonObject]] expressions; unsupported 
and
+ * prefix-conflicting JSON paths remain as independent GetJsonObject 
expressions.
  */
 case class MultiGetJsonObject(
     json: Expression,
-    fieldNames: Seq[String],
     fallbackPaths: Seq[String])
   extends UnaryExpression
   with ExpectsInputTypes {
 
-  require(
-    fieldNames.nonEmpty &&
-      fieldNames.distinct.length == fieldNames.length &&
-      fallbackPaths.length == fieldNames.length)
+  require(fallbackPaths.nonEmpty)

Review Comment:
   Thanks, clarified in 917b893e784. I added a note on MultiGetJsonObject that 
OptimizeCsvJsonExprs caps shared path depth to keep the recursive evaluator 
stack-safe.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/json/JsonExpressionEvalUtils.scala:
##########
@@ -647,15 +639,86 @@ case class MultiGetJsonObjectEvaluator(
         nullRow
       }
     } catch {
-      // Every simple top-level legacy extraction scans through the root 
object's closing token,
-      // so a syntax failure makes every sibling null without needing per-path 
reparsing.
+      // Every simple named legacy extraction scans through the root object's 
closing token, so a
+      // syntax failure makes every sibling null without needing per-path 
reparsing.
       case _: JsonParseException => nullRow
-      // A parser-side rendering failure can leave the shared token stream 
unusable. Reparse each
-      // path with the legacy evaluator so one bad selected value cannot erase 
sibling results.
+      // A parser-side rendering failure, such as a string-length constraint 
violation, can leave
+      // the shared token stream unusable. Reparse each path with the legacy 
evaluator so one bad
+      // selected value cannot erase independent sibling results.
       case _: JsonProcessingException => fallback(json)
     }
   }
 
+  private def extractTopLevelObject(
+      parser: JsonParser,
+      values: Array[Any],
+      matched: Array[Boolean]): Boolean = {
+    var token = parser.nextToken()
+    while (token != null && token != JsonToken.END_OBJECT) {
+      if (token == JsonToken.FIELD_NAME) {
+        val ordinal = 
topLevelFieldToOrdinal.get(parser.currentName).filter(!matched(_))
+        val valueToken = parser.nextToken()
+        if (ordinal.nonEmpty && valueToken != JsonToken.VALUE_NULL) {
+          val index = ordinal.get
+          matched(index) = true
+          copyCurrentStructure(parser).foreach(value => values(index) = value)
+        } else {
+          parser.skipChildren()
+        }
+      } else {
+        parser.skipChildren()
+      }
+      token = parser.nextToken()
+    }
+    token == JsonToken.END_OBJECT
+  }
+
+  private def extractObject(
+      parser: JsonParser,
+      node: MultiGetJsonObjectEvaluator.PathTrieNode,
+      values: Array[Any],
+      matched: Array[Boolean]): Boolean = {
+    var valid = true
+    var token = parser.nextToken()
+    while (valid && token != null && token != JsonToken.END_OBJECT) {
+      if (token == JsonToken.FIELD_NAME) {
+        val child = 
node.children.get(parser.currentName).filter(_.hasUnmatched(matched))
+        val valueToken = parser.nextToken()
+        if (child.nonEmpty && valueToken != JsonToken.VALUE_NULL) {
+          valid = extractValue(parser, child.get, values, matched)
+        } else {
+          parser.skipChildren()
+        }
+      } else {
+        parser.skipChildren()
+      }
+      if (valid) {
+        token = parser.nextToken()
+      }
+    }
+    valid && token == JsonToken.END_OBJECT
+  }
+
+  private def extractValue(
+      parser: JsonParser,
+      node: MultiGetJsonObjectEvaluator.PathTrieNode,
+      values: Array[Any],
+      matched: Array[Boolean]): Boolean = {
+    if (node.terminalOrdinals.nonEmpty) {

Review Comment:
   Thanks, clarified in 917b893e784. I documented that optimizer-generated 
paths are deduplicated and that multiple terminal ordinals are retained only as 
defensive support for directly constructed internal expressions with duplicate 
paths.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to