(texera) branch main updated: refactor(workflow-core): make JSONToMap iterative to avoid stack overflow (#5322)

github-bot Sun, 14 Jun 2026 14:27:12 -0700

This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git



The following commit(s) were added to refs/heads/main by this push:
     new 2f9573be88 refactor(workflow-core): make JSONToMap iterative to avoid stack overflow (#5322)
2f9573be88 is described below

commit 2f9573be88e7f73e63e7f2f441187a5977fdb0a3
Author: Matthew B. <[email protected]>
AuthorDate: Sun Jun 14 14:15:21 2026 -0700

    refactor(workflow-core): make JSONToMap iterative to avoid stack overflow (#5322)
    
    ### What changes were proposed in this PR?
    - Rewrote `JSONUtils.JSONToMap` from per-level recursion into an
    iterative traversal over a `(JsonNode, parentName)` worklist (a
    `mutable.Stack`), so nesting depth lives on the heap instead of the JVM
    call stack.
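    - Per-node logic is unchanged: objects emit value entries and (in
    flatten mode) push their object/array children, arrays push each element
    keyed by 1-based index, and value nodes with a parent emit their text.
    Output keys and values are identical to before whenever every flattened
    key is unique. Because the worklist is LIFO, nodes are visited in a
    different order than the recursion visited them, so when two paths
    collapse to the same key (e.g. `m[0][10]` and `m[10][0]` both become
    `m111`), a different value may win than before.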
    - Added a `JSONUtilsSpec` case that flattens a 20,000-deep object (built
    programmatically to bypass Jackson's own parser nesting cap), which
    would previously have thrown `StackOverflowError`.
    ### Any related issues, documentation, or discussions?
    Closes: #5321
    ### How was this PR tested?
    - Ran `sbt "WorkflowCore/testOnly *JSONUtilsSpec"`; all cases pass,
    including the new 20,000-deep flatten case, which throws
    `StackOverflowError` when run against `main` without this change.
    ### Was this PR authored or co-authored using generative AI tooling?
    Co-authored with Claude Opus 4.7 in compliance with the ASF generative tooling guidance.
    
    ---------
    
    Signed-off-by: Matthew B. <[email protected]>
    Co-authored-by: Kunwoo (Chris) <[email protected]>
---
 .../texera/amber/core/tuple/TupleUtils.scala       |  8 ++--
 .../org/apache/texera/amber/util/JSONUtils.scala   | 48 +++++++++++--------
 .../apache/texera/amber/util/JSONUtilsSpec.scala   | 54 ++++++++++++++++++++++
 3 files changed, 88 insertions(+), 22 deletions(-)

diff --git 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
index 6938dcc6ca..10f34574db 100644
--- 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
+++ 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
@@ -44,11 +44,12 @@ object TupleUtils {
 
     val allFields: ArrayBuffer[Map[String, String]] = ArrayBuffer()
 
+    // Parse and flatten once; reused for schema inference and value 
extraction.
     val root: JsonNode = objectMapper.readTree(json)
+    val data: Map[String, String] = JSONToMap(root)
     if (root.isObject) {
-      val fields: Map[String, String] = JSONToMap(root)
-      fieldNames = fieldNames.++(fields.keySet)
-      allFields += fields
+      fieldNames = fieldNames.++(data.keySet)
+      allFields += data
     }
 
     val sortedFieldNames = fieldNames.toList
@@ -73,7 +74,6 @@ object TupleUtils {
 
     try {
       val fields = scala.collection.mutable.ArrayBuffer.empty[Any]
-      val data = JSONToMap(objectMapper.readTree(json))
 
       for (fieldName <- schema.getAttributeNames) {
         if (data.contains(fieldName)) {
diff --git 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
index bcc7291489..7f862b4fbc 100644
--- 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
+++ 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
@@ -28,6 +28,7 @@ import org.apache.texera.amber.core.workflow.PortIdentity
 import org.apache.texera.amber.util.serde.{PortIdentityKeyDeserializer, 
PortIdentityKeySerializer}
 
 import java.text.SimpleDateFormat
+import scala.collection.mutable
 import scala.jdk.CollectionConverters.IteratorHasAsScala
 
 object JSONUtils {
@@ -87,27 +88,38 @@ object JSONUtils {
       flatten: Boolean = false,
       parentName: String = ""
   ): Map[String, String] = {
-    var result = Map[String, String]()
-    if (node.isObject) {
-      for (key <- node.fieldNames().asScala) {
-        val child: JsonNode = node.get(key)
-        val absoluteKey = (if (parentName.nonEmpty) parentName + "." else "") 
+ key
-        if (flatten && (child.isObject || child.isArray)) {
-          result = result ++ JSONToMap(child, flatten, absoluteKey)
-        } else if (child.isValueNode) {
-          result = result + (absoluteKey -> child.asText())
-        } else {
-          // do nothing
+    val result = mutable.Map[String, String]()
+    val stack = mutable.Stack[(JsonNode, String)]((node, parentName))
+    while (stack.nonEmpty) {
+      // Read via _1/_2 rather than `val (a, b) = ...`: tuple destructuring
+      // desugars to a pattern match with an unreachable MatchError branch that
+      // coverage tools report as a permanently uncovered branch.
+      val entry = stack.pop()
+      val current = entry._1
+      val currentParent = entry._2
+      if (current.isObject) {
+        // Iterate entries (key + value) to avoid a second lookup per field.
+        for (entry <- current.fields().asScala) {
+          val key = entry.getKey
+          val child: JsonNode = entry.getValue
+          val absoluteKey = (if (currentParent.nonEmpty) currentParent + "." 
else "") + key
+          if (flatten && (child.isObject || child.isArray)) {
+            stack.push((child, absoluteKey))
+          } else if (child.isValueNode) {
+            result(absoluteKey) = child.asText()
+          } else {
+            // do nothing
+          }
         }
+      } else if (current.isArray) {
+        for ((child, i) <- current.elements().asScala.zipWithIndex) {
+          stack.push((child, currentParent + (i + 1)))
+        }
+      } else if (current.isValueNode && currentParent.nonEmpty) {
+        result(currentParent) = current.asText()
       }
-    } else if (node.isArray) {
-      for ((child, i) <- node.elements().asScala.zipWithIndex) {
-        result = result ++ JSONToMap(child, flatten, parentName + (i + 1))
-      }
-    } else if (node.isValueNode && parentName.nonEmpty) {
-      result = result + (parentName -> node.asText())
     }
-    result
+    result.toMap
   }
 
 }
diff --git 
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
 
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
index 5a6534713e..2212393832 100644
--- 
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
+++ 
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
@@ -20,6 +20,7 @@
 package org.apache.texera.amber.util
 
 import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.{JsonNodeFactory, MissingNode}
 import org.scalatest.flatspec.AnyFlatSpec
 import org.scalatest.matchers.should.Matchers
 
@@ -133,6 +134,59 @@ class JSONUtilsSpec extends AnyFlatSpec with Matchers {
     )
   }
 
+  it should "flatten nested arrays with concatenated 1-based index keys when 
flatten=true" in {
+    // An array element that is itself an array is pushed back onto the 
worklist
+    // and re-processed: the inner indices concatenate onto the outer parent 
with
+    // no separator, so matrix[0][1] becomes "m12".
+    val node = parse("""{"m":[[1,2],[3]]}""")
+    JSONUtils.JSONToMap(node, flatten = true) shouldBe Map(
+      "m11" -> "1",
+      "m12" -> "2",
+      "m21" -> "3"
+    )
+  }
+
+  it should "render JSON null as the literal string \"null\" for nested fields 
when flatten=true" in {
+    val node = parse("""{"outer":{"a":null}}""")
+    JSONUtils.JSONToMap(node, flatten = true) shouldBe Map("outer.a" -> "null")
+  }
+
+  it should "contribute no entries for empty nested objects and arrays when 
flatten=true" in {
+    // An empty object/array is pushed onto the worklist but yields nothing 
once
+    // popped, so only the sibling primitive survives.
+    val node = parse("""{"emptyObj":{},"emptyArr":[],"b":"x"}""")
+    JSONUtils.JSONToMap(node, flatten = true) shouldBe Map("b" -> "x")
+  }
+
+  it should "return an empty map for a top-level empty array" in {
+    JSONUtils.JSONToMap(parse("[]"), flatten = true) shouldBe 
Map.empty[String, String]
+  }
+
+  it should "ignore a node that is neither object, array, nor value node" in {
+    // Defensive branch: a MissingNode is none of object/array/value, so the
+    // traversal pops it and contributes nothing. Guards against a node type
+    // that slips past all three predicates silently corrupting the result.
+    JSONUtils.JSONToMap(MissingNode.getInstance()) shouldBe Map.empty[String, 
String]
+  }
+
+  it should "flatten very deeply nested JSON without overflowing the stack" in 
{
+    // The traversal is iterative, so nesting depth lives on the heap rather 
than
+    // the call stack: a depth that would StackOverflow a per-level recursion 
must
+    // still produce the dotted leaf key. Build the tree programmatically 
rather
+    // than via parse() so Jackson's own parser nesting limit doesn't cap the 
depth
+    // before JSONToMap runs. Shape: {"a":{"a":{...{"leaf":"v"}...}}}.
+    val depth = 20000
+    var current = JsonNodeFactory.instance.objectNode()
+    current.put("leaf", "v")
+    for (_ <- 1 to depth) {
+      val parent = JsonNodeFactory.instance.objectNode()
+      parent.set[JsonNode]("a", current)
+      current = parent
+    }
+    val expectedKey = ("a." * depth) + "leaf"
+    JSONUtils.JSONToMap(current, flatten = true) shouldBe Map(expectedKey -> 
"v")
+  }
+
   // ----- objectMapper configuration -----
 
   "objectMapper" should "exclude null and absent fields from serialized 
output" in {

(texera) branch main updated: refactor(workflow-core): make JSONToMap iterative to avoid stack overflow (#5322)

Reply via email to