This is an automated email from the ASF dual-hosted git repository.
github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new 2f9573be88 refactor(workflow-core): make JSONToMap iterative to avoid
stack overflow (#5322)
2f9573be88 is described below
commit 2f9573be88e7f73e63e7f2f441187a5977fdb0a3
Author: Matthew B. <[email protected]>
AuthorDate: Sun Jun 14 14:15:21 2026 -0700
refactor(workflow-core): make JSONToMap iterative to avoid stack overflow
(#5322)
### What changes were proposed in this PR?
- Rewrote `JSONUtils.JSONToMap` from per-level recursion into an
iterative traversal over a `(JsonNode, parentName)` worklist (a
`mutable.Stack`), so nesting depth lives on the heap instead of the JVM
call stack.
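- Per-node logic is unchanged: objects emit value entries and (in
flatten mode) push their object/array children, arrays push each element
keyed by 1-based index, and value nodes with a parent emit their text.
Output keys and values are identical to before, except when two distinct
paths flatten to the same key (e.g. `{"a":["y"],"a1":"x"}`, where both
paths yield `a1`): nested children are now visited after their parent's
own value entries, so a different value can win such a collision.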
- Added a `JSONUtilsSpec` case that flattens a 20,000-deep object (built
programmatically to bypass Jackson's own parser nesting cap), which
would previously have thrown `StackOverflowError`.
### Any related issues, documentation, or discussions?
Closes: #5321
### How was this PR tested?
- Run `sbt "WorkflowCore/testOnly *JSONUtilsSpec"`; all cases pass,
including the new 20,000-deep flatten case, which fails with
`StackOverflowError` when run against `main` without this change.
### Was this PR authored or co-authored using generative AI tooling?
Co-authored with Claude Opus 4.7 in compliance with the ASF Generative
Tooling Guidance.
---------
Signed-off-by: Matthew B. <[email protected]>
Co-authored-by: Kunwoo (Chris) <[email protected]>
---
.../texera/amber/core/tuple/TupleUtils.scala | 8 ++--
.../org/apache/texera/amber/util/JSONUtils.scala | 48 +++++++++++--------
.../apache/texera/amber/util/JSONUtilsSpec.scala | 54 ++++++++++++++++++++++
3 files changed, 88 insertions(+), 22 deletions(-)
diff --git
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
index 6938dcc6ca..10f34574db 100644
---
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
+++
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/tuple/TupleUtils.scala
@@ -44,11 +44,12 @@ object TupleUtils {
val allFields: ArrayBuffer[Map[String, String]] = ArrayBuffer()
+ // Parse and flatten once; reused for schema inference and value
extraction.
val root: JsonNode = objectMapper.readTree(json)
+ val data: Map[String, String] = JSONToMap(root)
if (root.isObject) {
- val fields: Map[String, String] = JSONToMap(root)
- fieldNames = fieldNames.++(fields.keySet)
- allFields += fields
+ fieldNames = fieldNames.++(data.keySet)
+ allFields += data
}
val sortedFieldNames = fieldNames.toList
@@ -73,7 +74,6 @@ object TupleUtils {
try {
val fields = scala.collection.mutable.ArrayBuffer.empty[Any]
- val data = JSONToMap(objectMapper.readTree(json))
for (fieldName <- schema.getAttributeNames) {
if (data.contains(fieldName)) {
diff --git
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
index bcc7291489..7f862b4fbc 100644
---
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
+++
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/JSONUtils.scala
@@ -28,6 +28,7 @@ import org.apache.texera.amber.core.workflow.PortIdentity
import org.apache.texera.amber.util.serde.{PortIdentityKeyDeserializer,
PortIdentityKeySerializer}
import java.text.SimpleDateFormat
+import scala.collection.mutable
import scala.jdk.CollectionConverters.IteratorHasAsScala
object JSONUtils {
@@ -87,27 +88,38 @@ object JSONUtils {
flatten: Boolean = false,
parentName: String = ""
): Map[String, String] = {
- var result = Map[String, String]()
- if (node.isObject) {
- for (key <- node.fieldNames().asScala) {
- val child: JsonNode = node.get(key)
- val absoluteKey = (if (parentName.nonEmpty) parentName + "." else "")
+ key
- if (flatten && (child.isObject || child.isArray)) {
- result = result ++ JSONToMap(child, flatten, absoluteKey)
- } else if (child.isValueNode) {
- result = result + (absoluteKey -> child.asText())
- } else {
- // do nothing
+ val result = mutable.Map[String, String]()
+ val stack = mutable.Stack[(JsonNode, String)]((node, parentName))
+ while (stack.nonEmpty) {
+ // Read via _1/_2 rather than `val (a, b) = ...`: tuple destructuring
+ // desugars to a pattern match with an unreachable MatchError branch that
+ // coverage tools report as a permanently uncovered branch.
+ val entry = stack.pop()
+ val current = entry._1
+ val currentParent = entry._2
+ if (current.isObject) {
+ // Iterate entries (key + value) to avoid a second lookup per field.
+ for (entry <- current.fields().asScala) {
+ val key = entry.getKey
+ val child: JsonNode = entry.getValue
+ val absoluteKey = (if (currentParent.nonEmpty) currentParent + "."
else "") + key
+ if (flatten && (child.isObject || child.isArray)) {
+ stack.push((child, absoluteKey))
+ } else if (child.isValueNode) {
+ result(absoluteKey) = child.asText()
+ } else {
+ // do nothing
+ }
}
+ } else if (current.isArray) {
+ for ((child, i) <- current.elements().asScala.zipWithIndex) {
+ stack.push((child, currentParent + (i + 1)))
+ }
+ } else if (current.isValueNode && currentParent.nonEmpty) {
+ result(currentParent) = current.asText()
}
- } else if (node.isArray) {
- for ((child, i) <- node.elements().asScala.zipWithIndex) {
- result = result ++ JSONToMap(child, flatten, parentName + (i + 1))
- }
- } else if (node.isValueNode && parentName.nonEmpty) {
- result = result + (parentName -> node.asText())
}
- result
+ result.toMap
}
}
diff --git
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
index 5a6534713e..2212393832 100644
---
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
+++
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/JSONUtilsSpec.scala
@@ -20,6 +20,7 @@
package org.apache.texera.amber.util
import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.{JsonNodeFactory, MissingNode}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers
@@ -133,6 +134,59 @@ class JSONUtilsSpec extends AnyFlatSpec with Matchers {
)
}
+ it should "flatten nested arrays with concatenated 1-based index keys when
flatten=true" in {
+ // An array element that is itself an array is pushed back onto the
worklist
+ // and re-processed: the inner indices concatenate onto the outer parent
with
+ // no separator, so m[0][1] becomes "m12".
+ val node = parse("""{"m":[[1,2],[3]]}""")
+ JSONUtils.JSONToMap(node, flatten = true) shouldBe Map(
+ "m11" -> "1",
+ "m12" -> "2",
+ "m21" -> "3"
+ )
+ }
+
+ it should "render JSON null as the literal string \"null\" for nested fields
when flatten=true" in {
+ val node = parse("""{"outer":{"a":null}}""")
+ JSONUtils.JSONToMap(node, flatten = true) shouldBe Map("outer.a" -> "null")
+ }
+
+ it should "contribute no entries for empty nested objects and arrays when
flatten=true" in {
+ // An empty object/array is pushed onto the worklist but yields nothing
once
+ // popped, so only the sibling primitive survives.
+ val node = parse("""{"emptyObj":{},"emptyArr":[],"b":"x"}""")
+ JSONUtils.JSONToMap(node, flatten = true) shouldBe Map("b" -> "x")
+ }
+
+ it should "return an empty map for a top-level empty array" in {
+ JSONUtils.JSONToMap(parse("[]"), flatten = true) shouldBe
Map.empty[String, String]
+ }
+
+ it should "ignore a node that is neither object, array, nor value node" in {
+ // Defensive branch: a MissingNode is none of object/array/value, so the
+ // traversal pops it and contributes nothing. Guards against a node type
+ // that slips past all three predicates silently corrupting the result.
+ JSONUtils.JSONToMap(MissingNode.getInstance()) shouldBe Map.empty[String,
String]
+ }
+
+ it should "flatten very deeply nested JSON without overflowing the stack" in
{
+ // The traversal is iterative, so nesting depth lives on the heap rather
than
+ // the call stack: a depth that would StackOverflow a per-level recursion
must
+ // still produce the dotted leaf key. Build the tree programmatically
rather
+ // than via parse() so Jackson's own parser nesting limit doesn't cap the
depth
+ // before JSONToMap runs. Shape: {"a":{"a":{...{"leaf":"v"}...}}}.
+ val depth = 20000
+ var current = JsonNodeFactory.instance.objectNode()
+ current.put("leaf", "v")
+ for (_ <- 1 to depth) {
+ val parent = JsonNodeFactory.instance.objectNode()
+ parent.set[JsonNode]("a", current)
+ current = parent
+ }
+ val expectedKey = ("a." * depth) + "leaf"
+ JSONUtils.JSONToMap(current, flatten = true) shouldBe Map(expectedKey ->
"v")
+ }
+
// ----- objectMapper configuration -----
"objectMapper" should "exclude null and absent fields from serialized
output" in {