This is an automated email from the ASF dual-hosted git repository.
slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new 37fa52e2e Improve performance of xml and json infoset outputters
37fa52e2e is described below
commit 37fa52e2efed11d4fa272a2f61c47e42302cc6ed
Author: Steve Lawrence <[email protected]>
AuthorDate: Thu Feb 15 15:27:15 2024 -0500
Improve performance of xml and json infoset outputters
The XMLTextInfosetOutputter and JsonInfosetOutputter do not use any
buffering when writing data. Modifying these to wrap a BufferedWriter
around the existing OutputStreamWriter gives significant performance
improvements. Using a BufferedWriter also allows us to use its built-in
newLine() function for pretty printing.
This also modifies the "Standard" XML escape style in the xml infoset
outputter so that it first checks if there are any characters that need
to be escaped, similar to what we do for CDATA escape style. In most
cases, there will not be any characters that need escaping, so we can
avoid Scala XML's escape utility, which has noticeable overhead, even if
nothing needs escaping.
When tested on a large file with lots of strings, these changes reduced
total parse + infoset output time from about 125 seconds to 93 seconds,
about a 25% decrease. Note that parsing with the null infoset outputter
takes about 78 seconds, so the xml infoset outputter overhead went from
about 47 seconds (roughly 37% of the total time) down to about 15
seconds (roughly 16% of the total time).
DAFFODIL-2872
---
.../runtime1/infoset/JsonInfosetOutputter.scala | 14 ++++++---
.../runtime1/infoset/XMLTextInfosetOutputter.scala | 36 ++++++++++++++--------
2 files changed, 32 insertions(+), 18 deletions(-)
diff --git
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
index 35a09a4ae..69696e7d1 100644
---
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
+++
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
@@ -29,12 +29,16 @@ import org.apache.daffodil.runtime1.api.InfosetSimpleElement
import com.fasterxml.jackson.core.io.JsonStringEncoder
-class JsonInfosetOutputter private (writer: java.io.Writer, pretty: Boolean)
+class JsonInfosetOutputter private (writer: java.io.BufferedWriter, pretty:
Boolean)
extends InfosetOutputter
with Indentable {
def this(os: java.io.OutputStream, pretty: Boolean) = {
- this(new java.io.OutputStreamWriter(os, StandardCharsets.UTF_8), pretty)
+ // using a BufferedWriter provides significant performance improvements
+ this(
+ new java.io.BufferedWriter(new java.io.OutputStreamWriter(os,
StandardCharsets.UTF_8)),
+ pretty,
+ )
}
// Keeps track of if the next element we see is the first child or not of a
@@ -63,7 +67,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer,
pretty: Boolean)
} else {
writer.write(',')
}
- if (pretty) writer.write(System.lineSeparator())
+ if (pretty) writer.newLine()
if (pretty) outputIndentation(writer)
}
@@ -91,7 +95,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer,
pretty: Boolean)
// complex/array/document at the right indentation level
private def endNodeWithChildren(): Unit = {
isFirstChildStack.pop()
- if (pretty) writer.write(System.lineSeparator())
+ if (pretty) writer.newLine()
decrementIndentation()
if (pretty) outputIndentation(writer)
}
@@ -163,7 +167,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer,
pretty: Boolean)
override def endDocument(): Unit = {
endNodeWithChildren()
writer.write('}')
- if (pretty) writer.write(System.lineSeparator())
+ if (pretty) writer.newLine()
writer.flush()
}
}
diff --git
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
index dd7b9f331..ca130df3a 100644
---
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
+++
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
@@ -30,7 +30,7 @@ import org.apache.daffodil.runtime1.api.InfosetSimpleElement
import org.apache.daffodil.runtime1.dpath.NodeInfo
/**
- * Writes the infoset to a java.io.Writer as XML text.
+ * Writes the infoset to a java.io.BufferedWriter as XML text.
*
* @param writer The writer to write the XML text to
* @param pretty Whether or to enable pretty printing. Set to
true, XML
@@ -40,7 +40,7 @@ import org.apache.daffodil.runtime1.dpath.NodeInfo
* @param minimal Determine whether to exclude xml slug and prefix
bindings
*/
class XMLTextInfosetOutputter private (
- writer: java.io.Writer,
+ writer: java.io.BufferedWriter,
pretty: Boolean,
xmlTextEscapeStyle: XMLTextEscapeStyle.Value,
minimal: Boolean,
@@ -53,8 +53,9 @@ class XMLTextInfosetOutputter private (
xmlTextEscapeStyle: XMLTextEscapeStyle.Value = XMLTextEscapeStyle.Standard,
minimal: Boolean = false,
) = {
+ // using a BufferedWriter provides significant performance improvements
this(
- new java.io.OutputStreamWriter(os, StandardCharsets.UTF_8),
+ new java.io.BufferedWriter(new java.io.OutputStreamWriter(os,
StandardCharsets.UTF_8)),
pretty,
xmlTextEscapeStyle,
minimal,
@@ -127,7 +128,7 @@ class XMLTextInfosetOutputter private (
// namespaces if one is defined in the infoset
incrementIndentation()
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
writer.write("<")
@@ -135,7 +136,7 @@ class XMLTextInfosetOutputter private (
writer.write(" xmlns=\"\">")
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
}
// Parse the string as XML and then write all events out to the
@@ -158,7 +159,7 @@ class XMLTextInfosetOutputter private (
// write the closing wrapper element
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
writer.write("</")
@@ -169,7 +170,7 @@ class XMLTextInfosetOutputter private (
// if pretty, write indentation so that the closing tag of the simple
// element is indented as if it were complex
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
}
@@ -177,7 +178,7 @@ class XMLTextInfosetOutputter private (
override def startSimple(se: InfosetSimpleElement): Unit = {
val simple = se.asInstanceOf[DISimple]
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
outputStartTag(simple)
@@ -192,7 +193,7 @@ class XMLTextInfosetOutputter private (
val escaped = xmlTextEscapeStyle match {
case XMLTextEscapeStyle.CDATA => {
val needsCDataEscape = xmlSafe.exists { c =>
- scala.xml.Utility.Escapes.escMap.contains(c) || c.isWhitespace
+ c == '<' || c == '>' || c == '"' || c == '&' || c.isWhitespace
}
if (needsCDataEscape) {
"<![CDATA[%s]]>".format(xmlSafe.replaceAll("]]>",
"]]]]><![CDATA[>"))
@@ -200,7 +201,16 @@ class XMLTextInfosetOutputter private (
xmlSafe
}
}
- case XMLTextEscapeStyle.Standard =>
scala.xml.Utility.escape(xmlSafe)
+ case XMLTextEscapeStyle.Standard => {
+ val needsStandardEscape = xmlSafe.exists { c =>
+ c == '<' || c == '>' || c == '"' || c == '&'
+ }
+ if (needsStandardEscape) {
+ scala.xml.Utility.escape(xmlSafe)
+ } else {
+ xmlSafe
+ }
+ }
}
writer.write(escaped)
}
@@ -220,7 +230,7 @@ class XMLTextInfosetOutputter private (
override def startComplex(ce: InfosetComplexElement): Unit = {
val complex = ce.asInstanceOf[DIComplex]
if (pretty) {
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
outputStartTag(complex)
@@ -233,7 +243,7 @@ class XMLTextInfosetOutputter private (
decrementIndentation()
if (pretty && inScopeComplexElementHasChildren) {
// only output newline and indentation for non-empty complex types
- writer.write(System.lineSeparator())
+ writer.newLine()
outputIndentation(writer)
}
outputEndTag(complex)
@@ -255,7 +265,7 @@ class XMLTextInfosetOutputter private (
}
override def endDocument(): Unit = {
- writer.write(System.lineSeparator())
+ writer.newLine()
writer.flush()
}
}