(daffodil) branch main updated: Improve performance of xml and json infoset outputters

slawrence Mon, 19 Feb 2024 15:08:26 -0800

This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git



The following commit(s) were added to refs/heads/main by this push:
     new 37fa52e2e Improve performance of xml and json infoset outputters
37fa52e2e is described below

commit 37fa52e2efed11d4fa272a2f61c47e42302cc6ed
Author: Steve Lawrence <[email protected]>
AuthorDate: Thu Feb 15 15:27:15 2024 -0500

    Improve performance of xml and json infoset outputters
    
    The XMLTextInfosetOutputter and JSONInfosetOutputter do not use any
    buffering when writing data. Modifying these to wrap a BufferedWriter
    around the existing OutputStreamWriter gives significant performance
    improvements. Using a BufferedWriter also allows us to use its built-in
    newLine() function for pretty printing.
    
    This also modifies the "Standard" XML escape style in the xml infoset
    outputter so that it first checks if there are any characters that need
    to be escaped, similar to what we do for CDATA escape style. In most
    cases, there will not be any characters that need escaping, so we can
    avoid Scala XML's escape utility, which has noticeable overhead, even if
    nothing needs escaping.
    
    When tested on a large file with lots of strings, these changes reduced
    the total parse + infoset output time from about 125 seconds to about 93
    seconds, roughly a 25% decrease. Note that parsing with the null infoset
    outputter takes about 78 seconds, so the xml infoset outputter overhead
    went from about 47 seconds (37% of the total time) down to about 15
    seconds (16% of the total time).
    
    DAFFODIL-2872
---
 .../runtime1/infoset/JsonInfosetOutputter.scala    | 14 ++++++---
 .../runtime1/infoset/XMLTextInfosetOutputter.scala | 36 ++++++++++++++--------
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git 
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
 
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
index 35a09a4ae..69696e7d1 100644
--- 
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
+++ 
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/JsonInfosetOutputter.scala
@@ -29,12 +29,16 @@ import org.apache.daffodil.runtime1.api.InfosetSimpleElement
 
 import com.fasterxml.jackson.core.io.JsonStringEncoder
 
-class JsonInfosetOutputter private (writer: java.io.Writer, pretty: Boolean)
+class JsonInfosetOutputter private (writer: java.io.BufferedWriter, pretty: 
Boolean)
   extends InfosetOutputter
   with Indentable {
 
   def this(os: java.io.OutputStream, pretty: Boolean) = {
-    this(new java.io.OutputStreamWriter(os, StandardCharsets.UTF_8), pretty)
+    // using a BufferedWriter provides significant performance improvements
+    this(
+      new java.io.BufferedWriter(new java.io.OutputStreamWriter(os, 
StandardCharsets.UTF_8)),
+      pretty,
+    )
   }
 
   // Keeps track of if the next element we see is the first child or not of a
@@ -63,7 +67,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer, 
pretty: Boolean)
     } else {
       writer.write(',')
     }
-    if (pretty) writer.write(System.lineSeparator())
+    if (pretty) writer.newLine()
     if (pretty) outputIndentation(writer)
   }
 
@@ -91,7 +95,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer, 
pretty: Boolean)
   // complex/array/document at the right indentation level
   private def endNodeWithChildren(): Unit = {
     isFirstChildStack.pop()
-    if (pretty) writer.write(System.lineSeparator())
+    if (pretty) writer.newLine()
     decrementIndentation()
     if (pretty) outputIndentation(writer)
   }
@@ -163,7 +167,7 @@ class JsonInfosetOutputter private (writer: java.io.Writer, 
pretty: Boolean)
   override def endDocument(): Unit = {
     endNodeWithChildren()
     writer.write('}')
-    if (pretty) writer.write(System.lineSeparator())
+    if (pretty) writer.newLine()
     writer.flush()
   }
 }
diff --git 
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
 
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
index dd7b9f331..ca130df3a 100644
--- 
a/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
+++ 
b/daffodil-runtime1/src/main/scala/org/apache/daffodil/runtime1/infoset/XMLTextInfosetOutputter.scala
@@ -30,7 +30,7 @@ import org.apache.daffodil.runtime1.api.InfosetSimpleElement
 import org.apache.daffodil.runtime1.dpath.NodeInfo
 
 /**
- * Writes the infoset to a java.io.Writer as XML text.
+ * Writes the infoset to a java.io.BufferedWriter as XML text.
  *
  * @param writer             The writer to write the XML text to
  * @param pretty             Whether or not to enable pretty printing. Set to 
true, XML
@@ -40,7 +40,7 @@ import org.apache.daffodil.runtime1.dpath.NodeInfo
  * @param minimal            Determine whether to exclude xml slug and prefix 
bindings
  */
 class XMLTextInfosetOutputter private (
-  writer: java.io.Writer,
+  writer: java.io.BufferedWriter,
   pretty: Boolean,
   xmlTextEscapeStyle: XMLTextEscapeStyle.Value,
   minimal: Boolean,
@@ -53,8 +53,9 @@ class XMLTextInfosetOutputter private (
     xmlTextEscapeStyle: XMLTextEscapeStyle.Value = XMLTextEscapeStyle.Standard,
     minimal: Boolean = false,
   ) = {
+    // using a BufferedWriter provides significant performance improvements
     this(
-      new java.io.OutputStreamWriter(os, StandardCharsets.UTF_8),
+      new java.io.BufferedWriter(new java.io.OutputStreamWriter(os, 
StandardCharsets.UTF_8)),
       pretty,
       xmlTextEscapeStyle,
       minimal,
@@ -127,7 +128,7 @@ class XMLTextInfosetOutputter private (
     // namespaces if one is defined in the infoset
     incrementIndentation()
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
     writer.write("<")
@@ -135,7 +136,7 @@ class XMLTextInfosetOutputter private (
     writer.write(" xmlns=\"\">")
 
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
     }
 
     // Parse the string as XML and then write all events out to the
@@ -158,7 +159,7 @@ class XMLTextInfosetOutputter private (
 
     // write the closing wrapper element
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
     writer.write("</")
@@ -169,7 +170,7 @@ class XMLTextInfosetOutputter private (
     // if pretty, write indentation so that the closing tag of the simple
     // element is indented as if it were complex
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
   }
@@ -177,7 +178,7 @@ class XMLTextInfosetOutputter private (
   override def startSimple(se: InfosetSimpleElement): Unit = {
     val simple = se.asInstanceOf[DISimple]
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
     outputStartTag(simple)
@@ -192,7 +193,7 @@ class XMLTextInfosetOutputter private (
           val escaped = xmlTextEscapeStyle match {
             case XMLTextEscapeStyle.CDATA => {
               val needsCDataEscape = xmlSafe.exists { c =>
-                scala.xml.Utility.Escapes.escMap.contains(c) || c.isWhitespace
+                c == '<' || c == '>' || c == '"' || c == '&' || c.isWhitespace
               }
               if (needsCDataEscape) {
                 "<![CDATA[%s]]>".format(xmlSafe.replaceAll("]]>", 
"]]]]><![CDATA[>"))
@@ -200,7 +201,16 @@ class XMLTextInfosetOutputter private (
                 xmlSafe
               }
             }
-            case XMLTextEscapeStyle.Standard => 
scala.xml.Utility.escape(xmlSafe)
+            case XMLTextEscapeStyle.Standard => {
+              val needsStandardEscape = xmlSafe.exists { c =>
+                c == '<' || c == '>' || c == '"' || c == '&'
+              }
+              if (needsStandardEscape) {
+                scala.xml.Utility.escape(xmlSafe)
+              } else {
+                xmlSafe
+              }
+            }
           }
           writer.write(escaped)
         }
@@ -220,7 +230,7 @@ class XMLTextInfosetOutputter private (
   override def startComplex(ce: InfosetComplexElement): Unit = {
     val complex = ce.asInstanceOf[DIComplex]
     if (pretty) {
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
     outputStartTag(complex)
@@ -233,7 +243,7 @@ class XMLTextInfosetOutputter private (
     decrementIndentation()
     if (pretty && inScopeComplexElementHasChildren) {
       // only output newline and indentation for non-empty complex types
-      writer.write(System.lineSeparator())
+      writer.newLine()
       outputIndentation(writer)
     }
     outputEndTag(complex)
@@ -255,7 +265,7 @@ class XMLTextInfosetOutputter private (
   }
 
   override def endDocument(): Unit = {
-    writer.write(System.lineSeparator())
+    writer.newLine()
     writer.flush()
   }
 }

(daffodil) branch main updated: Improve performance of xml and json infoset outputters

Reply via email to