[jira] [Updated] (XALANJ-2607) Improve performance for large documents using ID attributes

Matthias Urban (JIRA) Thu, 09 Mar 2017 14:08:59 -0800

     [ 
https://issues.apache.org/jira/browse/XALANJ-2607?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Matthias Urban updated XALANJ-2607:
-----------------------------------
    Description: 
XalanJ gets very slow for large XML documents using ID attributes often used in 
article lists. If, for instance, an article list with 1.000.000 entries is 
parsed, then it takes 6 minutes (on my machine) just to build the DTM. This is 
due to a design decision in DTMStringPool utilizing a fixed size hash table of 
101 entries. This works astoundingly well for documents with less than 100.000 
different attribute values (at least on my machine). Then it starts to get 
slower and slower. 

A minimally invasive solution is to make the hash table size configurable 
leaving the current size the default. If an application is expected to work 
with large documents, then it can increase the hash table size. For all others 
nothing changes. See the patch attached to this issue. It was created using the 
current trunk version of DTMStringPool.java.

Here is an example for testing.

{code:title=Test.java}
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.util.Date;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

public class Test {
  public static void main(String[] args) {
    try {
      File xmlfile = new File("doc.xml");
      FileWriter fout = new FileWriter(xmlfile);
      fout.write("<catalogue>\n");
      for (int i = 0; i < 1000000; i++) {
        fout.append("<article id=\"" + i + "\">articlename</article>\n");
      }
      fout.write("</catalogue>");
      fout.close();

      // reduce the time needed to 3 seconds! otherwise it will take 6 minutes!
      System.setProperty("org.apache.xml.dtm.ref.DTMStringPool.hashPoolSize", 
"100000");

      System.out.println("Start : " + new Date());
      TransformerFactory factory = TransformerFactory.newInstance();
      Transformer transformer = factory.newTransformer(new StreamSource(new 
File("script.xsl")));
      transformer.transform(new StreamSource(xmlfile), new StreamResult(new 
FileOutputStream("out.txt")));
      System.out.println("End   : " + new Date());
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }
}
{code}

{code:xml|title=script.xsl}
<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="text"/>
  <xsl:template match="text()|@*"/>
  <xsl:template match="/">
    <xsl:apply-templates select="*"/>
  </xsl:template>
  <xsl:template match="article">
    <xsl:value-of select="@id"/>
    <xsl:text>&#10;</xsl:text>
  </xsl:template>
</xsl:stylesheet>
{code}

  was:
XalanJ gets very slow for large XML documents using ID attributes often used in 
article lists. If, for instance, an article list with 1.000.000 entries is 
parsed, then it takes 6 minutes (on my machine) just to build the DTM. This is 
due to a design decision in DTMStringPool utilizing a fixed size hash table of 
101 entries. This works astoundingly well for documents with less than 100.000 
different attribute values (at least on my machine). Then it starts to get 
slower and slower. 

A minimally invasive solution is to make the hash table size configurable 
leaving the current size the default. If an application is expected to work 
with large documents, then it can increase the hash table size. For all others 
nothing changes. See the patch attached to this issue. It was created using the 
current trunk version of DTMStringPool.java.

Here is an example for testing.

{code:title=Test.java}
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.util.Date;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

public class Test {
        public static void main(String[] args) {
                try {
                        File xmlfile = new File("doc.xml");
                        FileWriter fout = new FileWriter(xmlfile);
                        fout.write("<catalogue>\n");
                        for (int i = 0; i < 1000000; i++) {
                                fout.append("<article id=\"" + i + 
"\">articlename</article>\n");
                        }
                        fout.write("</catalogue>");
                        fout.close();

                        // reduce the time needed to 3 seconds! otherwise it will take 6 minutes!
                        
System.setProperty("org.apache.xml.dtm.ref.DTMStringPool.hashPoolSize", 
"100000");

                        System.out.println("Start : " + new Date());
                        TransformerFactory factory = 
TransformerFactory.newInstance();
                        Transformer transformer = factory.newTransformer(new 
StreamSource(new File("script.xsl")));
                        transformer.transform(new StreamSource(xmlfile), new 
StreamResult(new FileOutputStream("out.txt")));
                        System.out.println("End   : " + new Date());
                }
                catch (Exception e) {
                        e.printStackTrace();
                }
        }
}
{code}

{code:xml|title=script.xsl}
<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
        <xsl:output method="text"/>
        <xsl:template match="text()|@*"/>
        <xsl:template match="/">
                <xsl:apply-templates select="*"/>
        </xsl:template>
        <xsl:template match="article">
                <xsl:value-of select="@id"/>
                <xsl:text>&#10;</xsl:text>
        </xsl:template>
</xsl:stylesheet>
{code}


> Improve performance for large documents using ID attributes
> -----------------------------------------------------------
>
>                 Key: XALANJ-2607
>                 URL: https://issues.apache.org/jira/browse/XALANJ-2607
>             Project: XalanJ2
>          Issue Type: Improvement
>      Security Level: No security risk; visible to anyone(Ordinary problems in 
> Xalan projects.  Anybody can view the issue.) 
>          Components: DTM
>    Affects Versions: 2.7.2
>         Environment: Tested on but not limited to: Windows, x64, JRE 1.8
>            Reporter: Matthias Urban
>            Assignee: Steven J. Hathaway
>         Attachments: DTMStringPool.patch
>
>
> XalanJ gets very slow for large XML documents using ID attributes often used 
> in article lists. If, for instance, an article list with 1.000.000 entries is 
> parsed, then it takes 6 minutes (on my machine) just to build the DTM. This 
> is due to a design decision in DTMStringPool utilizing a fixed size hash 
> table of 101 entries. This works astoundingly well for documents with less 
> than 100.000 different attribute values (at least on my machine). Then it 
> starts to get slower and slower. 
> A minimally invasive solution is to make the hash table size configurable 
> leaving the current size the default. If an application is expected to work 
> with large documents, then it can increase the hash table size. For all 
> others nothing changes. See the patch attached to this issue. It was created 
> using the current trunk version of DTMStringPool.java.
> Here is an example for testing.
> {code:title=Test.java}
> import java.io.File;
> import java.io.FileOutputStream;
> import java.io.FileWriter;
> import java.util.Date;
> import javax.xml.transform.Transformer;
> import javax.xml.transform.TransformerFactory;
> import javax.xml.transform.stream.StreamResult;
> import javax.xml.transform.stream.StreamSource;
> public class Test {
>   public static void main(String[] args) {
>     try {
>       File xmlfile = new File("doc.xml");
>       FileWriter fout = new FileWriter(xmlfile);
>       fout.write("<catalogue>\n");
>       for (int i = 0; i < 1000000; i++) {
>         fout.append("<article id=\"" + i + "\">articlename</article>\n");
>       }
>       fout.write("</catalogue>");
>       fout.close();
>       // reduce the time needed to 3 seconds! otherwise it will take 6 minutes!
>       System.setProperty("org.apache.xml.dtm.ref.DTMStringPool.hashPoolSize", 
> "100000");
>       System.out.println("Start : " + new Date());
>       TransformerFactory factory = TransformerFactory.newInstance();
>       Transformer transformer = factory.newTransformer(new StreamSource(new 
> File("script.xsl")));
>       transformer.transform(new StreamSource(xmlfile), new StreamResult(new 
> FileOutputStream("out.txt")));
>       System.out.println("End   : " + new Date());
>     }
>     catch (Exception e) {
>       e.printStackTrace();
>     }
>   }
> }
> {code}
> {code:xml|title=script.xsl}
> <?xml version="1.0"?>
> <xsl:stylesheet version="1.0" 
> xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
>   <xsl:output method="text"/>
>   <xsl:template match="text()|@*"/>
>   <xsl:template match="/">
>     <xsl:apply-templates select="*"/>
>   </xsl:template>
>   <xsl:template match="article">
>     <xsl:value-of select="@id"/>
>     <xsl:text>&#10;</xsl:text>
>   </xsl:template>
> </xsl:stylesheet>
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[jira] [Updated] (XALANJ-2607) Improve performance for large documents using ID attributes

Reply via email to