mmidy 00/08/17 14:32:54
Modified: src/org/apache/xalan/xpath/dtm DTM.java
Log:
Checking this in for Joe Kesselman: Fix for EntityRefs.
Revision Changes Path
1.24 +176 -15 xml-xalan/src/org/apache/xalan/xpath/dtm/DTM.java
Index: DTM.java
===================================================================
RCS file: /home/cvs/xml-xalan/src/org/apache/xalan/xpath/dtm/DTM.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- DTM.java 2000/08/04 02:12:26 1.23
+++ DTM.java 2000/08/17 21:32:53 1.24
@@ -115,7 +115,7 @@
// DTM state information
// org.xml.sax.Parser parser;
// org.apache.xalan.xpath.dtm.HookedXMLParser ibmparser;
- // org.apache.xerces.utils.StringPool fStringPool;
+ // org.apache.xerces.utils.StringPool fStringPool;
ChunkedIntArray nodes = new ChunkedIntArray(4);
/**
@@ -141,6 +141,7 @@
// MANEFEST CONSTANTS
// Status bits, ORed with node type (assumed to be <256, should be safe)
+ final int TEXT_DTM_POOL = 1 << 8; // Locally cached, eg concatenation
final int TEXT_IGNORABLE = 2 << 8;
final int TEXT_CDATA = 4 << 8;
@@ -405,6 +406,7 @@
throws org.xml.sax.SAXException
{
if(DISABLE)return;
+ appendAccumulatedText();
done=true;
@@ -443,6 +445,8 @@
int attrListIndex)
{
if(DISABLE)return;
+ appendAccumulatedText();
+
// Need to retrive the attrList...
String attrname, attrvalue;
@@ -609,6 +613,7 @@
public final void endElement(QName name)
{
if(DISABLE)return;
+ appendAccumulatedText();
int thisElement = currentParent;
// If last node appended before we pop has a next-sib reference,
@@ -776,25 +781,135 @@
general_characters(dataIndex);
}
+ // Vector handles objects. Too much overhead. I could use ChunkedIntArray
+ // (and did, in an early draft), but since we aren't trying to handle SAX
+ // right now there's no need for the additional columns. So we'll use a
+ // simple grow-it-myself array.
+ int charChunks[]=new int[100];
+ int charChunkStart=0,charChunkCount=0;
+
/** Text-accumulator operation for the integer-index version of
* characters(). Obviously far simpler, since we are assured that
* (unlike the parse buffers) the XML4J symbol table will persist.
* @param index int Index of this string in XML4J's symbol tables.
+ *<p>
+ * Note: Even though we are using XML4J's internal events rather than SAX,
+ * we <strong>must</strong> be prepared to normalize successive blocks
+ * of characters():
+ * <ul>
+ * <li>when text runs over the end of a parse buffer (may not arise in
+ * this parser),</li>
+ * <li>when text and CDATA sections are intermixed (with intervening
+ * start/end CDATA events),</li>
+ * <li>and when text and entity references are intermixed (with intervening
+ * start/end Entity Reference events).</li>
+ * </ul>
+ * The simplest way to handle this is to record the data, but defer
+ * creating the Text node until we get an event indicating that no further
+ * text will arrive. This logic was present in early versions of DTM,
+ * but was lost during an overaggressive optimization; we're restoring it now.
+ *<p>
+ * Note: Yes, the charChunks array grows monotonically during parsing,
+ * and does not shrink back down when the chunks are concatenated later
+ * in processing. Tough. I'm assuming that this is cheaper than allocating
+ * a separate array for every multichunk string, despite the block-copying
+ * that occurs when the array is grown.
*<p>
- * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG.
+ * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG. Since XSLT doesn't
+ * care about that flag, this is not a problem for our target
+ * application. It may be an issue if you try to reuse DTM elsewhere.
+ *
+ * @see #appendAccumulatedText()
*/
public final void general_characters(int index)
{
- // Add this element to the document
- int w0 = Node.TEXT_NODE;
- // W1: Parent
- int w1 = currentParent;
- // W2: Start position within buffer (SAX), or text index (XML4J)
- int w2 = index;
- // W3: Length of this text (SAX), or 0 (XML4J)
- int w3 = gotslot[2];
- int ourslot = appendNode(w0, w1, w2, w3);
- previousSibling = ourslot;
+ // Grow the array, if out of space. (Doubling may be excessive, but the
+ // goal is to trade off minimum memory use versus minimum recopying.)
+ if(charChunkCount==charChunks.length)
+ {
+ int[] newCharChunks=new int[2*charChunks.length];
+ System.arraycopy(charChunks,0,newCharChunks,0,charChunks.length);
+ charChunks=newCharChunks;
+ }
+ // Append to the array
+ charChunks[charChunkCount++]=index;
+ }
+
+ /** appendAccumulatedText completes the work started by
+ * general_characters(). It takes all the blocks of text which have
+ * arrived, and generates a single Text node containing their
+ * concatenated value. This routine _MUST_ be called as the first step
+ * in processing any other event.
+ *<p>
+ * There are a few reasonable ways of handling this.
+ * <ul>
+ * <li> One is to hold onto the individual text chunks -- which are
+ * already in a string pool inside the parser, since we're being
+ * driven through XMLDocumentHandler -- and concatenate them on
+ * demand when the user asks for this node's value; this minimizes
+ * model-building time, especially if the user never asks for the
+ * value of this node.</li>
+ *<li>The other is to generate a concatenated string in a local
+ * pool; this avoids re-concatenating the string if it should be
+ * accessed more than once.</li>
+ * <li>Or we could use the first solution, but convert it to the second
+ * the first time the text node is accessed. This is probably the best
+ * of both worlds... and we can get away with it because DTM is
+ * explicitly single-threaded after parsing, so there will be no
+ * contention for the node during its conversion.</li>
+ * </ul>
+ * <p>
+ * Early versions of DTM chose the first answer. I'm going to try the third
+ * this time.
+ *<p>
+ * Length of 0 indicates the simple case, referenced directly from
+ * the parser's pool.
+ *<p>
+
+ * @see #general_characters(int) */
+ void appendAccumulatedText()
+ {
+ if(charChunkCount==charChunkStart)
+ return; // No new text: no chunks recorded above the current baseline.
+ else if(charChunkCount==charChunkStart+1)
+ {
+ // Single chunk. We can use the efficient inline version of Text:
+ // W2 holds the chunk's own string index and W3 is 0.
+ int w0 = Node.TEXT_NODE;
+ // W1: Parent
+ int w1 = currentParent;
+ // W2: Start position within charChunks (multiple),
+ // or text index (inline), or local text index (multiple converted)
+ int w2 = charChunks[charChunkStart];
+ // W3: End of chunk run (exclusive) == start of next sequence, or 0 for inline
+ int w3 = 0;
+ int ourslot = appendNode(w0, w1, w2, w3);
+ previousSibling = ourslot;
+
+ // This chunk has been completely processed, so reuse its chunk slot
+ // (They're cheap, but why waste them?)
+ --charChunkCount;
+ }
+ else
+ {
+ // Here's our problem child. We need to record that the Text node's
+ // value is represented by a sequence of entries in charChunks.
+ int w0 = Node.TEXT_NODE;
+ // W1: Parent
+ int w1 = currentParent;
+ // W2: Start position within charChunks (multiple),
+ // or text index (inline), or local text index (multiple converted)
+ int w2 = charChunkStart;
+ // W3: End of chunk run (exclusive) == start of next sequence, or 0 for inline
+ int w3 = charChunkCount;
+ int ourslot = appendNode(w0, w1, w2, w3);
+ previousSibling = ourslot;
+
+ // This time, we need to remember that these charChunks can _NOT_
+ // be reused -- leave the high-water mark alone, and instead move
+ // the baseline up.
+ charChunkStart=charChunkCount;
+ }
+ }
/**
@@ -805,6 +920,7 @@
public final void comment(int dataIndex)
{
if(DISABLE)return;
+ appendAccumulatedText();
// Short Form, XML4J mode
int w0, w1, w2, w3;
@@ -832,7 +948,8 @@
public final void processingInstruction(int target, int data)
{
if(DISABLE)return;
-
+ appendAccumulatedText();
+
// W0 Low: Node Type.
int w0 = org.w3c.dom.Node.PROCESSING_INSTRUCTION_NODE;
// W1: Parent
@@ -1944,6 +2061,9 @@
return intToString(w0>>16);
}
+ // Cache conversions of multi-charChunk text nodes
+ Vector localStringPool=new Vector();
+
/**
* DTM read API: Given a node index, return its node value. This is mostly
* as defined by the DOM, but may ignore some conveniences.
@@ -1962,7 +2082,48 @@
{
case Node.TEXT_NODE:
case Node.CDATA_SECTION_NODE: // We handle as flagged Text...
- value=intToString(gotslot[2]);
+ if((gotslot[0] & TEXT_DTM_POOL) != 0)
+ {
+ // Value of this node lives in DTM's pool, not in the parser's
+ value=(String)(localStringPool.elementAt(gotslot[2]));
+ }
+ else if(gotslot[3]>0) // (actually >1, but 1 never occurs)
+ {
+ // This was a multi-charChunk node. Its value is the concatenation
+ // of those chunks. For efficient future access, we will now convert
+ // this into a TEXT_DTM_POOL node
+
+ // First, concatenate the chunks to obtain the value
+ int chunk=gotslot[2],stop=gotslot[3];
+ StringBuffer sb=new StringBuffer(intToString(charChunks[chunk++]));
+ while(chunk<stop)
+ sb.append(intToString(charChunks[chunk++]));
+ value=sb.toString();
+
+ // Add the normalized string to our local pool.
+ // ****** Is it worth suppressing duplicates?
+ // int localStringNumber=localStringPool.indexOf(value);
+ // if(-1 == localStringNumber) // Not found
+ // {
+ localStringPool.addElement(value);
+ int localStringNumber=localStringPool.size();
+ // }
+
+ // Now back-patch the node. We can get away with not protecting
+ // this since we assert that DTM's read access is single-threaded,
+ // and hence nobody else is accessing this node right now.
+ // (If you don't believe that, synchronize this and the preceding
+ // case on localStringPool.)
+ gotslot[0] |= TEXT_DTM_POOL;
+ gotslot[2] = localStringNumber-1;
+ // ***** Would be nice right here to have an array-to-array write...
+ nodes.writeSlot(position,gotslot[0],gotslot[1],gotslot[2],gotslot[3]);
+ }
+ else
+ {
+ // Single charChunk. Read the value direct from the parser's pool.
+ value=intToString(gotslot[2]);
+ }
break;
case Node.PROCESSING_INSTRUCTION_NODE:
case Node.COMMENT_NODE:
@@ -2184,4 +2345,4 @@
}
-}
\ No newline at end of file
+}