dtm DTM.java

mmidy 17 Aug 2000 21:32:54 -0000

mmidy       00/08/17 14:32:54


  Modified:    src/org/apache/xalan/xpath/dtm DTM.java
  Log:
  Checking this in for Joe Kesselman: Fix for EntityRefs.
  
  Revision  Changes    Path
  1.24      +176 -15   xml-xalan/src/org/apache/xalan/xpath/dtm/DTM.java
  
  Index: DTM.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/src/org/apache/xalan/xpath/dtm/DTM.java,v
  retrieving revision 1.23
  retrieving revision 1.24
  diff -u -r1.23 -r1.24
  --- DTM.java  2000/08/04 02:12:26     1.23
  +++ DTM.java  2000/08/17 21:32:53     1.24
  @@ -115,7 +115,7 @@
     // DTM state information
     // org.xml.sax.Parser parser;
     // org.apache.xalan.xpath.dtm.HookedXMLParser ibmparser;
  -  // org.apache.xerces.utils.StringPool fStringPool;
  +  // org.apache.xerces.utils.StringPool fStringPool; 
     ChunkedIntArray nodes = new ChunkedIntArray(4);
     
     /**
  @@ -141,6 +141,7 @@
   
     // MANEFEST CONSTANTS
     // Status bits, ORed with node type (assumed to be <256, should be safe)
  +  final int TEXT_DTM_POOL =  1 << 8; // Locally cached, eg concatenation
     final int TEXT_IGNORABLE = 2 << 8;
     final int TEXT_CDATA = 4 << 8;
     
  @@ -405,6 +406,7 @@
       throws org.xml.sax.SAXException
     {
       if(DISABLE)return;
  +    appendAccumulatedText();
   
       done=true;
       
  @@ -443,6 +445,8 @@
                              int attrListIndex) 
     {
       if(DISABLE)return;    
  +    appendAccumulatedText();
  +
       // Need to retrive the attrList...
           
       String attrname, attrvalue;
  @@ -609,6 +613,7 @@
     public final void endElement(QName name)
     {
       if(DISABLE)return;    
  +    appendAccumulatedText();
       int thisElement = currentParent;
       
       // If last node appended before we pop has a next-sib reference,
  @@ -776,25 +781,135 @@
       general_characters(dataIndex);
     }
   
  +  // Vector handles objects. Too much overhead. I could use ChunkedIntArray
  +  // (and did, in an early draft), but since we aren't trying to handle SAX
  +  // right now there's no need for the additional columns. So we'll use a
  +  // simple grow-it-myself array.
  +  int charChunks[]=new int[100];
  +  int charChunkStart=0,charChunkCount=0;
  +
     /** Text-accumulator operation for the integer-index version of
      * characters(). Obviously far simpler, since we are assured that
      * (unlike the parse buffers) the XML4J symbol table will persist.
      * @param index int Index of this string in XML4J's symbol tables.
  +   *<p>
  +   * Note: Even though we are using XML4J's internal events rather than SAX,
  +   * we <strong>must</strong> be prepared to normalize successive blocks
  +   * of characters():
  +   * <ul>
  +   * <li>when text runs over the end of a parse buffer (may not arise in
  +   * this parser),</li>
  +   * <li>when text and CDATA sections are intermixed (with intervening
  +   * start/end CDATA events),</li>
  +   * <li>and when text and entity references are intermixed (with intervening
  +   * start/end Entity Reference events).</li>
  +   * </ul>
  +   * The simplest way to handle this is to record the data, but defer
  +   * creating the Text node until we get an event indicating that no further
  +   * text will arrive. This logic was present in early versions of DTM,
  +   * but was lost during an overaggressive optimization; we're restoring it now.
  +   *<p>
  +   * Note: Yes, the charChunks array grows monotonically during parsing,
  +   * and does not shrink back down when the chunks are concatenated later
  +   * in processing. Tough. I'm assuming that this is cheaper than allocating
  +   * a separate array for every multichunk string, despite the block-copying
  +   * that occurs when the array is grown.
      *<p>
  -   * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG.
  +   * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG. Since XSLT doesn't
  +   * care about that flag, this is not a problem for our target
  +   * application. It may be an issue if you try to reuse DTM elsewhere.
  +   *
  +   * @see #appendAccumulatedText()
      */
     public final void general_characters(int index) 
     {
  -    // Add this element to the document
  -    int w0 = Node.TEXT_NODE;
  -    // W1: Parent
  -    int w1 = currentParent;
  -    // W2: Start position within buffer (SAX), or text index (XML4J)
  -    int w2 = index;
  -    // W3: Length of this text (SAX), or 0 (XML4J)
  -    int w3 = gotslot[2];
  -    int ourslot = appendNode(w0, w1, w2, w3);
  -    previousSibling = ourslot;
  +    // Grow the array, if out of space. (Doubling may be excessive, but the
  +    // goal is to trade off minimum memory use versus minimum recopying.)
  +    if(charChunkCount==charChunks.length)
  +      {
  +     int[] newCharChunks=new int[2*charChunks.length];
  +     System.arraycopy(charChunks,0,newCharChunks,0,charChunks.length);
  +     charChunks=newCharChunks;
  +      }
  +    // Append to the array
  +    charChunks[charChunkCount++]=index;
  +  }
  +  
  +  /** appendAccumulatedText completes the work started by
  +   * general_characters(). It takes all the blocks of text which have
  +   * arrived, and generates a single Text node containing their
  +   * concatenated value. This routine _MUST_ be called as the first step
  +   * in processing any other event.
  +   *<p>
  +   * There are a few reasonable ways of handling this.
  +   * <ul>
  +   * <li> One is to hold onto the individual text chunks -- which are
  +   * already in a string pool inside the parser, since we're being
  +   * driven through XMLDocumentHandler -- and concatenate them on
  +   * demand when the user asks for this node's value; this minimizes
  +   * model-building time, especially if the user never asks for the
  +   * value of this node.</li>
  +   *<li>The other is to generate a concatenated string in a local
  +   * pool; this avoids re-concatenating the string if it should be
  +   * accessed more than once.</li>
  +   * <li>Or we could use the first solution, but convert it to the second
  +   * the first time the text node is accessed. This is probably the best
  +   * of both worlds... and we can get away with it because DTM  is
  +   * explicitly single-threaded after parsing, so there will be no
  +   * contention for the node during its conversion.</li>
  +   * </ul>
  +   * <p>
  +   * Early versions of DTM chose the first answer. I'm going to try the third
  +   * this time.
  +   *<p>
  +   * Length of 0 indicates the simple case, referenced directly from
  +   * the parser's pool.
  +   *<p>
  +
  +   * @see #general_characters(int) */
  +  void appendAccumulatedText()
  +  {
  +    if(charChunkCount==charChunkStart)
  +      return;                        // No new text.
  +    else if(charChunkCount==charChunkStart+1)
  +      {
  +     // Single chunk. We can use the efficient inline version of Text
  +     
  +     int w0 = Node.TEXT_NODE;
  +     // W1: Parent
  +     int w1 = currentParent;
  +     // W2: Start position within charChunks (multiple),
  +     // or text index (inline), or local text index (multiple converted)
  +     int w2 = charChunks[charChunkStart];
  +     // W3: Start of next sequence, or 0 for inline
  +     int w3 = 0;
  +     int ourslot = appendNode(w0, w1, w2, w3);
  +     previousSibling = ourslot;
  +     
  +     // This chunk has been completely processed, so reuse its chunk slot
  +     // (They're cheap, but why waste them?)
  +     --charChunkCount;
  +      }
  +    else
  +      {
  +     // Here's our problem child. We need to record that the Text node's
  +     // value is represented by a sequence of entries in charChunks.
  +     int w0 = Node.TEXT_NODE;
  +     // W1: Parent
  +     int w1 = currentParent;
  +     // W2: Start position within charChunks (multiple),
  +     // or text index (inline), or local text index (multiple converted)
  +     int w2 = charChunkStart;
  +     // W3: Start of next sequence, or 0 for inline
  +     int w3 = charChunkCount;
  +     int ourslot = appendNode(w0, w1, w2, w3);
  +     previousSibling = ourslot;
  +     
  +     // This time, we need to remember that these charChunks can _NOT_
  +     // be reused -- leave the high-water mark alone, and instead move
  +     // the baseline up.
  +     charChunkStart=charChunkCount;
  +      }
     }
   
     /**
  @@ -805,6 +920,7 @@
     public final void comment(int dataIndex) 
     {
       if(DISABLE)return;
  +    appendAccumulatedText();
       
       // Short Form, XML4J mode
       int w0, w1, w2, w3;
  @@ -832,7 +948,8 @@
     public final void processingInstruction(int target, int data) 
     {
       if(DISABLE)return;
  -    
  +    appendAccumulatedText();
  +
       // W0 Low: Node Type.
       int w0 = org.w3c.dom.Node.PROCESSING_INSTRUCTION_NODE;
       // W1: Parent
  @@ -1944,6 +2061,9 @@
         return intToString(w0>>16);
     }
   
  +  // Cache conversions of multi-charChunk text nodes
  +  Vector localStringPool=new Vector();
  +
     /**
      * DTM read API: Given a node index, return its node value. This is mostly
      * as defined by the DOM, but may ignore some conveniences.
  @@ -1962,7 +2082,48 @@
       {
       case Node.TEXT_NODE:
       case Node.CDATA_SECTION_NODE: // We handle as flagged Text...
  -      value=intToString(gotslot[2]);
  +      if((gotslot[0] & TEXT_DTM_POOL) != 0)
  +     {
  +       // Value of this node lives in DTM's pool, not in the parser's
  +       value=(String)(localStringPool.elementAt(gotslot[2]));
  +     }
  +      else if(gotslot[3]>0)          // (actually >1, but 1 never occurs)
  +     {
  +       // This was a multi-charChunk node. Its value is the concatenation
  +       // of those chunks. For efficient future access, we will now convert
  +       // this into a TEXT_DTM_POOL node
  +
  +       // First, concatenate the chunks to obtain the value
  +       int chunk=gotslot[2],stop=gotslot[3];
  +       StringBuffer sb=new StringBuffer(intToString(charChunks[chunk++]));
  +       while(chunk<stop)
  +         sb.append(intToString(charChunks[chunk++]));
  +       value=sb.toString();
  +
  +       // Add the normalized string to our local pool.
  +       // ****** Is it worth suppressing duplicates? 
  +       // int localStringNumber=localStringPool.indexOf(value);
  +       // if(-1 == localStringNumber) // Not found
  +       // {
  +         localStringPool.addElement(value);
  +         int localStringNumber=localStringPool.size();
  +       // }
  +       
  +       // Now back-patch the node. We can get away with not protecting
  +       // this since we assert that DTM's read access is single-threaded,
  +       // and hence nobody else is accessing this node right now.
  +       // (If you don't believe that, synchronize this and the preceding
  +       // case on localStringPool.)
  +       gotslot[0] |= TEXT_DTM_POOL;
  +       gotslot[2] = localStringNumber-1;
  +       // ***** Would be nice right here to have an array-to-array write...
  +       nodes.writeSlot(position,gotslot[0],gotslot[1],gotslot[2],gotslot[3]);
  +     }
  +      else
  +     {
  +       // Single charChunk. Read the value direct from the parser's pool.
  +       value=intToString(gotslot[2]);
  +     }
         break;
       case Node.PROCESSING_INSTRUCTION_NODE:
       case Node.COMMENT_NODE:
  @@ -2184,4 +2345,4 @@
   
     }
   
  -}
  \ No newline at end of file
  +}

cvs commit: xml-xalan/src/org/apache/xalan/xpath/dtm DTM.java

Reply via email to