dom2dtm DOM2DTM.java

jkesselm Mon, 11 Jun 2001 11:19:29 -0700
jkesselm    01/06/11 11:41:04

  Modified:    java/src/org/apache/xml/dtm/ref/dom2dtm Tag: DTM_EXP
                        DOM2DTM.java
  Log:
  Major changes for both performance and robustness --
  new Namespace handling, new DOM/DTM treewalker
  
  Revision  Changes    Path
  No                   revision
  
  
  No                   revision
  
  
  1.1.2.9   +356 -253  xml-xalan/java/src/org/apache/xml/dtm/ref/dom2dtm/Attic/DOM2DTM.java
  
  Index: DOM2DTM.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/dtm/ref/dom2dtm/Attic/DOM2DTM.java,v
  retrieving revision 1.1.2.8
  retrieving revision 1.1.2.9
  diff -u -r1.1.2.8 -r1.1.2.9
  --- DOM2DTM.java      2001/06/04 13:49:36     1.1.2.8
  +++ DOM2DTM.java      2001/06/11 18:41:04     1.1.2.9
  @@ -94,87 +94,27 @@
    * */
   public class DOM2DTM extends DTMDefaultBaseIterators
   {
  +  static final boolean JJK_DEBUG=false;
  +  
  +  /** The current position in the DOM tree. Last node examined for
  +   * possible copying to DTM. */
  +  transient private Node m_pos;
  +  /** The current position in the DTM tree. Who children get appended to. */
  +  private int m_last_parent=0;
  +  /** The current position in the DTM tree. Who children reference as their 
  +   * previous sib. */
  +  private int m_last_kid=NULL;
  +
     /** The top of the subtree.
      * %REVIEW%: 'may not be the same as m_context if "//foo" pattern.'
      * */
     transient private Node m_root;
   
  -  /** The current position in the DOM tree. This is used to keep track
  -   * of our progress as we incrementally build the DTM from the DOM. */
  -  transient private Node m_pos;
  -
     /** true if ALL the nodes in the m_root subtree have been processed;
      * false if our incremental build has not yet finished scanning the
      * DOM tree.  */
     transient private boolean m_nodesAreProcessed;
   
  -  /** We use this stack to keep track of some of the context
  -   * information as we build the DTM tables. Each time nextNode()
  -   * enters a new level of DTM hierarchy -- basically, each time we
  -   * enter a new element -- we push a two-integer stack frame:
  -   *
  -   * <ul>
  -
  -   * <li>The DTM nodeHandle index of this element, which is now the
  -   * parent to which children are being appended</li> and
  -   * <li>DTM.NULL, meaning no known previous sibling
  -   * (next node will be first-child)</li>
  -   * </ul>
  -   *
  -   * We can retrieve these values via
  -   *
  -   * <ul>
  -   * <li>m_levelInfo.peek(LEVELINFO_PARENT)</li> and
  -   * <li>m_levelInfo.peek(LEVELINFO_PREVSIB)</li>
  -   * </ul>
  -   *
  -   * respectively. As children are appended, the previous-sibling
  -   * field is maintained to keep track of them, either by popping the
  -   * old value and pushing a new one on, or by doing some magic with
  -   * m_levelInfo.setElementAt().
  -   *
  -   * Finally, when we're done constructing this element's children, we
  -   * pop both integers off the stack, returning us to our previous
  -   * context.
  -   * */
  -  transient private IntStack m_levelInfo = new IntStack();
  -
  -  /** Field LEVELINFO_PARENT can be used as an offset into m_levelInfo
  -   * to retrieve the DTM nodeHandle for the current Parent Node.
  -   * */
  -  static final int LEVELINFO_PARENT = 1;
  -
  -  /** Field LEVELINFO_PREVSIB can be used as an offset into m_levelInfo
  -   * to retrieve the DTM nodeHandle for the current Previous Sibling Node.
  -   * */
  -  static final int LEVELINFO_PREVSIB = 0;
  -
  -  /** Field LEVELINFO_NPERLEVEL is the number of integers used in each
  -   * m_levelInfo stack frame -- currently 2. This is used both to do a
  -   * "quickPop" of an entire stack frame, and to calculate the current DTM
  -   * "level" (as m_levelInfo.size()/LEVELINDO_NPERPLEVEL).
  -   * */
  -  static final int LEVELINFO_NPERLEVEL = 2;
  -  
  -  /** m_attrs points to the attributes belonging to the last DOM
  -   * Element entered by nextNode(). It's used together with m_attrsPos
  -   * to incrementally build the DTM nodes for those attributes.
  -   * */
  -  transient private NamedNodeMap m_attrs;
  -
  -  /** m_attrsPos indicates how far nextNode() has progressed through
  -   * the set of Attributes contained in m_attrs. It's used together
  -   * with m_attrsPos to incrementally build the DTM nodes for those
  -   * attributes.
  -   * */
  -  transient private int m_attrsPos;
  -
  -  /** Saved element for attribute iteration. */
  -  private Node m_elementForAttrs;
  -  
  -  /** Saved element index for attribute iteration */
  -  private int m_elementForAttrsIndex;
  -
     /** The node objects.  The instance part of the handle indexes
      * directly into this vector.  Each DTM node may actually be
      * composed of several DOM nodes (for example, if logically-adjacent
  @@ -203,10 +143,16 @@
       super(mgr, domSource, dtmIdentity, whiteSpaceFilter, 
             xstringfactory, doIndexing);
   
  -    m_root = domSource.getNode();
  -    m_pos = null;
  +        // Initialize DOM navigation
  +    m_pos=m_root = domSource.getNode();
  +        // Initialize DTM navigation
  +    m_last_parent=m_last_kid=NULL;
  +    m_last_kid=addNode(m_root, 0, m_last_parent,m_last_kid, NULL);
  +        m_level[0]=0;           // Unnecessary, included for paranoia
  +    m_nextsib[0]=NOTPROCESSED;  // Unnecessary, included for paranoia
  +    m_firstch[0]=NOTPROCESSED;  // Unnecessary, included for paranoia
  +        // Initialize DTM-completed status 
       m_nodesAreProcessed = false;
  -    addNode(m_root, 0, DTM.NULL, DTM.NULL);
     }
   
     /**
  @@ -216,18 +162,24 @@
      * @param level The current level in the tree.
      * @param parentIndex The current parent index.
      * @param previousSibling The previous sibling index.
  +   * @param forceNodeType If not DTM.NULL, overrides the DOM node type.
  +   * Used to force nodes to Text rather than CDATASection when their
  +   * coalesced value includes ordinary Text nodes (current DTM behavior).
      *
      * @return The index identity of the node that was added.
      */
     protected int addNode(Node node, int level, int parentIndex,
  -                        int previousSibling)
  +                        int previousSibling, int forceNodeType)
     {
  -
       int nodeIndex = m_nodes.size();
       m_size++;
       ensureSize(nodeIndex);
       
  -    int type = node.getNodeType();
  +    int type;
  +    if(NULL==forceNodeType)
  +     type = node.getNodeType();
  +    else
  +     type=forceNodeType;
       
       m_nodes.addElement(node);
       
  @@ -260,7 +212,7 @@
         if (name.startsWith("xmlns:") || name.equals("xmlns"))
         {
           type = DTM.NAMESPACE_NODE;
  -        m_haveSeenNamespace = true;
  +        declareNamespaceInContext(parentIndex,nodeIndex);
         }
       }
       
  @@ -288,7 +240,6 @@
                            node.getLocalName();
       ExpandedNameTable exnt = m_expandedNameTable;
   
  -
       // %TBD% Nodes created with the old non-namespace-aware DOM
       // calls createElement() and createAttribute() will never have a
       // localname. That will cause their expandedNameID to be just the
  @@ -324,9 +275,8 @@
     {
       return m_nodes.size();
     }
  -
  -
  -  /**
  +  
  + /**
      * This method iterates to the next node that will be added to the table.
      * Each call to this method adds a new node to the table, unless the end
      * is reached, in which case it returns null.
  @@ -338,190 +288,243 @@
     {
       // Non-recursive one-fetch-at-a-time depth-first traversal with 
       // attribute/namespace nodes and white-space stripping.
  -    // Yippee!  Not for the faint of heart.  I would be glad for 
  -    // constructive suggestions on how to make this cleaner.
  -
  +    // Navigating the DOM is simple, navigating the DTM is simple;
  +    // keeping track of both at once is a trifle baroque but at least
  +    // we've avoided most of the special cases.
       if (m_nodesAreProcessed)
  -    {
         return false;
  -    }
  -    
  -    if(m_nodes.size() == 47)
  -    {
  -      int x = 5;
  -      x++;
  -    }
  -
  -    Node top = m_root;  // tells us when to stop.
  -    Node pos = (null == m_pos) ? m_root : m_pos;
  +        
  +    // %REVIEW% Is this local copy Really Useful from a performance
  +    // point of view?  Or is this a false microoptimization?
  +    Node pos=m_pos; 
  +    Node next=null;
  +    int nexttype=NULL;
   
  -    Node nextNode;
  -    int type = pos.getNodeType();
  +    // Navigate DOM tree
  +    do
  +      {
  +        // Look down to first child.
  +        if (pos.hasChildNodes()) 
  +          {
  +            next = pos.getFirstChild();
   
  -    int currentIndexHandle = m_nodes.size()-1;
  -    int posInfo = currentIndexHandle;
  -    
  -    boolean shouldPushLevel = true;
  -    if (Node.ELEMENT_NODE == type)
  -    {
  -      m_attrs = pos.getAttributes();
  -      m_attrsPos = 0;
  +            // %REVIEW% There's probably a more elegant way to skip
  +            // the doctype. (Just let it go and Suppress it?)
  +            if(next!=null && DOCUMENT_TYPE_NODE==next.getNodeType())
  +              next=next.getNextSibling();
  +
  +            // Push DTM context -- except for children of Entity References, 
  +            // which have no DTM equivalent and cause no DTM navigation.
  +            if(ENTITY_REFERENCE_NODE!=pos.getNodeType())
  +              {
  +                m_last_parent=m_last_kid;
  +                m_last_kid=NULL;
  +                // Whitespace-handler context stacking
  +                if(null != m_wsfilter)
  +                {
  +                  short wsv =
  +                    m_wsfilter.getShouldStripSpace(m_last_parent|m_dtmIdent,this);
  +                  boolean shouldStrip = (DTMWSFilter.INHERIT == wsv) 
  +                    ? getShouldStripWhitespace() 
  +                    : (DTMWSFilter.STRIP == wsv);
  +                  pushShouldStripWhitespace(shouldStrip);
  +                } // if(m_wsfilter)
  +              }
  +          }
   
  -      if (null != m_attrs)
  -      {
  -        if (m_attrsPos < m_attrs.getLength())
  -        {
  -          m_elementForAttrs = pos;
  -          m_elementForAttrsIndex = currentIndexHandle;
  -          nextNode = m_attrs.item(m_attrsPos);
  -        }
  -        else
  -          nextNode = pos.getFirstChild();
  +        // If that fails, look up and right (but not past root!)
  +        else 
  +          {
  +            if(m_last_kid!=NULL)
  +              {
  +                // Last node posted at this level had no more children
  +                // If it has _no_ children, we need to record that.
  +                if(m_firstch[m_last_kid]==NOTPROCESSED)
  +                  m_firstch[m_last_kid]=NULL;
  +              }
  +                        
  +            while(m_last_parent != NULL)
  +              {
  +                // %REVIEW% There's probably a more elegant way to
  +                // skip the doctype. (Just let it go and Suppress it?)
  +                next = pos.getNextSibling();
  +                if(next!=null && DOCUMENT_TYPE_NODE==next.getNodeType())
  +                  next=next.getNextSibling();
  +
  +                if(next!=null)
  +                  break; // Found it!
  +                
  +                // No next-sibling found. Pop the DOM.
  +                pos=pos.getParentNode();
  +                if(pos==null)
  +                  {
  +                    // %TBD% Should never arise, but I want to be sure of that...
  +                    if(JJK_DEBUG)
  +                      {
  +                        System.out.println("***** DOM2DTM Pop Control Flow problem");
  +                        for(;;); // Freeze right here!
  +                      }
  +                  }
  +                
  +                // The only parents in the DTM are Elements.  However,
  +                // the DOM could contain EntityReferences.  If we
  +                // encounter one, pop it _without_ popping DTM.
  +                if(pos!=null && ENTITY_REFERENCE_NODE == pos.getNodeType())
  +                  {
  +                    // Nothing needs doing
  +                    if(JJK_DEBUG)
  +                      System.out.println("***** DOM2DTM popping EntRef");
  +                  }
  +                else
  +                  {
  +                    popShouldStripWhitespace();
  +                    // Fix and pop DTM
  +                    if(m_last_kid==NULL)
  +                      m_firstch[m_last_parent]=NULL; // Popping from an element
  +                    else
  +                      m_nextsib[m_last_kid]=NULL; // Popping from anything else
  +                    m_last_parent=m_parent[m_last_kid=m_last_parent];
  +                  }
  +              }
  +            if(m_last_parent==NULL)
  +              next=null;
  +          }
  +                
  +        if(next!=null)
  +          nexttype=next.getNodeType();
  +                
  +        // If it's an entity ref, advance past it.
  +        //
  +        // %REVIEW% Should we let this out the door and just suppress it?
  +        // More work, but simpler code, more likely to be correct, and
  +        // it doesn't happen very often. We'd get rid of the loop too.
  +        if (ENTITY_REFERENCE_NODE == nexttype)
  +          pos=next;
         }
  -      else
  +    while (ENTITY_REFERENCE_NODE == nexttype); 
  +        
  +    // Did we run out of the tree?
  +    if(next==null)
         {
  -        nextNode = pos.getFirstChild();
  +        m_nextsib[0] = NULL;
  +        m_nodesAreProcessed = true;
  +        m_pos=null;
  +                
  +        if(JJK_DEBUG)
  +          {
  +            System.out.println("***** DOM2DTM Crosscheck:");
  +            for(int i=0;i<m_nodes.size();++i)
  +              System.out.println(i+":\t"+m_firstch[i]+"\t"+m_nextsib[i]);
  +          }
  +                
  +        return false;
         }
  -    }
  -    else if (Node.ATTRIBUTE_NODE == type)
  -    {
  -      m_firstch[posInfo] = DTM.NULL;
  -      m_attrsPos++;
   
  -      if (m_attrsPos < m_attrs.getLength())
  -      {
  -        nextNode = m_attrs.item(m_attrsPos);
  -        shouldPushLevel = false;
  -      }
  -      else
  -      {
  -        m_nextsib[posInfo] = NULL;
  -        pos = m_elementForAttrs;
  -        currentIndexHandle = m_elementForAttrsIndex;
  -        posInfo = currentIndexHandle;
  -        nextNode = pos.getFirstChild();
  -        m_levelInfo.quickPop(LEVELINFO_NPERLEVEL);
  -      }
  -    }
  -    else
  -      nextNode = pos.getFirstChild();  
  -     
  -    // %TBD% Text node coalition.
  -    if((null != m_wsfilter) && (null != nextNode) && getShouldStripWhitespace())
  -    {
  -      int t = nextNode.getNodeType();
  -      
  -      if((Node.CDATA_SECTION_NODE == t) || (Node.TEXT_NODE == t))
  -      {
  -        String data = nextNode.getNodeValue();
  -        if(XMLCharacterRecognizer.isWhiteSpace(data))
  -        {
  -          nextNode = nextNode.getNextSibling();
  -        }
  -      }
  -    }
  -    if (shouldPushLevel && (null != nextNode))
  -    {
  -      m_levelInfo.push(currentIndexHandle); // parent
  -      m_levelInfo.push(DTM.NULL); // previous sibling
  -    }
  +    // Text needs some special handling:
  +    //
  +    // DTM may skip whitespace. This is handled by the suppressNode flag, which
  +    // when true will keep the DTM node from being created.
  +    //
  +    // DTM only directly records the first DOM node of any logically-contiguous
  +    // sequence. The lastTextNode value will be set to the last node in the 
  +    // contiguous sequence, and -- AFTER the DTM addNode -- can be used to 
  +    // advance next over this whole block. Should be simpler than special-casing
  +    // the above loop for "Was the logically-preceding sibling a text node".
  +    // 
  +    // Finally, a DTM node should be considered a CDATASection only if all the
  +    // contiguous text it covers is CDATASections. The first Text should
  +    // force DTM to Text.
  +        
  +    boolean suppressNode=false;
  +    Node lastTextNode=null;
   
  -    while (null == nextNode)
  -    {
  -      if(m_firstch[posInfo] == NOTPROCESSED)
  -      {
  -        m_firstch[posInfo] = NULL;
  -      }
  -      
  -      if (top.equals(pos))
  -      {
  -        m_nextsib[posInfo] = NULL;
  -        break;
  -      }
  -      
  -      nextNode = pos.getNextSibling();
  -      if(null != nextNode && Node.DOCUMENT_TYPE_NODE == nextNode.getNodeType())
  -      {
  -        // Xerces
  -        nextNode = nextNode.getNextSibling(); // just skip it.
  -      }
  -      
  -      if(Node.ELEMENT_NODE == pos.getNodeType())
  +    nexttype=next.getNodeType();
  +        
  +    // nexttype=pos.getNodeType();
  +    if(TEXT_NODE == nexttype || CDATA_SECTION_NODE == nexttype)
         {
  -        // I think this only has to be popped here, and not at getParent,
  -        // oddly enough at first glance.
  -        popShouldStripWhitespace();
  -      }
  +        // If filtering, initially assume we're going to suppress the node
  +        suppressNode=((null != m_wsfilter) && getShouldStripWhitespace());
   
  -      // %TBD% Text node coalition.
  -      if((null != nextNode) && (null != m_wsfilter) && getShouldStripWhitespace())
  -      {
  -        int t = nextNode.getNodeType();
  -        
  -        if((Node.CDATA_SECTION_NODE == t) || (Node.TEXT_NODE == t))
  -        {
  -          String data = nextNode.getNodeValue();
  -          if(XMLCharacterRecognizer.isWhiteSpace(data))
  +        // Scan logically contiguous text (siblings, plus "flattening"
  +        // of entity reference boundaries).
  +        Node n=next;
  +        while(n!=null)
             {
  -            nextNode = nextNode.getNextSibling();
  +            lastTextNode=n;
  +            // Any Text node means DTM considers it all Text
  +            if(TEXT_NODE == n.getNodeType())
  +              nexttype=TEXT_NODE;
  +            // Any non-whitespace in this sequence blocks whitespace
  +            // suppression
  +            suppressNode &=
  +              XMLCharacterRecognizer.isWhiteSpace(n.getNodeValue());
  +                        
  +            n=logicalNextDOMTextNode(n);
             }
  -        }
         }
  -                  
  -      if (null == nextNode)
  +        
  +    // Special handling for PIs: Some DOMs represent the XML
  +    // Declaration as a PI. This is officially incorrect, per the DOM
  +    // spec, but is considered a "wrong but tolerable" temporary
  +    // workaround pending proper handling of these fields in DOM Level
  +    // 3. We want to recognize and reject that case.
  +    else if(PROCESSING_INSTRUCTION_NODE==nexttype)
         {
  -        m_nextsib[posInfo] = NULL;
  -        currentIndexHandle = m_parent[posInfo];
  -        posInfo = currentIndexHandle;
  -        m_levelInfo.quickPop(LEVELINFO_NPERLEVEL);
  -        pos = pos.getParentNode();
  -
  -        if ((null == pos) || (top.equals(pos)))
  -        {
  -          m_nextsib[posInfo] = NULL;
  -          nextNode = null;
  -          // break;
  -          m_nodesAreProcessed = true;
  -          return false;
  -        }
  +        suppressNode = (pos.getNodeName().toLowerCase().equals("xml"));
         }
           
  -      
  -    } // end while (null == nextNode) [for next sibling, parent]
  -
  -    pos = nextNode;
  -
  -    if (null != pos)
  -    {
  -      int level = m_levelInfo.size() / LEVELINFO_NPERLEVEL;
  -      int newIndexHandle = 
  -            addNode(pos, level, m_levelInfo.peek(LEVELINFO_PARENT),
  -                m_levelInfo.peek(LEVELINFO_PREVSIB));
  -
  -      m_pos = pos;
  -
  -      int sz = m_levelInfo.size();
  -
  -      m_levelInfo.setElementAt(newIndexHandle,
  -                               sz - (1 + LEVELINFO_PREVSIB));
  -                               
  -      if((null != m_wsfilter) && (Node.ELEMENT_NODE == pos.getNodeType()))
  +        
  +    if(!suppressNode)
         {
  -        short wsv = m_wsfilter.getShouldStripSpace(newIndexHandle|m_dtmIdent, 
  -                                                                      this);
  -        boolean shouldStrip = (DTMWSFilter.INHERIT == wsv) ? 
  -                  getShouldStripWhitespace() : (DTMWSFilter.STRIP == wsv);
  -        pushShouldStripWhitespace(shouldStrip);
  +        // Inserting next. NOTE that we force the node type; for
  +        // coalesced Text, this records CDATASections adjacent to
  +        // ordinary Text as Text.
  +        int level=m_level[m_last_parent]+1;
  +        int nextindex=addNode(next,level,m_last_parent,m_last_kid,
  +                              nexttype);
  +        m_last_kid=nextindex;
  +
  +        if(ELEMENT_NODE == nexttype)
  +          {
  +            // Process attributes _now_, rather than waiting.
  +            // Simpler control flow, makes NS cache available immediately.
  +            NamedNodeMap attrs=next.getAttributes();
  +            int attrsize=(attrs==null) ? 0 : attrs.getLength();
  +            if(attrsize>0)
  +              {
  +                int attrlevel=level+1;
  +                int attrIndex=NULL; // start with no previous sib
  +                for(int i=0;i<attrsize;++i)
  +                  {
  +                    // No need to force nodetype in this case;
  +                    // addNode() will take care of switching it from
  +                    // Attr to Namespace if necessary.
  +                    attrIndex=addNode(attrs.item(i),attrlevel,
  +                                      nextindex,attrIndex,NULL);
  +                    m_firstch[attrIndex]=DTM.NULL;
  +                  }
  +                // Terminate list of attrs, and make sure they aren't
  +                // considered children of the element
  +                m_nextsib[attrIndex]=DTM.NULL;
  +              } // if attrs exist
  +          } //if(ELEMENT_NODE)
  +      } // (if !suppressNode)
  +
  +    // Text postprocessing: Act on values stored above
  +    if(TEXT_NODE == nexttype || CDATA_SECTION_NODE == nexttype)
  +      {
  +        // %TBD% If nexttype was forced to TEXT, patch the DTM node
  +                
  +        next=lastTextNode;      // Advance the DOM cursor over contiguous text
         }
  -      return true;
  -    }
  +        
  +    // Remember where we left off.
  +    m_pos=next;
  +    return true;
  +  }  
   
   
  -    m_nodesAreProcessed = true;
  -    m_pos = null;
  -    return false;
  -  }
  -
     /**
      * Return an DOM node for the given node.
      *
  @@ -745,9 +748,28 @@
         }
     
         return m_xstrf.newstr( s );
  -
  +    }
  +    else if(TEXT_NODE == type || CDATA_SECTION_NODE == type)
  +    {
  +      // If this is a DTM text node, it may be made of multiple DOM text
  +      // nodes -- including navigating into Entity References. DOM2DTM
  +      // records the first node in the sequence and requires that we
  +      // pick up the others when we retrieve the DTM node's value.
  +      //
  +      // %REVIEW% DOM Level 3 is expected to add a "whole text"
  +      // retrieval method which performs this function for us.
  +      FastStringBuffer buf = StringBufferPool.get();
  +      while(node!=null)
  +      {
  +        buf.append(node.getNodeValue());
  +        node=logicalNextDOMTextNode(node);
  +      }
  +      String s=(buf.length() > 0) ? buf.toString() : "";
  +      StringBufferPool.free(buf);
  +      return m_xstrf.newstr( s );
       }
  -    return m_xstrf.newstr( node.getNodeValue() );
  +    else
  +      return m_xstrf.newstr( node.getNodeValue() );
     }
     
     /**
  @@ -761,6 +783,11 @@
      * whitespace-in-element-context from genuine #PCDATA. Note that we
      * should probably also consider xml:space if/when we address this.
      * DOM Level 3 may solve the problem for us.
  +   * <p>
  +   * %REVIEW% Actually, since this method operates on the DOM side of the
  +   * fence rather than the DTM side, it SHOULDN'T do
  +   * any special handling. The DOM does what the DOM does; if you want
  +   * DTM-level abstractions, use DTM-level methods.
      *
      * @param node Node whose subtree is to be walked, gathering the
      * contents of all Text or CDATASection nodes.
  @@ -785,9 +812,7 @@
       break;
       case Node.TEXT_NODE :
       case Node.CDATA_SECTION_NODE :
  -      buf.append(node.getNodeValue());
  -      break;
  -    case Node.ATTRIBUTE_NODE :
  +    case Node.ATTRIBUTE_NODE :       // Never a child but might be our starting node
         buf.append(node.getNodeValue());
         break;
       case Node.PROCESSING_INSTRUCTION_NODE :
  @@ -999,6 +1024,47 @@
   
       return nsuri;
     }
  +  
  +  /** Utility function: Given a DOM Text node, determine whether it is
  +   * logically followed by another Text or CDATASection node. This may
  +   * involve traversing into Entity References.
  +   * 
  +   * %REVIEW% DOM Level 3 is expected to add functionality which may 
  +   * allow us to retire this.
  +   */
  +  private Node logicalNextDOMTextNode(Node n)
  +  {
  +        Node p=n.getNextSibling();
  +        if(p==null)
  +        {
  +                // Walk out of any EntityReferenceNodes that ended with text
  +                for(n=n.getParentNode();
  +                        n!=null && ENTITY_REFERENCE_NODE == n.getNodeType();
  +                        n=n.getParentNode())
  +                {
  +                        p=n.getNextSibling();
  +                        if(p!=null)
  +                                break;
  +                }
  +        }
  +        n=p;
  +        while(n!=null && ENTITY_REFERENCE_NODE == n.getNodeType())
  +        {
  +                // Walk into any EntityReferenceNodes that start with text
  +                if(n.hasChildNodes())
  +                        n=n.getFirstChild();
  +                else
  +                        n=n.getNextSibling();
  +        }
  +        if(n!=null)
  +        {
  +                // Found a logical next sibling. Is it text?
  +                int ntype=n.getNodeType();
  +                if(TEXT_NODE != ntype && CDATA_SECTION_NODE != ntype)
  +                        n=null;
  +        }
  +        return n;
  +  }
   
     /**
      * Given a node handle, return its node value. This is mostly
  @@ -1011,13 +1077,34 @@
      */
     public String getNodeValue(int nodeHandle)
     {
  -
  +    int type=_type(nodeHandle);
  +    if(TEXT_NODE!=type && CDATA_SECTION_NODE!=type)
  +      return getNode(nodeHandle).getNodeValue();
  +    
  +    // If this is a DTM text node, it may be made of multiple DOM text
  +    // nodes -- including navigating into Entity References. DOM2DTM
  +    // records the first node in the sequence and requires that we
  +    // pick up the others when we retrieve the DTM node's value.
  +    //
  +    // %REVIEW% DOM Level 3 is expected to add a "whole text"
  +    // retrieval method which performs this function for us.
       Node node = getNode(nodeHandle);
  -
  -    return node.getNodeValue();
  +    Node n=logicalNextDOMTextNode(node);
  +    if(n==null)
  +      return node.getNodeValue();
  +    
  +    FastStringBuffer buf = StringBufferPool.get();
  +     buf.append(node.getNodeValue());
  +    while(n!=null)
  +    {
  +      buf.append(n.getNodeValue());
  +      n=logicalNextDOMTextNode(n);
  +    }
  +    String s = (buf.length() > 0) ? buf.toString() : "";
  +    StringBufferPool.free(buf);
  +    return s;
     }
   
  -
     /**
      *   A document type declaration information item has the following properties:
      *
  @@ -1379,6 +1466,15 @@
         int type = getNodeType(nodeHandle);
         Node node = getNode(nodeHandle);
         dispatchNodeData(node, ch, 0);
  +       // Text coalition -- a DTM text node may represent multiple
  +       // DOM nodes.
  +       if(TEXT_NODE == type || CDATA_SECTION_NODE == type)
  +       {
  +               while( null != (node=logicalNextDOMTextNode(node)) )
  +               {
  +                   dispatchNodeData(node, ch, 0);
  +               }
  +       }
       }
     }
     
  @@ -1393,6 +1489,11 @@
      * whitespace-in-element-context from genuine #PCDATA. Note that we
      * should probably also consider xml:space if/when we address this.
      * DOM Level 3 may solve the problem for us.
  +   * <p>
  +   * %REVIEW% Note that as a DOM-level operation, it can be argued that this
  +   * routine _shouldn't_ perform any processing beyond what the DOM already
  +   * does, and that whitespace stripping and so on belong at the DTM level.
  +   * If you want a stripped DOM view, wrap DTM2DOM around DOM2DTM.
      *
      * @param node Node whose subtree is to be walked, gathering the
      * contents of all Text or CDATASection nodes.
  @@ -1422,6 +1523,8 @@
       case Node.COMMENT_NODE :
         if(0 != depth)
           break;
  +     // NOTE: Because this operation works in the DOM space, it does _not_ attempt
  +     // to perform Text Coalition. That should only be done in DTM space. 
       case Node.TEXT_NODE :
       case Node.CDATA_SECTION_NODE :
       case Node.ATTRIBUTE_NODE :
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
cvs commit: xml-xalan/java/src/org/apache/xml/dtm/ref/dom2dtm DOM2DTM.java

Reply via email to