internal WFXMLScanner.hpp WFXMLScanner.cpp

knoaman Tue, 03 Dec 2002 18:02:29 -0800

knoaman     2002/12/03 18:01:30

  Added:       c/src/xercesc/internal WFXMLScanner.hpp WFXMLScanner.cpp
  Log:
  Initial checkin.
  
  Revision  Changes    Path
  1.1                  xml-xerces/c/src/xercesc/internal/WFXMLScanner.hpp
  
  Index: WFXMLScanner.hpp
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written
   *    permission, please contact apache\@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation, and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.ibm.com .  For more information
   * on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /*
   * $Log: WFXMLScanner.hpp,v $
   * Revision 1.1  2002/12/04 02:01:29  knoaman
   * Initial checkin.
   *
   */
  
  
  #if !defined(WFXMLSCANNER_HPP)
  #define WFXMLSCANNER_HPP
  
  #include <xercesc/internal/XMLScanner.hpp>
  #include <xercesc/internal/ElemStack.hpp>
  #include <xercesc/util/ValueHashTableOf.hpp>
  #include <xercesc/util/ValueVectorOf.hpp>
  #include <xercesc/validators/DTD/DTDElementDecl.hpp>
  
  XERCES_CPP_NAMESPACE_BEGIN
  
  
  //  This is a non-validating scanner. No DOCTYPE or XML Schema processing
  //  will take place.
  class XMLPARSER_EXPORT WFXMLScanner : public XMLScanner
  {
  public :
      // -----------------------------------------------------------------------
      //  Constructors and Destructor
      //
      //  Both constructors forward their arguments to the XMLScanner base
      //  class and then call commonInit() to allocate the helper containers.
      //  The destructor releases them again through cleanUp().
      // -----------------------------------------------------------------------
      WFXMLScanner
      (
          XMLValidator* const valToAdopt
      );
      WFXMLScanner
      (
          XMLDocumentHandler* const  docHandler
          , DocTypeHandler* const    docTypeHandler
          , XMLEntityHandler* const  entityHandler
          , XMLErrorReporter* const  errReporter
          , XMLValidator* const      valToAdopt
      );
      virtual ~WFXMLScanner();

      // -----------------------------------------------------------------------
      //  XMLScanner public virtual methods
      //
      //  getName
      //      Returns XMLUni::fgWFScanner.
      //
      //  getEntityDeclPool
      //      Both overloads return a null pointer in this scanner.
      //
      //  scanDocument
      //      Scans the complete document from the passed input source in a
      //      single call.
      //
      //  scanNext
      //      Performs one step of a progressive scan. Returns false once the
      //      end of input is reached or an error ends the scan.
      // -----------------------------------------------------------------------
      virtual const XMLCh* getName() const;
      virtual NameIdPool<DTDEntityDecl>* getEntityDeclPool();
      virtual const NameIdPool<DTDEntityDecl>* getEntityDeclPool() const;
      virtual unsigned int resolveQName
      (
          const   XMLCh* const        qName
          ,       XMLBuffer&          prefixBufToFill
          , const short               mode
          ,       int&                prefixColonPos
      );
      virtual void scanDocument
      (
          const   InputSource&    src
      );
      virtual bool scanNext(XMLPScanToken& toFill);
      virtual Grammar* loadGrammar
      (
          const   InputSource&    src
          , const short           grammarType
          , const bool            toCache = false
      );

  private :
      // -----------------------------------------------------------------------
      //  Unimplemented constructors and operators
      //
      //  Declared private so that a scanner cannot be default constructed,
      //  copied or assigned by client code.
      // -----------------------------------------------------------------------
      WFXMLScanner();
      WFXMLScanner(const WFXMLScanner&);
      void operator=(const WFXMLScanner&);

      // -----------------------------------------------------------------------
      //  XMLScanner virtual methods
      // -----------------------------------------------------------------------
      virtual void scanCDSection();
      virtual void scanCharData(XMLBuffer& toToUse);
      virtual EntityExpRes scanEntityRef
      (
          const   bool    inAttVal
          ,       XMLCh&  firstCh
          ,       XMLCh&  secondCh
          ,       bool&   escaped
      );
      virtual void scanDocTypeDecl();
      virtual void scanReset(const InputSource& src);
      virtual void sendCharData(XMLBuffer& toSend);

      // -----------------------------------------------------------------------
      //  Private helper methods
      //
      //  commonInit
      //      Allocates the three pointer members and fills the entity table
      //      with the default entries. Called by every constructor.
      //
      //  cleanUp
      //      Deletes the three pointer members.
      //
      //  resolvePrefix
      //      Maps a namespace prefix to a URI id, emitting an UnknownPrefix
      //      error when the element stack reports the prefix as unknown.
      // -----------------------------------------------------------------------
      void commonInit();
      void cleanUp();
      unsigned int resolvePrefix
      (
          const   XMLCh* const        prefix
          , const WFElemStack::MapModes mode
      );
      
      // -----------------------------------------------------------------------
      //  Private scanning methods
      // -----------------------------------------------------------------------
      bool scanAttValue
      (
          const   XMLCh* const    attrName
          ,       XMLBuffer&      toFill
      );
      bool scanContent(const bool extEntity);
      void scanEndTag(bool& gotData);
      bool scanStartTag(bool& gotData);
      bool scanStartTagNS(bool& gotData);

      // -----------------------------------------------------------------------
      //  Data members
      //
      //  fElemStack
      //      This is the element stack that is used to track the elements that
      //      are currently being worked on.
      //
      //  fEntityTable
      //      This is the table that contains the default entity entries (amp,
      //      lt, gt, quot and apos), each mapped to its replacement character.
      //
      //  fDTDElemDecl
      //      The XMLElementDecl object to be passed to the document handler.
      //
      //  fAttrNameHashList
      //      This contains the hash value for attribute names. It's used when
      //      checking for duplicate attributes.
      //
      //  fAttrNSList
      //      This contains XMLAttr objects that we need to map their prefixes
      //      to URIs when namespace is enabled.
      //
      //  The three pointer members are allocated in commonInit() and deleted
      //  in cleanUp().
      // -----------------------------------------------------------------------
      ValueHashTableOf<XMLCh>*     fEntityTable;
      ValueVectorOf<unsigned int>* fAttrNameHashList;
      ValueVectorOf<XMLAttr*>*     fAttrNSList;
      WFElemStack                  fElemStack;
      DTDElementDecl               fDTDElemDecl;
  };
  
  //  Returns the XMLUni::fgWFScanner string as the name of this scanner.
  inline const XMLCh* WFXMLScanner::getName() const
  {
      const XMLCh* const scannerName = XMLUni::fgWFScanner;
      return scannerName;
  }
  
  
  XERCES_CPP_NAMESPACE_END
  
  #endif
  
  
  
  1.1                  xml-xerces/c/src/xercesc/internal/WFXMLScanner.cpp
  
  Index: WFXMLScanner.cpp
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written
   *    permission, please contact apache\@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation, and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.ibm.com .  For more information
   * on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /*
    * $Id: WFXMLScanner.cpp,v 1.1 2002/12/04 02:01:29 knoaman Exp $
   */
  
  
  // ---------------------------------------------------------------------------
  //  Includes
  // ---------------------------------------------------------------------------
  #include <xercesc/internal/WFXMLScanner.hpp>
  #include <xercesc/util/Janitor.hpp>
  #include <xercesc/util/RuntimeException.hpp>
  #include <xercesc/util/UnexpectedEOFException.hpp>
  #include <xercesc/util/XMLUniDefs.hpp>
  #include <xercesc/util/XMLUni.hpp>
  #include <xercesc/sax/InputSource.hpp>
  #include <xercesc/framework/XMLDocumentHandler.hpp>
  #include <xercesc/framework/XMLErrorReporter.hpp>
  #include <xercesc/framework/XMLEntityHandler.hpp>
  #include <xercesc/framework/XMLPScanToken.hpp>
  #include <xercesc/framework/XMLValidator.hpp>
  #include <xercesc/framework/XMLValidityCodes.hpp>
  #include <xercesc/internal/EndOfEntityException.hpp>
  
  
  XERCES_CPP_NAMESPACE_BEGIN
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Constructors and Destructor
  // ---------------------------------------------------------------------------
  //  Constructs a scanner from a validator only. The validator is handed to
  //  the XMLScanner base class; the helper containers come from commonInit().
  WFXMLScanner::WFXMLScanner(XMLValidator* const valToAdopt)
      : XMLScanner(valToAdopt),
        fEntityTable(0),
        fAttrNameHashList(0),
        fAttrNSList(0)
  {
      //  The pointer members start out null, so cleanUp() can delete them
      //  safely no matter how far commonInit() got before it failed.
      try
      {
          commonInit();
      }
      catch(...)
      {
          cleanUp();
          throw;
      }
  }
  
  //  Constructs a scanner with the full set of handlers. All arguments are
  //  forwarded to the XMLScanner base class; the helper containers come from
  //  commonInit().
  WFXMLScanner::WFXMLScanner(XMLDocumentHandler* const docHandler,
                             DocTypeHandler* const     docTypeHandler,
                             XMLEntityHandler* const   entityHandler,
                             XMLErrorReporter* const   errHandler,
                             XMLValidator* const       valToAdopt)
      : XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt),
        fEntityTable(0),
        fAttrNameHashList(0),
        fAttrNSList(0)
  {
      //  The pointer members start out null, so cleanUp() can delete them
      //  safely no matter how far commonInit() got before it failed.
      try
      {
          commonInit();
      }
      catch(...)
      {
          cleanUp();
          throw;
      }
  }
  
  //  Releases the containers that were allocated by commonInit().
  WFXMLScanner::~WFXMLScanner()
  {
      this->cleanUp();
  }
  
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Main entry point to scan a document
  // ---------------------------------------------------------------------------
  //  Scans the whole document found in src in one call: reset, prolog,
  //  content and trailing miscellaneous part. The reader manager is reset on
  //  every way out of this method.
  void WFXMLScanner::scanDocument(const InputSource& src)
  {
      //  Advance the sequence id of this parser instance; progressive scan
      //  tokens handed out before this point become invalid.
      ++fSequenceId;

      try
      {
          //  Prepare for a new run. This resets the data structures, creates
          //  the reader for the primary entity and pushes it on the stack.
          scanReset(src);

          // Announce the start of the document to the handler, if any
          if (fDocHandler)
              fDocHandler->startDocument();

          //  The prolog is everything in front of the root element.
          scanProlog();

          if (!fReaderMgr.atEOF())
          {
              //  Scan the content (not as an external entity). When that went
              //  fine and input is left over, scan the miscellaneous part.
              if (scanContent(false) && !fReaderMgr.atEOF())
                  scanMiscellaneous();
          }
          else
          {
              //  The input ended before any content, so this is not a valid
              //  XML file.
              emitError(XMLErrs::EmptyMainEntity);
          }

          // Announce the end of the document to the handler, if any
          if (fDocHandler)
              fDocHandler->endDocument();

          // Let the reader manager close its files, sockets, etc...
          fReaderMgr.reset();
      }
      //  NOTE:
      //
      //  In every handler below emitError() has to be called BEFORE the
      //  reader manager is reset, since emitError() looks up the position of
      //  the error in the XML source.
      catch(const XMLErrs::Codes)
      {
          // 'First fatal error' exit: just reset and fall through
          fReaderMgr.reset();
      }
      catch(const XMLValid::Codes)
      {
          // 'First fatal error' exit: just reset and fall through
          fReaderMgr.reset();
      }
      catch(const XMLException& excToCatch)
      {
          //  Report the exception through emitError(). An exception thrown by
          //  the user from there is passed on after the reader manager has
          //  been reset.
          fInException = true;
          try
          {
              //  Pick the error code that matches the severity of the
              //  exception, then report it once.
              XMLErrs::Codes errCode = XMLErrs::XMLException_Error;
              if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                  errCode = XMLErrs::XMLException_Warning;
              else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                  errCode = XMLErrs::XMLException_Fatal;

              emitError(errCode, excToCatch.getType(), excToCatch.getMessage());
          }
          catch(...)
          {
              // Reset the reader manager, then pass the user's error on
              fReaderMgr.reset();
              throw;
          }

          // emitError() returned normally: reset and fall through
          fReaderMgr.reset();
      }
      catch(...)
      {
          // Anything else: reset and pass it on
          fReaderMgr.reset();
          throw;
      }
  }
  
  
  //  Performs one step of a progressive scan. Returns true while scanning
  //  can go on and false once the end of input was reached or an error
  //  stopped the scan; in the false case the reader manager has been reset.
  bool WFXMLScanner::scanNext(XMLPScanToken& token)
  {
      // A token that is no longer legal must not be used
      if (!isLegalToken(token))
          ThrowXML(RuntimeException, XMLExcepts::Scan_BadPScanToken);

      unsigned int orgReader;
      XMLTokens curToken;
      bool retVal = true;

      try
      {
          //  Find the next token and remember the reader it came from. Any
          //  number of nested entities may end here, each one raising an end
          //  of entity exception, so keep trying until a token is sensed.
          for (;;)
          {
              try
              {
                  curToken = senseNextToken(orgReader);
                  break;
              }
              catch(const EndOfEntityException& toCatch)
              {
                  // Report the end of the entity reference
                  if (fDocHandler)
                      fDocHandler->endEntityReference(toCatch.getEntity());
              }
          }

          if (curToken == Token_EOF)
          {
              //  Elements still on the stack at the end of input were never
              //  closed.
              if (!fElemStack.isEmpty())
              {
                  const WFElemStack::StackElem* openElem = fElemStack.popTop();
                  emitError(XMLErrs::EndedWithTagsOnStack, openElem->fThisElement);
              }

              retVal = false;
          }
          else if (curToken == Token_CharData)
          {
              scanCharData(fCDataBuf);
          }
          else
          {
              // Some kind of markup; dispatch on the token
              bool gotData = true;
              if (curToken == Token_StartTag)
              {
                  if (fDoNamespaces)
                      scanStartTagNS(gotData);
                  else
                      scanStartTag(gotData);
              }
              else if (curToken == Token_EndTag)
              {
                  scanEndTag(gotData);
              }
              else if (curToken == Token_CData)
              {
                  // A CDATA section is only allowed within content
                  if (fElemStack.isEmpty())
                      emitError(XMLErrs::CDATAOutsideOfContent);
                  scanCDSection();
              }
              else if (curToken == Token_Comment)
              {
                  scanComment();
              }
              else if (curToken == Token_PI)
              {
                  scanPI();
              }
              else
              {
                  fReaderMgr.skipToChar(chOpenAngle);
              }

              //  The markup has to end in the reader it started in
              if (orgReader != fReaderMgr.getCurrentReaderNum())
                  emitError(XMLErrs::PartialMarkupInEntity);

              //  No more data: scan the miscellaneous part and tell the
              //  handler that the document is done.
              if (!gotData)
              {
                  scanMiscellaneous();

                  if (fDocHandler)
                      fDocHandler->endDocument();
              }
          }
      }
      //  NOTE:
      //
      //  In every handler below emitError() has to be called BEFORE the
      //  reader manager is reset, since emitError() looks up the position of
      //  the error in the XML source.
      catch(const XMLErrs::Codes)
      {
          // 'First failure' exit: reset and report failure
          fReaderMgr.reset();
          return false;
      }
      catch(const XMLValid::Codes)
      {
          // 'First fatal error' exit: reset and report failure
          fReaderMgr.reset();
          return false;
      }
      catch(const XMLException& excToCatch)
      {
          //  Report the exception through emitError(). An exception thrown by
          //  the user from there is passed on after the reader manager has
          //  been reset.
          fInException = true;
          try
          {
              //  Pick the error code that matches the severity of the
              //  exception, then report it once.
              XMLErrs::Codes errCode = XMLErrs::XMLException_Error;
              if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                  errCode = XMLErrs::XMLException_Warning;
              else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                  errCode = XMLErrs::XMLException_Fatal;

              emitError(errCode, excToCatch.getType(), excToCatch.getMessage());
          }
          catch(...)
          {
              // Reset, then pass the user's error on
              fReaderMgr.reset();
              throw;
          }

          // emitError() returned normally: reset and report failure
          fReaderMgr.reset();
          return false;
      }
      catch(...)
      {
          // Anything else: reset and pass the original error on
          fReaderMgr.reset();
          throw;
      }

      if (retVal)
          return true;

      // The end was reached, so flush the reader manager
      fReaderMgr.reset();
      return false;
  }
  
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Private helper methods.
  // ---------------------------------------------------------------------------
  
  //  This method handles the common initialization, to avoid having to do
  //  it redundantly in multiple constructors.
  void WFXMLScanner::commonInit()
  {
      fEntityTable = new ValueHashTableOf<XMLCh>(11);
      fAttrNameHashList = new ValueVectorOf<unsigned int>(16);
      fAttrNSList = new ValueVectorOf<XMLAttr*>(8);
  
      //  Add the default entity entries for the character refs that must always
      //  be present.
      fEntityTable->put((void*) XMLUni::fgAmp, chAmpersand);
      fEntityTable->put((void*) XMLUni::fgLT, chOpenAngle);
      fEntityTable->put((void*) XMLUni::fgGT, chCloseAngle);
      fEntityTable->put((void*) XMLUni::fgQuot, chDoubleQuote);
      fEntityTable->put((void*) XMLUni::fgApos, chSingleQuote);
  }
  
  //  Deletes the containers allocated by commonInit(), in reverse order of
  //  their allocation. Members that are still null are skipped by delete.
  void WFXMLScanner::cleanUp()
  {
      delete fAttrNSList;
      delete fAttrNameHashList;
      delete fEntityTable;
  }
  
  //  Maps a namespace prefix to a URI id. The prefixes 'xmlns' and 'xml' are
  //  answered directly with their dedicated ids; every other prefix is
  //  looked up through the element stack.
  unsigned int WFXMLScanner::resolvePrefix(const XMLCh* const prefix,
                                           const WFElemStack::MapModes mode)
  {
      // The two special prefixes never go through the element stack
      if (XMLString::equals(prefix, XMLUni::fgXMLNSString))
          return fXMLNSNamespaceId;

      if (XMLString::equals(prefix, XMLUni::fgXMLString))
          return fXMLNamespaceId;

      //  Let the element stack search itself for a mapping. It tells us via
      //  the flag when it had none for this prefix.
      bool notMapped;
      const unsigned int mappedId = fElemStack.mapPrefixToURI(prefix, mode, notMapped);

      //  An id is returned even for an unknown prefix, but the error still
      //  has to be reported.
      if (notMapped)
          emitError(XMLErrs::UnknownPrefix, prefix);

      return mappedId;
  }
  
  //  This method will reset the scanner data structures, and related plugged
  //  in stuff, for a new scan session. We get the input source for the primary
  //  XML entity, create the reader for it, and push it on the stack so that
  //  upon successful return from here we are ready to go.
  void WFXMLScanner::scanReset(const InputSource& src)
  {
      //  And for all installed handlers, send reset events. This gives them
      //  a chance to flush any cached data.
      if (fDocHandler)
          fDocHandler->resetDocument();
      if (fEntityHandler)
          fEntityHandler->resetEntities();
      if (fErrorReporter)
          fErrorReporter->resetErrors();
  
      //  Reset the element stack, and give it the latest ids for the special
      //  URIs it has to know about.
      fElemStack.reset
      (
          fEmptyNamespaceId
          , fUnknownNamespaceId
          , fXMLNamespaceId
          , fXMLNSNamespaceId
      );
  
      // Reset some status flags
      fInException = false;
      fStandalone = false;
      fErrorCount = 0;
      fHasNoDTD = true;
  
      //  Handle the creation of the XML reader object for this input source.
      //  This will provide us with transcoding and basic lexing services.
      XMLReader* newReader = fReaderMgr.createReader
      (
          src
          , true
          , XMLReader::RefFrom_NonLiteral
          , XMLReader::Type_General
          , XMLReader::Source_External
          , fCalculateSrcOfs
      );
  
      if (!newReader) {
          if (src.getIssueFatalErrorIfNotFound())
              ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, 
src.getSystemId());
          else
              ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, 
src.getSystemId());
      }
  
      // Push this read onto the reader manager
      fReaderMgr.pushReader(newReader, 0);
  }
  
  //  Passes the characters collected in toSend to the document handler, if
  //  one is installed, and then empties the buffer. An empty buffer is left
  //  untouched and nothing is sent.
  void WFXMLScanner::sendCharData(XMLBuffer& toSend)
  {
      if (!toSend.isEmpty())
      {
          //  Not validating, so the data is always reported as plain
          //  character data.
          if (fDocHandler)
              fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);

          // The buffer can be reused now
          toSend.reset();
      }
  }
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Getter methods
  // ---------------------------------------------------------------------------
  //  This scanner has no entity declaration pool to offer, so a null pointer
  //  is returned.
  NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool()
  {
      NameIdPool<DTDEntityDecl>* const noPool = 0;
      return noPool;
  }

  //  Const flavour of the getter above; it returns a null pointer as well.
  const NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() const
  {
      const NameIdPool<DTDEntityDecl>* const noPool = 0;
      return noPool;
  }
  
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Private scanning methods
  // ---------------------------------------------------------------------------
  
  //  This method will kick off the scanning of the primary content of the
  //  document, i.e. the elements.
  bool WFXMLScanner::scanContent(const bool extEntity)
  {
      //  Go into a loop until we hit the end of the root element, or we fall
      //  out because there is no root element.
      //
      //  We have to do kind of a deeply nested double loop here in order to
      //  avoid doing the setup/teardown of the exception handler on each
      //  round. Doing it this way we only do it when an exception actually
      //  occurs.
      bool gotData = true;
      bool inMarkup = false;
      while (gotData)
      {
          try
          {
              while (gotData)
              {
                  //  Sense what the next top level token is. According to what
                  //  this tells us, we will call something to handle that kind
                  //  of thing.
                  unsigned int orgReader;
                  const XMLTokens curToken = senseNextToken(orgReader);
  
                  //  Handle character data and end of file specially. Char data
                  //  is not markup so we don't want to handle it in the loop
                  //  below.
                  if (curToken == Token_CharData)
                  {
                      //  Scan the character data and call appropriate events. Let
                      //  him use our local character data buffer for efficiency.
                      scanCharData(fCDataBuf);
                      continue;
                  }
                  else if (curToken == Token_EOF)
                  {
                      //  The element stack better be empty at this point or we
                      //  ended prematurely before all elements were closed.
                      if (!fElemStack.isEmpty())
                      {
                          const WFElemStack::StackElem* topElem = fElemStack.popTop();
                          emitError
                          (
                              XMLErrs::EndedWithTagsOnStack
                              , topElem->fThisElement
                          );
                      }
  
                      // Its the end of file, so clear the got data flag
                      gotData = false;
                      continue;
                  }
  
                  // We are in some sort of markup now
                  inMarkup = true;
  
                  //  According to the token we got, call the appropriate
                  //  scanning method.
                  switch(curToken)
                  {
                      case Token_CData :
                          // Make sure we are within content
                          if (fElemStack.isEmpty())
                              emitError(XMLErrs::CDATAOutsideOfContent);
                          scanCDSection();
                          break;
  
                      case Token_Comment :
                          scanComment();
                          break;
  
                      case Token_EndTag :
                          scanEndTag(gotData);
                          break;
  
                      case Token_PI :
                          scanPI();
                          break;
  
                      case Token_StartTag :
                          if (fDoNamespaces)
                              scanStartTagNS(gotData);
                          else
                              scanStartTag(gotData);
                          break;
  
                      default :
                          fReaderMgr.skipToChar(chOpenAngle);
                          break;
                  }
  
                  if (orgReader != fReaderMgr.getCurrentReaderNum())
                      emitError(XMLErrs::PartialMarkupInEntity);
  
                  // And we are back out of markup again
                  inMarkup = false;
              }
          }
          catch(const EndOfEntityException& toCatch)
          {
              //  If we were in some markup when this happened, then its a
              //  partial markup error.
              if (inMarkup)
                  emitError(XMLErrs::PartialMarkupInEntity);
  
              // Send an end of entity reference event
              if (fDocHandler)
                  fDocHandler->endEntityReference(toCatch.getEntity());
  
              inMarkup = false;
          }
      }
  
      // It went ok, so return success
      return true;
  }
  
  
  void WFXMLScanner::scanEndTag(bool& gotData)
  {
      //  Assume we will still have data until proven otherwise. It will only
      //  ever be false if this is the end of the root element.
      gotData = true;
  
      //  Check if the element stack is empty. If so, then this is an unbalanced
      //  element (i.e. more ends than starts, perhaps because of bad text
      //  causing one to be skipped.)
      if (fElemStack.isEmpty())
      {
          emitError(XMLErrs::MoreEndThanStartTags);
          fReaderMgr.skipPastChar(chCloseAngle);
          ThrowXML(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd);
      }
  
      // After the </ is the element QName, so get a name from the input
      if (!fReaderMgr.getName(fQNameBuf))
      {
          // It failed so we can't really do anything with it
          emitError(XMLErrs::ExpectedElementName);
          fReaderMgr.skipPastChar(chCloseAngle);
          return;
      }
  
      //  Pop the stack of the element we are supposed to be ending. Remember
      //  that we don't own this. The stack just keeps them and reuses them.
      unsigned int uriId = (fDoNamespaces)
          ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
      const WFElemStack::StackElem* topElem = fElemStack.popTop();    
  
      // See if it was the root element, to avoid multiple calls below
      const bool isRoot = fElemStack.isEmpty();
  
      // Make sure that its the end of the element that we expect
      if (!XMLString::equals(topElem->fThisElement, fQNameBuf.getRawBuffer())) {
          emitError
          (
              XMLErrs::ExpectedEndOfTagX
              , topElem->fThisElement
          );
      }
  
      // Make sure we are back on the same reader as where we started
      if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
          emitError(XMLErrs::PartialTagMarkupError);
  
      // Skip optional whitespace
      fReaderMgr.skipPastSpaces();
  
      // Make sure we find the closing bracket
      if (!fReaderMgr.skippedChar(chCloseAngle))
      {
          emitError
          (
              XMLErrs::UnterminatedEndTag
              , topElem->fThisElement
          );
      }
  
      // If we have a doc handler, tell it about the end tag
      if (fDocHandler)
      {
          fDTDElemDecl.setElementName(topElem->fThisElement, uriId);
          fDocHandler->endElement
          (
              fDTDElemDecl
              , uriId
              , isRoot
              , fDTDElemDecl.getElementName()->getPrefix()
          );
      }
  
      // If this was the root, then done with content
      gotData = !isRoot;
  }
  
  
  void WFXMLScanner::scanDocTypeDecl()
  {
      // Just skips over it
      // REVISIT: Should we issue a warning
      fReaderMgr.skipPastChar(chCloseAngle);
  }
  
  bool WFXMLScanner::scanStartTag(bool& gotData)
  {
      // Assume we will still have data until proven otherwise. It will only
      // ever be false if this is the root and its empty.
      gotData = true;
  
      //  Get the QName. In this case, we are not doing namespaces, so we just
      //  use it as is and don't have to break it into parts.
      if (!fReaderMgr.getName(fQNameBuf))
      {
          emitError(XMLErrs::ExpectedElementName);
          fReaderMgr.skipToChar(chOpenAngle);
          return false;
      }
  
      // Assume it won't be an empty tag
      bool isEmpty = false;
  
      // See if its the root element
      const bool isRoot = fElemStack.isEmpty();
  
      // Skip any whitespace after the name
      fReaderMgr.skipPastSpaces();
  
      // Expand the element stack and add the new element
      const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
      fElemStack.addLevel
      (
          qnameRawBuf
          , fQNameBuf.getLen()
          , fReaderMgr.getCurrentReaderNum()
      );
  
      //  We loop until we either see a /> or >, handling attribute/value
      //  pairs until we get there.
      unsigned int    attCount = 0;
      unsigned int    curAttListSize = fAttrList->size();
      while (true)
      {
          // And get the next non-space character
          XMLCh nextCh = fReaderMgr.peekNextChar();
  
          //  If the next character is not a slash or closed angle bracket,
          //  then it must be whitespace, since whitespace is required
          //  between the end of the last attribute and the name of the next
          //  one.
          if (attCount)
          {
              if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
              {
                  if (XMLReader::isWhitespace(nextCh))
                  {
                      // Ok, skip by them and peek another char
                      fReaderMgr.skipPastSpaces();
                      nextCh = fReaderMgr.peekNextChar();
                  }
                  else
                  {
                      // Emit the error but keep on going
                      emitError(XMLErrs::ExpectedWhitespace);
                  }
              }
          }
  
          //  Ok, here we first check for any of the special case characters.
          //  If its not one, then we do the normal case processing, which
          //  assumes that we've hit an attribute value, Otherwise, we do all
          //  the special case checks.
          if (!XMLReader::isSpecialStartTagChar(nextCh))
          {
              //  Assume its going to be an attribute, so get a name from
              //  the input.
              if (!fReaderMgr.getName(fAttNameBuf))
              {
                  emitError(XMLErrs::ExpectedAttrName);
                  fReaderMgr.skipPastChar(chCloseAngle);
                  return false;
              }
  
              // And next must be an equal sign
              if (!scanEq())
              {
                  static const XMLCh tmpList[] =
                  {
                      chSingleQuote, chDoubleQuote, chCloseAngle
                      , chOpenAngle, chForwardSlash, chNull
                  };
  
                  emitError(XMLErrs::ExpectedEqSign);
  
                  //  Try to sync back up by skipping forward until we either
                  //  hit something meaningful.
                  const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
  
                  if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
                  {
                      // Jump back to top for normal processing of these
                      continue;
                  }
                  else if ((chFound == chSingleQuote)
                        ||  (chFound == chDoubleQuote)
                        ||  XMLReader::isWhitespace(chFound))
                  {
                      // Just fall through assuming that the value is to follow
                  }
                  else if (chFound == chOpenAngle)
                  {
                      // Assume a malformed tag and that new one is starting
                      emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                      return false;
                  }
                  else
                  {
                      // Something went really wrong
                      return false;
                  }
              }
  
              //  See if this attribute is declared more than one for this element.
              const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer(); 
              unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109);
  
              if (attCount) {
  
                  for (unsigned int k=0; k < attCount; k++) {
  
                      if (fAttrNameHashList->elementAt(k) == attNameHash) {
                          if (
                                 XMLString::equals
                                 ( 
                                     fAttrList->elementAt(k)->getName()
                                     , attNameRawBuf
                                 )
                             )
                          {
                              emitError
                              (
                                  XMLErrs::AttrAlreadyUsedInSTag
                                  , attNameRawBuf
                                  , qnameRawBuf
                              );
                              break;
                          }
                      }
                  }
              }
  
              //  Skip any whitespace before the value and then scan the att
              //  value. This will come back normalized with entity refs and
              //  char refs expanded.
              fReaderMgr.skipPastSpaces();
              if (!scanAttValue(attNameRawBuf, fAttValueBuf))
              {
                  static const XMLCh tmpList[] =
                  {
                      chCloseAngle, chOpenAngle, chForwardSlash, chNull
                  };
  
                  emitError(XMLErrs::ExpectedAttrValue);
  
                  //  It failed, so lets try to get synced back up. We skip
                  //  forward until we find some whitespace or one of the
                  //  chars in our list.
                  const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
  
                  if ((chFound == chCloseAngle)
                  ||  (chFound == chForwardSlash)
                  ||  XMLReader::isWhitespace(chFound))
                  {
                      //  Just fall through and process this attribute, though
                      //  the value will be "".
                  }
                  else if (chFound == chOpenAngle)
                  {
                      // Assume a malformed tag and that new one is starting
                      emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                      return false;
                  }
                  else
                  {
                      // Something went really wrong
                      return false;
                  }
              }
  
              //  Add this attribute to the attribute list that we use to
              //  pass them to the handler. We reuse its existing elements
              //  but expand it as required.
              if (attCount >= curAttListSize)
              {
                  fAttrList->addElement
                  (
                      new XMLAttr
                      (
                          -1
                          , attNameRawBuf
                          , XMLUni::fgZeroLenString
                          , fAttValueBuf.getRawBuffer()
                      )
                  );                
                  fAttrNameHashList->addElement(attNameHash);
              }
              else
              {
                  XMLAttr* curAtt = fAttrList->elementAt(attCount);
                  curAtt->set
                  (
                      -1
                      , attNameRawBuf
                      , XMLUni::fgZeroLenString
                      , fAttValueBuf.getRawBuffer()
                  );
              }
  
              attCount++;
  
              // And jump back to the top of the loop
              continue;
          }
  
          //  It was some special case character so do all of the checks and
          //  deal with it.
          if (!nextCh)
              ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
  
          if (nextCh == chForwardSlash)
          {
              fReaderMgr.getNextChar();
              isEmpty = true;
              if (!fReaderMgr.skippedChar(chCloseAngle))
                  emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
              break;
          }
          else if (nextCh == chCloseAngle)
          {
              fReaderMgr.getNextChar();
              break;
          }
          else if (nextCh == chOpenAngle)
          {
              //  Check for this one specially, since its going to be common
              //  and it is kind of auto-recovering since we've already hit the
              //  next open bracket, which is what we would have seeked to (and
              //  skipped this whole tag.)
              emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
              break;
          }
          else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
          {
              //  Check for this one specially, which is probably a missing
              //  attribute name, e.g. ="value". Just issue expected name
              //  error and eat the quoted string, then jump back to the
              //  top again.
              emitError(XMLErrs::ExpectedAttrName);
              fReaderMgr.getNextChar();
              fReaderMgr.skipQuotedString(nextCh);
              fReaderMgr.skipPastSpaces();
              continue;
          }
      }
  
      //  If we have a document handler, then tell it about this start tag. We
      //  don't have any URI id to send along, so send fEmptyNamespaceId. We also do 
not send
      //  any prefix since its just one big name if we are not doing namespaces.
      if (fDocHandler)
      {
          fDTDElemDecl.setElementName(qnameRawBuf, fEmptyNamespaceId);
          fDocHandler->startElement
          (
              fDTDElemDecl
              , fEmptyNamespaceId
              , 0
              , *fAttrList
              , attCount
              , isEmpty
              , isRoot
          );
      }
  
      //  If empty, validate content right now if we are validating and then
      //  pop the element stack top.
      if (isEmpty)
      {
          // Pop the element stack back off since it'll never be used now
          fElemStack.popTop();
  
          // If the elem stack is empty, then it was an empty root
          if (isRoot)
              gotData = false;
      }
  
      return true;
  }
  
  
  //  Scans a start tag when namespace processing is enabled. (scanStartTag
  //  is the counterpart used when namespaces are disabled.)
  //
  //  This method is called after we've scanned the < of a start tag. So we
  //  have to get the element name, then scan the attributes, after which
  //  we are either going to see >, />, or attributes followed by one of those
  //  sequences.
  //
  //  While scanning attributes, xmlns / xmlns:prefix declarations are added
  //  to the element stack, the 'xml' prefix is bound to fXMLNamespaceId, and
  //  every other prefixed attribute is queued in fAttrNSList so that its
  //  prefix can be resolved once all declarations of this tag are known.
  //
  //  Prefix resolution for the queued attributes and for the element itself
  //  is done whether or not a document handler is installed, so that prefix
  //  errors are reported and the element stack's current URI is set in both
  //  cases. The handler, if any, gets startElement (always with the empty
  //  flag false) and, for an empty tag, an immediate endElement.
  //
  //  gotData is set to false only when the tag is an empty root element;
  //  otherwise it is set to true.
  //
  //  Returns false when the tag is too malformed to continue with, true
  //  otherwise. Throws UnexpectedEOFException (via ThrowXML) if the input
  //  ends inside the tag.
  bool WFXMLScanner::scanStartTagNS(bool& gotData)
  {
      //  Assume we will still have data until proven otherwise. It will only
      //  ever be false if this is the root and its empty.
      gotData = true;

      //  The current position is after the open bracket, so we need to read
      //  in the element name.
      if (!fReaderMgr.getName(fQNameBuf))
      {
          emitError(XMLErrs::ExpectedElementName);
          fReaderMgr.skipToChar(chOpenAngle);
          return false;
      }

      // See if its the root element
      const bool isRoot = fElemStack.isEmpty();

      // Assume it won't be an empty tag
      bool isEmpty = false;

      // Skip any whitespace after the name
      fReaderMgr.skipPastSpaces();

      // Expand the element stack and add the new element
      const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
      fElemStack.addLevel
      (
          qnameRawBuf
          , fQNameBuf.getLen()
          , fReaderMgr.getCurrentReaderNum()
      );

      // reset NS attribute list
      fAttrNSList->removeAllElements();

      //  We loop until we either see a /> or >, handling attribute/value
      //  pairs until we get there.
      unsigned int attCount = 0;
      unsigned int curAttListSize = fAttrList->size();
      while (true)
      {
          // And get the next non-space character
          XMLCh nextCh = fReaderMgr.peekNextChar();

          //  If the next character is not a slash or closed angle bracket,
          //  then it must be whitespace, since whitespace is required
          //  between the end of the last attribute and the name of the next
          //  one.
          if (attCount)
          {
              if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
              {
                  if (XMLReader::isWhitespace(nextCh))
                  {
                      // Ok, skip by them and peek another char
                      fReaderMgr.skipPastSpaces();
                      nextCh = fReaderMgr.peekNextChar();
                  }
                  else
                  {
                      // Emit the error but keep on going
                      emitError(XMLErrs::ExpectedWhitespace);
                  }
              }
          }

          //  Ok, here we first check for any of the special case characters.
          //  If its not one, then we do the normal case processing, which
          //  assumes that we've hit an attribute value, Otherwise, we do all
          //  the special case checks.
          if (!XMLReader::isSpecialStartTagChar(nextCh))
          {
              //  Assume its going to be an attribute, so get a name from
              //  the input.
              if (!fReaderMgr.getName(fAttNameBuf))
              {
                  emitError(XMLErrs::ExpectedAttrName);
                  fReaderMgr.skipPastChar(chCloseAngle);
                  return false;
              }

              // And next must be an equal sign
              if (!scanEq())
              {
                  static const XMLCh tmpList[] =
                  {
                      chSingleQuote, chDoubleQuote, chCloseAngle
                      , chOpenAngle, chForwardSlash, chNull
                  };

                  emitError(XMLErrs::ExpectedEqSign);

                  //  Try to sync back up by skipping forward until we
                  //  hit something meaningful.
                  const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                  if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
                  {
                      // Jump back to top for normal processing of these
                      continue;
                  }
                  else if ((chFound == chSingleQuote)
                        ||  (chFound == chDoubleQuote)
                        ||  XMLReader::isWhitespace(chFound))
                  {
                      // Just fall through assuming that the value is to follow
                  }
                  else if (chFound == chOpenAngle)
                  {
                      // Assume a malformed tag and that new one is starting
                      emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                      return false;
                  }
                  else
                  {
                      // Something went really wrong
                      return false;
                  }
              }

              //  See if this attribute is declared more than once for this
              //  element. The hash list runs parallel to the attribute list
              //  and lets us skip the string compare for most entries.
              const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
              unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109);
              if (attCount) {

                  for (unsigned int k=0; k < attCount; k++) {

                      if (fAttrNameHashList->elementAt(k) == attNameHash) {
                          if (XMLString::equals(
                                  fAttrList->elementAt(k)->getQName()
                                  , attNameRawBuf))
                          {
                              emitError
                              (
                                  XMLErrs::AttrAlreadyUsedInSTag
                                  , attNameRawBuf
                                  , qnameRawBuf
                              );
                              break;
                          }
                      }
                  }
              }

              //  Skip any whitespace before the value and then scan the att
              //  value. This will come back normalized with entity refs and
              //  char refs expanded.
              fReaderMgr.skipPastSpaces();
              if (!scanAttValue(attNameRawBuf, fAttValueBuf))
              {
                  static const XMLCh tmpList[] =
                  {
                      chCloseAngle, chOpenAngle, chForwardSlash, chNull
                  };

                  emitError(XMLErrs::ExpectedAttrValue);

                  //  It failed, so lets try to get synced back up. We skip
                  //  forward until we find some whitespace or one of the
                  //  chars in our list.
                  const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                  if ((chFound == chCloseAngle)
                  ||  (chFound == chForwardSlash)
                  ||  XMLReader::isWhitespace(chFound))
                  {
                      //  Just fall through and process this attribute, though
                      //  the value will be "".
                  }
                  else if (chFound == chOpenAngle)
                  {
                      // Assume a malformed tag and that new one is starting
                      emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                      return false;
                  }
                  else
                  {
                      // Something went really wrong
                      return false;
                  }
              }

              //  Add this attribute to the attribute list that we use to
              //  pass them to the handler. We reuse its existing elements
              //  but expand it as required.
              const XMLCh* attValueRawBuf = fAttValueBuf.getRawBuffer();
              XMLAttr* curAtt = 0;
              if (attCount >= curAttListSize)
              {
                  curAtt = new XMLAttr
                  (
                      fEmptyNamespaceId
                      , attNameRawBuf
                      , attValueRawBuf
                  );
                  fAttrList->addElement(curAtt);
                  fAttrNameHashList->addElement(attNameHash);
              }
              else
              {
                  curAtt = fAttrList->elementAt(attCount);
                  curAtt->set
                  (
                      fEmptyNamespaceId
                      , attNameRawBuf
                      , attValueRawBuf
                  );
                  fAttrNameHashList->setElementAt(attNameHash, attCount);
              }

              //  Make sure that the name is basically well formed for
              //  namespaces, and map the well-known prefixes right away.
              const XMLCh* attPrefix = curAtt->getPrefix();
              const XMLCh* attLocalName = curAtt->getName();
              if (attPrefix && *attPrefix) {

                  int colonPos = XMLString::indexOf(attLocalName, chColon);

                  if (colonPos != -1) {

                      //  A colon in the local part means the QName had more
                      //  than one. The attribute is dropped: attCount is not
                      //  advanced, and the cached list size is refreshed so
                      //  that the slot just written gets reused.
                      curAttListSize = fAttrList->size();
                      emitError(XMLErrs::TooManyColonsInName);
                      continue;
                  }

                  if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
                      curAtt->setURIId(fXMLNamespaceId);
                  }
                  else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {

                      // xmlns:prefix="uri" declares a prefix for this level
                      fElemStack.addPrefix
                      (
                          attLocalName
                          , fURIStringPool->addOrFind(fAttValueBuf.getRawBuffer())
                      );
                      curAtt->setURIId(fXMLNSNamespaceId);
                  }
                  else {
                      //  Some other prefix. It may be declared later in this
                      //  same tag, so resolve it after the loop.
                      fAttrNSList->addElement(curAtt);
                  }
              }
              else {
                  if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) {

                      // xmlns="uri" declares the default namespace
                      fElemStack.addPrefix
                      (
                          XMLUni::fgZeroLenString
                          , fURIStringPool->addOrFind(fAttValueBuf.getRawBuffer())
                      );
                  }
              }

              // increment attribute count
              attCount++;

              // And jump back to the top of the loop
              continue;
          }

          //  It was some special case character so do all of the checks and
          //  deal with it.
          if (!nextCh)
              ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

          if (nextCh == chForwardSlash)
          {
              fReaderMgr.getNextChar();
              isEmpty = true;
              if (!fReaderMgr.skippedChar(chCloseAngle))
                  emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
              break;
          }
          else if (nextCh == chCloseAngle)
          {
              fReaderMgr.getNextChar();
              break;
          }
          else if (nextCh == chOpenAngle)
          {
              //  Check for this one specially, since its going to be common
              //  and it is kind of auto-recovering since we've already hit the
              //  next open bracket, which is what we would have seeked to (and
              //  skipped this whole tag.)
              emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
              break;
          }
          else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
          {
              //  Check for this one specially, which is probably a missing
              //  attribute name, e.g. ="value". Just issue expected name
              //  error and eat the quoted string, then jump back to the
              //  top again.
              emitError(XMLErrs::ExpectedAttrName);
              fReaderMgr.getNextChar();
              fReaderMgr.skipQuotedString(nextCh);
              fReaderMgr.skipPastSpaces();
              continue;
          }
      }

      //  Set up the element declaration first; its split-out prefix is what
      //  gets resolved below.
      fDTDElemDecl.setElementName(qnameRawBuf, fEmptyNamespaceId);

      //  Handle provided attributes that we did not map their prefixes. This
      //  is done even without a document handler so that prefix problems are
      //  still reported.
      for (unsigned int i=0; i < fAttrNSList->size(); i++) {

          XMLAttr* providedAttr = fAttrNSList->elementAt(i);

          providedAttr->setURIId
          (
              resolvePrefix
              (
                  providedAttr->getPrefix(),
                  WFElemStack::Mode_Attribute
              )
          );
      }

      // Resolve the qualified name to a URI.
      const unsigned int uriId = resolvePrefix
      (
          fDTDElemDecl.getElementName()->getPrefix()
          , WFElemStack::Mode_Element
      );

      //  Now we can update the element stack. scanEndTag reads this value
      //  back when the element is closed.
      fElemStack.setCurrentURI(uriId);

      //  Tell the document handler about this start tag. The empty flag is
      //  always passed as false here; an empty tag gets its own endElement
      //  call below instead.
      if (fDocHandler)
      {
          fDocHandler->startElement
          (
              fDTDElemDecl
              , uriId
              , fDTDElemDecl.getElementName()->getPrefix()
              , *fAttrList
              , attCount
              , false
              , isRoot
          );
      }

      //  An empty tag has no content and no separate end tag, so close it
      //  right away.
      if (isEmpty)
      {
          // Pop the element stack back off since it'll never be used now
          fElemStack.popTop();

          // If we have a doc handler, tell it about the end tag
          if (fDocHandler)
          {
              fDocHandler->endElement
              (
                  fDTDElemDecl
                  , uriId
                  , isRoot
                  , fDTDElemDecl.getElementName()->getPrefix()
              );
          }

          // If this was the root, then it was an empty root
          if (isRoot)
              gotData = false;
      }

      return true;
  }
  
  unsigned int
  WFXMLScanner::resolveQName(const   XMLCh* const qName
                             ,       XMLBuffer&   prefixBuf
                             , const short        mode
                             ,       int&         prefixColonPos)
  {
      //  Lets split out the qName into a URI and name buffer first. The URI
      //  can be empty.
      prefixColonPos = XMLString::indexOf(qName, chColon);
      if (prefixColonPos == -1)
      {
          //  Its all name with no prefix, so put the whole thing into the name
          //  buffer. Then map the empty string to a URI, since the empty string
          //  represents the default namespace. This will either return some
          //  explicit URI which the default namespace is mapped to, or the
          //  the default global namespace.
          bool unknown = false;
  
          prefixBuf.reset();
          return fElemStack.mapPrefixToURI(XMLUni::fgZeroLenString, 
(WFElemStack::MapModes) mode, unknown);
      }
      else
      {
          //  Copy the chars up to but not including the colon into the prefix
          //  buffer.
          prefixBuf.set(qName, prefixColonPos);
  
          //  Watch for the special namespace prefixes. We always map these to
          //  special URIs. 'xml' gets mapped to the official URI that its defined
          //  to map to by the NS spec. xmlns gets mapped to a special place holder
          //  URI that we define (so that it maps to something checkable.)
          const XMLCh* prefixRawBuf = prefixBuf.getRawBuffer();
          if (XMLString::equals(prefixRawBuf, XMLUni::fgXMLNSString)) {
  
              // if this is an element, it is an error to have xmlns as prefix
              if (mode == WFElemStack::Mode_Element)
                  emitError(XMLErrs::NoXMLNSAsElementPrefix, qName);
  
              return fXMLNSNamespaceId;
          }
          else if (XMLString::equals(prefixRawBuf, XMLUni::fgXMLString)) {
              return fXMLNamespaceId;
          }
          else
          {
              bool unknown = false;
              unsigned int uriId = fElemStack.mapPrefixToURI(prefixRawBuf, 
(WFElemStack::MapModes)mode, unknown);
  
              if (unknown)
                  emitError(XMLErrs::UnknownPrefix, prefixBuf.getRawBuffer());
  
              return uriId;
          }
      }
  }
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Private parsing methods
  // ---------------------------------------------------------------------------
  //  This method scans a quoted attribute value into the passed buffer. The
  //  opening quote is consumed here; the closing quote only counts when it
  //  is seen in the same reader that supplied the opening one.
  //
  //  attrName is only used when reporting errors. toFill is reset and then
  //  receives the value, with references expanded via scanEntityRef() and
  //  each unescaped whitespace character replaced by a single space.
  //
  //  Returns true when the matching closing quote is found. Returns false
  //  if the next character is not a quote, or if the closing quote spilled
  //  over into a previous entity. An UnexpectedEOFException is thrown if
  //  the input ends before the closing quote.
  bool WFXMLScanner::scanAttValue(const XMLCh* const attrName
                                ,     XMLBuffer&   toFill)
  {
      // Reset the target buffer
      toFill.reset();

      // Get the next char which must be a single or double quote
      XMLCh quoteCh;
      if (!fReaderMgr.skipIfQuote(quoteCh))
          return false;

      //  We have to get the current reader because we have to ignore closing
      //  quotes until we hit the same reader again.
      const unsigned int curReader = fReaderMgr.getCurrentReaderNum();

      //  Loop until we get the attribute value. Note that we use a double
      //  loop here to avoid the setup/teardown overhead of the exception
      //  handler on every round.
      XMLCh   nextCh;
      XMLCh   secondCh = 0;
      bool    gotLeadingSurrogate = false;
      bool    escaped = false;
      while (true)
      {
      try
      {
          while(true)
          {
              // Get another char. Use second char if one is waiting
              if (secondCh)
              {
                  nextCh = secondCh;
                  secondCh = 0;
              }
              else
              {
                  nextCh = fReaderMgr.getNextChar();
              }

              if (!nextCh)
                  ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

              // Check for our ending quote in the same entity
              if (nextCh == quoteCh)
              {
                  if (curReader == fReaderMgr.getCurrentReaderNum())
                      return true;

                  // Watch for spillover into a previous entity
                  if (curReader > fReaderMgr.getCurrentReaderNum())
                  {
                      emitError(XMLErrs::PartialMarkupInEntity);
                      return false;
                  }
              }

              //  Check for an entity ref now, before we let it affect our
              //  whitespace normalization logic below. We ignore the empty flag
              //  in this one.
              escaped = false;
              if (nextCh == chAmpersand)
              {
                  if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
                  {
                      gotLeadingSurrogate = false;
                      continue;
                  }
              }

              //  Deal with surrogate pairs. This is done for every character,
              //  not just for those that fail the XML character test, so that
              //  a leading surrogate followed by an ordinary character is
              //  reported and the pending flag is cleared.
              if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
              {
                  //  Its a leading surrogate. If we already got one, then
                  //  issue an error, else set leading flag to make sure that
                  //  we look for a trailing next time.
                  if (gotLeadingSurrogate)
                      emitError(XMLErrs::Expected2ndSurrogateChar);
                  else
                      gotLeadingSurrogate = true;
              }
              else
              {
                  //  If its a trailing surrogate, make sure that we are
                  //  prepared for that.
                  if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                  {
                      // Its trailing, so make sure we were expecting it
                      if (!gotLeadingSurrogate)
                          emitError(XMLErrs::Unexpected2ndSurrogateChar);
                  }
                  else
                  {
                      // Its just a char, so make sure we were not
                      // expecting a trailing surrogate.
                      if (gotLeadingSurrogate)
                          emitError(XMLErrs::Expected2ndSurrogateChar);

                      // Its got to at least be a valid XML character
                      if (!XMLReader::isXMLChar(nextCh))
                      {
                          XMLCh tmpBuf[9];
                          XMLString::binToText
                          (
                              nextCh
                              , tmpBuf
                              , 8
                              , 16
                          );
                          emitError
                          (
                              XMLErrs::InvalidCharacterInAttrValue
                              , attrName
                              , tmpBuf
                          );
                      }
                  }

                  gotLeadingSurrogate = false;
              }

              //  If its not escaped, then make sure its not a < character, which
              //  is not allowed in attribute values. Unescaped whitespace is
              //  normalized to a plain space.
              if (!escaped) {
                  if (nextCh == chOpenAngle)
                      emitError(XMLErrs::BracketInAttrValue, attrName);
                  else if (XMLReader::isWhitespace(nextCh))
                      nextCh = chSpace;
              }

              // Else add it to the buffer
              toFill.append(nextCh);
          }
      }
      catch(const EndOfEntityException&)
      {
          // Just eat it and continue.
          gotLeadingSurrogate = false;
          escaped = false;
      }
      }

      // Not reached: the loop above only exits via return or an exception
      return true;
  }
  
  
  //  This method scans a CDATA section. It collects the characters into one
  //  of the temp buffers and calls the document handler, if any, with the
  //  characters. It assumes that the <![CDATA string has been scanned before
  //  this call, so the next character must be the second '['.
  //
  //  Surrogate pairs are checked for correct pairing here; the halves of a
  //  pair are not run through the XML character test, which would otherwise
  //  report them as invalid characters. Only the first invalid character is
  //  reported; validation is skipped after that.
  //
  //  An UnexpectedEOFException is thrown if the input ends before the
  //  closing ]]> sequence is seen.
  void WFXMLScanner::scanCDSection()
  {
      //  The tail of the closing sequence; the first ']' is read in the loop.
      static const XMLCh CDataClose[] =
      {
              chCloseSquare, chCloseAngle, chNull
      };

      //  The next character should be the opening square bracket. If not
      //  issue an error, but then try to recover by skipping any whitespace
      //  and checking again.
      if (!fReaderMgr.skippedChar(chOpenSquare))
      {
          emitError(XMLErrs::ExpectedOpenSquareBracket);
          fReaderMgr.skipPastSpaces();

          // If we still don't find it, then give up, else keep going
          if (!fReaderMgr.skippedChar(chOpenSquare))
              return;
      }

      // Get a buffer for this
      XMLBufBid bbCData(&fBufMgr);

      //  We just scan forward until we hit the end of CDATA section sequence.
      //  CDATA is effectively a big escape mechanism so we don't treat markup
      //  characters specially here.
      bool            emittedError = false;
      bool            gotLeadingSurrogate = false;
      while (true)
      {
          const XMLCh nextCh = fReaderMgr.getNextChar();

          // Watch for unexpected end of file
          if (!nextCh)
          {
              emitError(XMLErrs::UnterminatedCDATASection);
              ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
          }

          //  If this is a close square bracket it could be our closing
          //  sequence.
          if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
          {
              // Make sure we were not still expecting a trailing surrogate
              if (gotLeadingSurrogate)
                  emitError(XMLErrs::Expected2ndSurrogateChar);

              // If we have a doc handler, call it
              if (fDocHandler)
              {
                  fDocHandler->docCharacters
                  (
                      bbCData.getRawBuffer()
                      , bbCData.getLen()
                      , true
                  );
              }

              // And we are done
              break;
          }

          //  Make sure its a valid character. But if we've emitted an error
          //  already, don't bother with the overhead since we've already told
          //  them about it.
          if (!emittedError)
          {
              // Deal with surrogate pairs
              if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
              {
                  //  Its a leading surrogate. If we already got one, then
                  //  issue an error, else set leading flag to make sure that
                  //  we look for a trailing next time.
                  if (gotLeadingSurrogate)
                      emitError(XMLErrs::Expected2ndSurrogateChar);
                  else
                      gotLeadingSurrogate = true;
              }
              else
              {
                  //  If its a trailing surrogate, make sure that we are
                  //  prepared for that.
                  if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                  {
                      // Its trailing, so make sure we were expecting it
                      if (!gotLeadingSurrogate)
                          emitError(XMLErrs::Unexpected2ndSurrogateChar);
                  }
                  else
                  {
                      // Its just a char, so make sure we were not
                      // expecting a trailing surrogate.
                      if (gotLeadingSurrogate)
                          emitError(XMLErrs::Expected2ndSurrogateChar);

                      // Its got to at least be a valid XML character
                      if (!XMLReader::isXMLChar(nextCh))
                      {
                          XMLCh tmpBuf[9];
                          XMLString::binToText
                          (
                              nextCh
                              , tmpBuf
                              , 8
                              , 16
                          );
                          emitError(XMLErrs::InvalidCharacter, tmpBuf);
                          emittedError = true;
                      }
                  }

                  gotLeadingSurrogate = false;
              }
          }

          // Add it to the buffer
          bbCData.append(nextCh);
      }
  }
  
  
  //  This method scans a run of character data into the passed buffer and
  //  hands it on through sendCharData(). The buffer is reset first. Scanning
  //  stops when getNextCharIfNot(chOpenAngle, ...) does not deliver another
  //  character.
  //
  //  Along the way it expands references via scanEntityRef(), reports the
  //  illegal ]]> sequence, checks surrogate pairing and character validity,
  //  and flushes the buffer and calls endEntityReference() on the document
  //  handler, if any, whenever an EndOfEntityException is caught.
  void WFXMLScanner::scanCharData(XMLBuffer& toUse)
  {
      //  We have to watch for the stupid ]]> sequence, which is illegal in
      //  character data. So this is a little state machine that handles that.
      enum States
      {
          State_Waiting
          , State_GotOne
          , State_GotTwo
      };

      // Reset the buffer before we start
      toUse.reset();

      // Turn on the 'throw at end' flag of the reader manager
      ThrowEOEJanitor jan(&fReaderMgr, true);

      //  In order to be more efficient we have to use kind of a deeply nested
      //  set of blocks here. The outer block puts on a try and catches end of
      //  entity exceptions. The inner loop is the per-character loop. If we
      //  put the try inside the inner loop, it would work but would require
      //  the exception handling code setup/teardown code to be invoked for
      //  each character.
      XMLCh   nextCh;
      XMLCh   secondCh = 0;
      States  curState = State_Waiting;
      bool    escaped = false;
      bool    gotLeadingSurrogate = false;
      bool    notDone = true;
      while (notDone)
      {
          try
          {
              while (true)
              {
                  if (secondCh)
                  {
                      nextCh = secondCh;
                      secondCh = 0;
                  }
                  else
                  {
                      //  Eat through as many plain content characters as
                      //  possible without needing special handling. Moving
                      //  most content characters here, in this one call,
                      //  rather than running the overall loop once per
                      //  content character, is a speed optimization.
                      if (curState == State_Waiting  &&  !gotLeadingSurrogate)
                      {
                           fReaderMgr.movePlainContentChars(toUse);
                      }

                      // Try to get another char from the source
                      //   The code from here on down covers all contingencies.
                      if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
                      {
                          // If we were waiting for a trailing surrogate, its an error
                          if (gotLeadingSurrogate)
                              emitError(XMLErrs::Expected2ndSurrogateChar);

                          notDone = false;
                          break;
                      }
                  }

                  //  Watch for a reference. Note that the escapement mechanism
                  //  is ignored in this content.
                  if (nextCh == chAmpersand)
                  {
                      sendCharData(toUse);

                      // Turn off the throwing at the end of entity during this
                      ThrowEOEJanitor janNoThrow(&fReaderMgr, false);

                      if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
                      {
                          gotLeadingSurrogate = false;
                          continue;
                      }
                  }
                  else
                  {
                      escaped = false;
                  }

                   // Keep the state machine up to date
                  if (!escaped)
                  {
                      if (nextCh == chCloseSquare)
                      {
                          if (curState == State_Waiting)
                              curState = State_GotOne;
                          else if (curState == State_GotOne)
                              curState = State_GotTwo;
                      }
                      else if (nextCh == chCloseAngle)
                      {
                          if (curState == State_GotTwo)
                              emitError(XMLErrs::BadSequenceInCharData);
                          curState = State_Waiting;
                      }
                      else
                      {
                          curState = State_Waiting;
                      }
                  }
                  else
                  {
                      curState = State_Waiting;
                  }

                  //  Deal with surrogate pairs. This is done for every
                  //  character, not just for those that fail the XML character
                  //  test, so that a leading surrogate followed by an ordinary
                  //  character is reported and the pending flag is cleared.
                  if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
                  {
                      //  Its a leading surrogate. If we already got one,
                      //  then issue an error, else set leading flag to make
                      //  sure that we look for a trailing next time.
                      if (gotLeadingSurrogate)
                          emitError(XMLErrs::Expected2ndSurrogateChar);
                      else
                          gotLeadingSurrogate = true;
                  }
                  else
                  {
                      //  If its a trailing surrogate, make sure that we are
                      //  prepared for that.
                      if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                      {
                          // Its trailing, so make sure we were expecting it
                          if (!gotLeadingSurrogate)
                              emitError(XMLErrs::Unexpected2ndSurrogateChar);
                      }
                      else
                      {
                          // Its just a char, so make sure we were not
                          // expecting a trailing surrogate.
                          if (gotLeadingSurrogate)
                              emitError(XMLErrs::Expected2ndSurrogateChar);

                          // Its got to at least be a valid XML character
                          if (!XMLReader::isXMLChar(nextCh))
                          {
                              XMLCh tmpBuf[9];
                              XMLString::binToText
                              (
                                  nextCh
                                  , tmpBuf
                                  , 8
                                  , 16
                              );
                              emitError(XMLErrs::InvalidCharacter, tmpBuf);
                          }
                      }

                      gotLeadingSurrogate = false;
                  }

                  // Add this char to the buffer
                  toUse.append(nextCh);
              }
          }
          catch(const EndOfEntityException& toCatch)
          {
              //  Some entity ended, so we have to send any accumulated
              //  chars and send an end of entity event.
              sendCharData(toUse);
              gotLeadingSurrogate = false;

              if (fDocHandler)
                  fDocHandler->endEntityReference(toCatch.getEntity());
          }
      }

      // Send any char data that we accumulated into the buffer
      sendCharData(toUse);
  }
  
  
  //  Scans a character reference or a general entity reference. The callers
  //  in this file invoke it right after reading the '&' character.
  //
  //  A character reference is expanded through scanCharRef(). A named
  //  reference is looked up in the entity table and its character value is
  //  handed back in firstCh.
  //
  //  The result is EntityExp_Returned when firstCh (and possibly secondCh)
  //  hold the expansion, and EntityExp_Failed otherwise. On entry secondCh
  //  is zeroed and escaped is cleared; escaped is set to true whenever a
  //  value is returned. The inAttVal flag is not consulted here.
  XMLScanner::EntityExpRes
  WFXMLScanner::scanEntityRef(const bool    inAttVal
                              ,     XMLCh&  firstCh
                              ,     XMLCh&  secondCh
                              ,     bool&   escaped)
  {
      // Start out with no second char and no escapement
      secondCh = 0;
      escaped = false;

      //  Remember which reader we started in, so that we can tell whether
      //  the whole reference came out of a single entity.
      const unsigned int startReaderNum = fReaderMgr.getCurrentReaderNum();

      // A pound sign means a character reference, which is always expanded
      if (fReaderMgr.skippedChar(chPound))
      {
          if (scanCharRef(firstCh, secondCh))
          {
              escaped = true;

              if (fReaderMgr.getCurrentReaderNum() != startReaderNum)
                  emitError(XMLErrs::PartialMarkupInEntity);

              return EntityExp_Returned;
          }

          return EntityExp_Failed;
      }

      // Otherwise it is a named reference, so get the name
      XMLBufBid nameBid(&fBufMgr);
      if (!fReaderMgr.getName(nameBid.getBuffer()))
      {
          emitError(XMLErrs::ExpectedEntityRefName);
          return EntityExp_Failed;
      }

      const XMLCh* const entityName = nameBid.getRawBuffer();

      //  A semi-colon has to follow the name. If it is missing, complain
      //  but carry on anyway.
      if (!fReaderMgr.skippedChar(chSemiColon))
          emitError(XMLErrs::UnterminatedEntityRef, entityName);

      // The reference must end in the reader that held the & char
      if (fReaderMgr.getCurrentReaderNum() != startReaderNum)
          emitError(XMLErrs::PartialMarkupInEntity);

      // If the name is in the general entity pool, hand back its value
      if (fEntityTable->containsKey(entityName))
      {
          firstCh = fEntityTable->get(entityName);
          escaped = true;
          return EntityExp_Returned;
      }

      //  XML 1.0 Section 4.1
      //  Well-formedness Constraint for entity not found:
      //    In a document without any DTD, a document with only an internal
      //    DTD subset which contains no parameter entity references, or a
      //    document with "standalone='yes'", for an entity reference that
      //    does not occur within the external subset or a parameter entity
      if (fStandalone || fHasNoDTD)
          emitError(XMLErrs::EntityNotFound, entityName);

      return EntityExp_Failed;
  }
  
  // ---------------------------------------------------------------------------
  //  WFXMLScanner: Grammar preparsing
  // ---------------------------------------------------------------------------
  //  Grammar preparsing is a no-op in this scanner: all three arguments are
  //  ignored and a null grammar pointer is always returned.
  Grammar* WFXMLScanner::loadGrammar(const   InputSource&
                                     , const short
                                     , const bool)
  {
      // REVISIT: emit a warning or throw an exception
      Grammar* const noGrammar = 0;
      return noGrammar;
  }
  
  
  XERCES_CPP_NAMESPACE_END


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/c/src/xercesc/internal WFXMLScanner.hpp WFXMLScanner.cpp

Reply via email to