internal XMLReader.cpp XMLReader.hpp

cargilld Tue, 22 Mar 2005 13:11:31 -0800

cargilld    2005/03/22 12:43:27

  Modified:    c/src/xercesc/internal XMLReader.cpp XMLReader.hpp
  Log:
  Check in Christian's patches for Xercesc-1369 and 1370.
  
  Revision  Changes    Path
  1.29      +188 -8    xml-xerces/c/src/xercesc/internal/XMLReader.cpp
  
  Index: XMLReader.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/internal/XMLReader.cpp,v
  retrieving revision 1.28
  retrieving revision 1.29
  diff -u -r1.28 -r1.29
  --- XMLReader.cpp     20 Mar 2005 19:02:45 -0000      1.28
  +++ XMLReader.cpp     22 Mar 2005 20:43:27 -0000      1.29
  @@ -40,7 +40,7 @@
   //  not. Breaks out on the first non-whitespace.
   //
   bool XMLReader::isAllSpaces(const   XMLCh* const    toCheck
  -                            , const unsigned int    count)
  +                            , const unsigned int    count) const
   {
       const XMLCh* curCh = toCheck;
       const XMLCh* endPtr = toCheck + count;
  @@ -58,7 +58,7 @@
   //  not.
   //
   bool XMLReader::containsWhiteSpace(const   XMLCh* const    toCheck
  -                            , const unsigned int    count)
  +                            , const unsigned int    count) const
   {
       const XMLCh* curCh = toCheck;
       const XMLCh* endPtr = toCheck + count;
  @@ -73,7 +73,7 @@
   //
   //  This one is not called terribly often, so call the XMLChar utility
   //
  -bool XMLReader::isPublicIdChar(const XMLCh toCheck)
  +bool XMLReader::isPublicIdChar(const XMLCh toCheck) const
   {
       if (fXMLVersion == XMLV1_1)
           return XMLChar1_1::isPublicIdChar(toCheck);
  @@ -704,7 +704,28 @@
                   // Eat this char
                   fCharIndex++;
   
  -                handleEOL(curCh, false);
  +                //
  +                //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
  +                //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
  +                //
  +                //  100000 x20
  +                //  001001 x9
  +                //  001010 chLF
  +                //  001101 chCR
  +                //  -----------
  +                //  000110 == (chCR|chLF) & ~(0x9|0x20)
  +                //
  +                //  if the result of the logical-& operation is
  +                //  true  : 'curCh' must be xA  or xD
  +                //  false : 'curCh' must be x20 or x9
  +                //
  +                if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
  +                {
  +                    fCurCol++;
  +                } else
  +                {
  +                    handleEOL(curCh, false);
  +                }
   
                   // Ok we can add this guy to our buffer
                   toFill.append(curCh);
  @@ -747,7 +768,27 @@
                   // Eat this char
                   fCharIndex++;
   
  -                handleEOL(curCh, false);
  +                //
  +                //  'curCh' is not a whitespace(x20|x9|xD|xA), so we only can
  +                //  have end-of-line combinations with a leading chNEL(x85) or
  +                //  chLineSeparator(x2028)
  +                //
  +                //  0010000000101000 chLineSeparator
  +                //  0000000010000101 chNEL
  +                //  ---------------------
  +                //  1101111101010010 == ~(chNEL|chLineSeparator)
  +                //
  +                //  if the result of the logical-& operation is
  +                //  true  : 'curCh' can not be chNEL or chLineSeparator
  +                //  false : 'curCh' can be chNEL or chLineSeparator
  +                //
  +                if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) )
  +                {
  +                    fCurCol++;
  +                } else
  +                {
  +                    handleEOL(curCh, false);
  +                }
   
                   // Add it to our buffer
                   toFill.append(curCh);
  @@ -813,7 +854,28 @@
                   // Get the current char out of the buffer and eat it
                   XMLCh curCh = fCharBuf[fCharIndex++];
   
  -                handleEOL(curCh, inDecl);
  +                //
  +                //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
  +                //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
  +                //
  +                //  100000 x20
  +                //  001001 x9
  +                //  001010 chLF
  +                //  001101 chCR
  +                //  -----------
  +                //  000110 == (chCR|chLF) & ~(0x9|0x20)
  +                //
  +                //  if the result of the logical-& operation is
  +                //  true  : 'curCh' must be xA  or xD
  +                //  false : 'curCh' must be x20 or x9
  +                //
  +                if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
  +                {
  +                    fCurCol++;
  +                } else
  +                {
  +                    handleEOL(curCh, inDecl);
  +                }
   
               }
               else
  @@ -883,7 +945,28 @@
           // Eat the character
           fCharIndex++;
   
  -        handleEOL((XMLCh&)curCh, false);
  +        //
  +        //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
  +        //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
  +        //
  +        //  100000 x20
  +        //  001001 x9
  +        //  001010 chLF
  +        //  001101 chCR
  +        //  -----------
  +        //  000110 == (chCR|chLF) & ~(0x9|0x20)
  +        //
  +        //  if the result of the logical-& operation is
  +        //  true  : 'curCh' must be xA  or xD
  +        //  false : 'curCh' must be x20 or x9
  +        //
  +        if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
  +        {
  +            fCurCol++;
  +        } else
  +        {
  +            handleEOL((XMLCh&)curCh, false);
  +        }
   
           return true;
       }
  @@ -1522,4 +1605,101 @@
       return charsDone;
   }
   
  +/***
  + *
  + * XML1.1
  + *
  + * 2.11 End-of-Line Handling
  + *
  + *    XML parsed entities are often stored in computer files which, for editing
  + *    convenience, are organized into lines. These lines are typically separated
  + *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
  + *
  + *    To simplify the tasks of applications, the XML processor MUST behave as if
  + *    it normalized all line breaks in external parsed entities (including the document
  + *    entity) on input, before parsing, by translating all of the following to a single
  + *    #xA character:
  + *
  + *  1. the two-character sequence #xD #xA
  + *  2. the two-character sequence #xD #x85
  + *  3. the single character #x85
  + *  4. the single character #x2028
  + *  5. any #xD character that is not immediately followed by #xA or #x85.
  + *
  + *
  + ***/
  +void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
  +{
  +    // 1. the two-character sequence #xD #xA
  +    // 2. the two-character sequence #xD #x85
  +    // 5. any #xD character that is not immediately followed by #xA or #x85.
  +    if (curCh == chCR)
  +    {
  +        fCurCol = 1;
  +        fCurLine++;
  +
  +        //
  +        //  If not already internalized, then convert it to an
  +        //  LF and eat any following LF.
  +        //
  +        if (fSource == Source_External)
  +        {
  +            if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
  +            {
  +                if ( fCharBuf[fCharIndex] == chLF              || 
  +                    ((fCharBuf[fCharIndex] == chNEL) && fNEL)  )
  +                {
  +                    fCharIndex++;
  +                }
  +            }
  +            curCh = chLF;
  +        }
  +    }
  +    else if (curCh == chLF)                   
  +    {
  +        fCurCol = 1;
  +        fCurLine++;
  +    }
  +    // 3. the single character #x85
  +    // 4. the single character #x2028
  +    else if (curCh == chNEL || curCh == chLineSeparator)
  +    {
  +        if (inDecl && fXMLVersion == XMLV1_1)
  +        {
  +
  +        /***
  +         * XML1.1
  +         *
  +         * 2.11 End-of-Line Handling
  +         *  ...
  +         *   The characters #x85 and #x2028 cannot be reliably recognized and translated
  +         *   until an entity's encoding declaration (if present) has been read.
  +         *   Therefore, it is a fatal error to use them within the XML declaration or
  +         *   text declaration. 
  +         *
  +         ***/
  +            ThrowXMLwithMemMgr1
  +                (
  +                TranscodingException
  +                , XMLExcepts::Reader_NelLsepinDecl
  +                , fSystemId
  +                , fMemoryManager
  +                );
  +        }
  +
  +        if (fNEL && fSource == Source_External)
  +        {
  +            fCurCol = 1;
  +            fCurLine++;
  +            curCh = chLF;
  +        }
  +    }
  +    else
  +    {
  +        fCurCol++;
  +    }
  +
  +    return;
  +}
  +
   XERCES_CPP_NAMESPACE_END
  
  
  
  1.21      +67 -119   xml-xerces/c/src/xercesc/internal/XMLReader.hpp
  
  Index: XMLReader.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/internal/XMLReader.hpp,v
  retrieving revision 1.20
  retrieving revision 1.21
  diff -u -r1.20 -r1.21
  --- XMLReader.hpp     29 Sep 2004 00:24:01 -0000      1.20
  +++ XMLReader.hpp     22 Mar 2005 20:43:27 -0000      1.21
  @@ -16,6 +16,9 @@
   
   /*
    * $Log$
  + * Revision 1.21  2005/03/22 20:43:27  cargilld
  + * Check in Christian's patches for Xercesc-1369 and 1370.
  + *
    * Revision 1.20  2004/09/29 00:24:01  knoaman
    * Performance: improve src offset calculation. Patch by Anthony O'Dowd.
    *
  @@ -219,24 +222,24 @@
       (
           const   XMLCh* const    toCheck
           , const unsigned int    count
  -    );
  +    ) const;
   
       bool containsWhiteSpace
       (
           const   XMLCh* const    toCheck
           , const unsigned int    count
  -    );
  +    ) const;
   
   
  -    bool isXMLLetter(const XMLCh toCheck);
  -    bool isFirstNameChar(const XMLCh toCheck);
  -    bool isNameChar(const XMLCh toCheck);
  -    bool isPlainContentChar(const XMLCh toCheck);
  -    bool isSpecialStartTagChar(const XMLCh toCheck);
  -    bool isXMLChar(const XMLCh toCheck);
  -    bool isWhitespace(const XMLCh toCheck);
  -    bool isControlChar(const XMLCh toCheck);
  -    bool isPublicIdChar(const XMLCh toCheck);
  +    bool isXMLLetter(const XMLCh toCheck) const;
  +    bool isFirstNameChar(const XMLCh toCheck) const;
  +    bool isNameChar(const XMLCh toCheck) const;
  +    bool isPlainContentChar(const XMLCh toCheck) const;
  +    bool isSpecialStartTagChar(const XMLCh toCheck) const;
  +    bool isXMLChar(const XMLCh toCheck) const;
  +    bool isWhitespace(const XMLCh toCheck) const;
  +    bool isControlChar(const XMLCh toCheck) const;
  +    bool isPublicIdChar(const XMLCh toCheck) const;
   
       // -----------------------------------------------------------------------
       //  Constructors and Destructor
  @@ -400,7 +403,7 @@
           , const unsigned int            maxChars
       );
   
  -    inline void handleEOL
  +    void handleEOL
       (
                 XMLCh&   curCh
               , bool     inDecl = false
  @@ -590,43 +593,43 @@
   // ---------------------------------------------------------------------------
   //  XMLReader: Public, query methods
   // ---------------------------------------------------------------------------
  -inline bool XMLReader::isNameChar(const XMLCh toCheck)
  +inline bool XMLReader::isNameChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gNameCharMask) != 0);
   }
   
  -inline bool XMLReader::isPlainContentChar(const XMLCh toCheck)
  +inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gPlainContentCharMask) != 0);
   }
   
   
  -inline bool XMLReader::isFirstNameChar(const XMLCh toCheck)
  +inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0);
   }
   
  -inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck)
  +inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) != 0);
   }
   
  -inline bool XMLReader::isXMLChar(const XMLCh toCheck)
  +inline bool XMLReader::isXMLChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gXMLCharMask) != 0);
   }
   
  -inline bool XMLReader::isXMLLetter(const XMLCh toCheck)
  +inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gLetterCharMask) != 0);
   }
   
  -inline bool XMLReader::isWhitespace(const XMLCh toCheck)
  +inline bool XMLReader::isWhitespace(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gWhitespaceCharMask) != 0);
   }
   
  -inline bool XMLReader::isControlChar(const XMLCh toCheck)
  +inline bool XMLReader::isControlChar(const XMLCh toCheck) const
   {
       return ((fgCharCharsTable[toCheck] & gControlCharMask) != 0);
   }
  @@ -784,7 +787,28 @@
       chGotten = fCharBuf[fCharIndex++];
   
       // Handle end of line normalization and line/col member maintenance.
  -    handleEOL(chGotten, false);
  +    //
  +    // we can have end-of-line combinations with a leading
  +    // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028)
  +    //
  +    // 0000000000001101 chCR
  +    // 0000000000001010 chLF
  +    // 0000000010000101 chNEL
  +    // 0010000000101000 chLineSeparator
  +    // -----------------------
  +    // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator)
  +    //
  +    // if the result of the logical-& operation is
  +    // true  : 'curCh' can not be chCR, chLF, chNEL or chLineSeparator
  +    // false : 'curCh' can be chCR, chLF, chNEL or chLineSeparator
  +    //
  +    if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) )
  +    {
  +        fCurCol++;
  +    } else
  +    {
  +        handleEOL(chGotten, false);
  +    }
   
       return true;
   }
  @@ -812,7 +836,28 @@
       chGotten = fCharBuf[fCharIndex++];
   
       // Handle end of line normalization and line/col member maintenance.
  -    handleEOL(chGotten, false);
  +    //
  +    // we can have end-of-line combinations with a leading
  +    // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028)
  +    //
  +    // 0000000000001101 chCR
  +    // 0000000000001010 chLF
  +    // 0000000010000101 chNEL
  +    // 0010000000101000 chLineSeparator
  +    // -----------------------
  +    // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator)
  +    //
  +    // if the result of the logical-& operation is
  +    // true  : 'curCh' can not be chCR, chLF, chNEL or chLineSeparator
  +    // false : 'curCh' can be chCR, chLF, chNEL or chLineSeparator
  +    //
  +    if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) )
  +    {
  +        fCurCol++;
  +    } else
  +    {
  +        handleEOL(chGotten, false);
  +    }
   
       return true;
   }
  @@ -851,103 +896,6 @@
       return true;
   }
   
  -/***
  - *
  - * XML1.1
  - *
  - * 2.11 End-of-Line Handling
  - *
  - *    XML parsed entities are often stored in computer files which, for editing
  - *    convenience, are organized into lines. These lines are typically separated
  - *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
  - *
  - *    To simplify the tasks of applications, the XML processor MUST behave as if
  - *    it normalized all line breaks in external parsed entities (including the document
  - *    entity) on input, before parsing, by translating all of the following to a single
  - *    #xA character:
  - *
  - *  1. the two-character sequence #xD #xA
  - *  2. the two-character sequence #xD #x85
  - *  3. the single character #x85
  - *  4. the single character #x2028
  - *  5. any #xD character that is not immediately followed by #xA or #x85.
  - *
  - *
  - ***/
  -inline void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
  -{
  -    // 1. the two-character sequence #xD #xA
  -    // 2. the two-character sequence #xD #x85
  -    // 5. any #xD character that is not immediately followed by #xA or #x85.
  -    if (curCh == chCR)
  -    {
  -        fCurCol = 1;
  -        fCurLine++;
  -
  -        //
  -        //  If not already internalized, then convert it to an
  -        //  LF and eat any following LF.
  -        //
  -        if (fSource == Source_External)
  -        {
  -            if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
  -            {
  -                if ( fCharBuf[fCharIndex] == chLF              || 
  -                    ((fCharBuf[fCharIndex] == chNEL) && fNEL)  )
  -                {
  -                    fCharIndex++;
  -                }
  -            }
  -            curCh = chLF;
  -        }
  -    }
  -    else if (curCh == chLF)                   
  -    {
  -        fCurCol = 1;
  -        fCurLine++;
  -    }
  -    // 3. the single character #x85
  -    // 4. the single character #x2028
  -    else if (curCh == chNEL || curCh == chLineSeparator)
  -    {
  -        if (inDecl && fXMLVersion == XMLV1_1)
  -        {
  -
  -        /***
  -         * XML1.1
  -         *
  -         * 2.11 End-of-Line Handling
  -         *  ...
  -         *   The characters #x85 and #x2028 cannot be reliably recognized and translated
  -         *   until an entity's encoding declaration (if present) has been read.
  -         *   Therefore, it is a fatal error to use them within the XML declaration or
  -         *   text declaration. 
  -         *
  -         ***/
  -            ThrowXMLwithMemMgr1
  -                (
  -                TranscodingException
  -                , XMLExcepts::Reader_NelLsepinDecl
  -                , fSystemId
  -                , fMemoryManager
  -                );
  -        }
  -
  -        if (fNEL && fSource == Source_External)
  -        {
  -            fCurCol = 1;
  -            fCurLine++;
  -            curCh = chLF;
  -        }
  -    }
  -    else
  -    {
  -        fCurCol++;
  -    }
  -
  -    return;
  -}
  -
   XERCES_CPP_NAMESPACE_END
   
   #endif


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/c/src/xercesc/internal XMLReader.cpp XMLReader.hpp

Reply via email to