Author: amassari
Date: Thu Jun 26 05:19:31 2008
New Revision: 671870

URL: http://svn.apache.org/viewvc?rev=671870&view=rev
Log:
Define regular expression category \w as defined by the XMLSchema specs 
(XERCESC-541)

Modified:
    xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp
    xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp
    xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp
    xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp
    xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp

Modified: xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp
URL: http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp Thu Jun 26 05:19:31 2008
@@ -34,54 +34,46 @@
 // ---------------------------------------------------------------------------
 //  Local data
 // ---------------------------------------------------------------------------
-const int            UNICATEGSIZE     = 37;
-const unsigned short CHAR_LETTER      = 30;
-const unsigned short CHAR_MARK        = 31;
-const unsigned short CHAR_NUMBER      = 32;
-const unsigned short CHAR_SEPARATOR   = 33;
-const unsigned short CHAR_OTHER       = 34;
-const unsigned short CHAR_PUNCTUATION = 35;
-const unsigned short CHAR_SYMBOL      = 36;
 
 const XMLCh uniCategNames[][3] =
 {
-    {chLatin_C, chLatin_n, chNull},
-    {chLatin_L, chLatin_u, chNull},
-    {chLatin_L, chLatin_l, chNull},
-    {chLatin_L, chLatin_t, chNull},
-    {chLatin_L, chLatin_m, chNull},
-    {chLatin_L, chLatin_o, chNull},
-    {chLatin_M, chLatin_n, chNull},
-    {chLatin_M, chLatin_e, chNull},
-    {chLatin_M, chLatin_c, chNull},
-    {chLatin_N, chLatin_d, chNull},
-    {chLatin_N, chLatin_l, chNull},
-    {chLatin_N, chLatin_o, chNull},
-    {chLatin_Z, chLatin_s, chNull},
-    {chLatin_Z, chLatin_l, chNull},
-    {chLatin_Z, chLatin_p, chNull},
-    {chLatin_C, chLatin_c, chNull},
-    {chLatin_C, chLatin_f, chNull},
-    {chLatin_C, chLatin_o, chNull},
-    {chLatin_C, chLatin_s, chNull},
-    {chLatin_P, chLatin_d, chNull},
-    {chLatin_P, chLatin_s, chNull},
-    {chLatin_P, chLatin_e, chNull},
-    {chLatin_P, chLatin_c, chNull},
-    {chLatin_P, chLatin_o, chNull},
-    {chLatin_S, chLatin_m, chNull},
-    {chLatin_S, chLatin_c, chNull},
-    {chLatin_S, chLatin_k, chNull},
-    {chLatin_S, chLatin_o, chNull},
-    {chLatin_P, chLatin_i, chNull},
-    {chLatin_P, chLatin_f, chNull},
-    {chLatin_L, chNull},
-    {chLatin_M, chNull},
-    {chLatin_N, chNull},
-    {chLatin_Z, chNull},
-    {chLatin_C, chNull},
-    {chLatin_P, chNull},
-    {chLatin_S, chNull},
+    {chLatin_C, chLatin_n, chNull},     // UNASSIGNED
+    {chLatin_L, chLatin_u, chNull},     // UPPERCASE_LETTER
+    {chLatin_L, chLatin_l, chNull},     // LOWERCASE_LETTER
+    {chLatin_L, chLatin_t, chNull},     // TITLECASE_LETTER
+    {chLatin_L, chLatin_m, chNull},     // MODIFIER_LETTER
+    {chLatin_L, chLatin_o, chNull},     // OTHER_LETTER
+    {chLatin_M, chLatin_n, chNull},     // NON_SPACING_MARK
+    {chLatin_M, chLatin_e, chNull},     // ENCLOSING_MARK
+    {chLatin_M, chLatin_c, chNull},     // COMBINING_SPACING_MARK
+    {chLatin_N, chLatin_d, chNull},     // DECIMAL_DIGIT_NUMBER
+    {chLatin_N, chLatin_l, chNull},     // LETTER_NUMBER
+    {chLatin_N, chLatin_o, chNull},     // OTHER_NUMBER
+    {chLatin_Z, chLatin_s, chNull},     // SPACE_SEPARATOR
+    {chLatin_Z, chLatin_l, chNull},     // LINE_SEPARATOR
+    {chLatin_Z, chLatin_p, chNull},     // PARAGRAPH_SEPARATOR
+    {chLatin_C, chLatin_c, chNull},     // CONTROL
+    {chLatin_C, chLatin_f, chNull},     // FORMAT
+    {chLatin_C, chLatin_o, chNull},     // PRIVATE_USE
+    {chLatin_C, chLatin_s, chNull},     // SURROGATE
+    {chLatin_P, chLatin_d, chNull},     // DASH_PUNCTUATION
+    {chLatin_P, chLatin_s, chNull},     // START_PUNCTUATION
+    {chLatin_P, chLatin_e, chNull},     // END_PUNCTUATION
+    {chLatin_P, chLatin_c, chNull},     // CONNECTOR_PUNCTUATION
+    {chLatin_P, chLatin_o, chNull},     // OTHER_PUNCTUATION
+    {chLatin_S, chLatin_m, chNull},     // MATH_SYMBOL
+    {chLatin_S, chLatin_c, chNull},     // CURRENCY_SYMBOL
+    {chLatin_S, chLatin_k, chNull},     // MODIFIER_SYMBOL
+    {chLatin_S, chLatin_o, chNull},     // OTHER_SYMBOL
+    {chLatin_P, chLatin_i, chNull},     // INITIAL_PUNCTUATION
+    {chLatin_P, chLatin_f, chNull},     // FINAL_PUNCTUATION
+    {chLatin_L, chNull},                // CHAR_LETTER
+    {chLatin_M, chNull},                // CHAR_MARK
+    {chLatin_N, chNull},                // CHAR_NUMBER
+    {chLatin_Z, chNull},                // CHAR_SEPARATOR
+    {chLatin_C, chNull},                // CHAR_OTHER
+    {chLatin_P, chNull},                // CHAR_PUNCTUATION
+    {chLatin_S, chNull},                // CHAR_SYMBOL
 };
 
 // ---------------------------------------------------------------------------
@@ -108,7 +100,7 @@
     }
 
     TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
-       RangeToken* ranges[UNICATEGSIZE];
+    RangeToken* ranges[UNICATEGSIZE];
     RangeToken* tok;
 
     for (int i=0; i < UNICATEGSIZE; i++) {
@@ -174,7 +166,7 @@
     // Create assigned range
     tok = (RangeToken*)RangeToken::complementRanges(
                 ranges[XMLUniCharacter::UNASSIGNED],
-                       tokFactory,
+                tokFactory,
                 tokFactory->getMemoryManager());
     // build the internal map.
     tok->createMap();
@@ -217,7 +209,7 @@
     if (fKeywordsInitialized)
         return;
 
-       for (int k=0; k < UNICATEGSIZE; k++) {
+    for (int k=0; k < UNICATEGSIZE; k++) {
         rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory);
     }
 

Modified: xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp
URL: http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp Thu Jun 26 05:19:31 2008
@@ -26,6 +26,7 @@
 //  Includes
 // ---------------------------------------------------------------------------
 #include <xercesc/util/regx/RangeFactory.hpp>
+#include <xercesc/util/regx/XMLUniCharacter.hpp>
 
 XERCES_CPP_NAMESPACE_BEGIN
 
@@ -33,6 +34,21 @@
 
 public:
     // -----------------------------------------------------------------------
+    //  Public Constants
+    // -----------------------------------------------------------------------
+    // Unicode categories 
+    enum {
+        CHAR_LETTER      = XMLUniCharacter::FINAL_PUNCTUATION+1,
+        CHAR_MARK,
+        CHAR_NUMBER,
+        CHAR_SEPARATOR,
+        CHAR_OTHER,
+        CHAR_PUNCTUATION,
+        CHAR_SYMBOL,
+        UNICATEGSIZE
+    };
+
+    // -----------------------------------------------------------------------
     //  Constructors and operators
     // -----------------------------------------------------------------------
     UnicodeRangeFactory();
@@ -41,25 +57,26 @@
     // -----------------------------------------------------------------------
     //  Initialization methods
     // -----------------------------------------------------------------------
-       void initializeKeywordMap(RangeTokenMap *rangeTokMap = 0);
+    void initializeKeywordMap(RangeTokenMap *rangeTokMap = 0);
+
+    // -----------------------------------------------------------------------
+    //  Helper methods
+    // -----------------------------------------------------------------------
+    static unsigned short getUniCategory(const unsigned short type);
 
 protected:
     // -----------------------------------------------------------------------
     //  Private Helper methods
     // -----------------------------------------------------------------------
-       void buildRanges(RangeTokenMap *rangeTokMap = 0);
+    void buildRanges(RangeTokenMap *rangeTokMap = 0);
 
 private:
-       // -----------------------------------------------------------------------
+    // -----------------------------------------------------------------------
     //  Unimplemented constructors and operators
     // -----------------------------------------------------------------------
     UnicodeRangeFactory(const UnicodeRangeFactory&);
     UnicodeRangeFactory& operator=(const UnicodeRangeFactory&);
 
-    // -----------------------------------------------------------------------
-    //  Helper methods
-    // -----------------------------------------------------------------------
-    unsigned short getUniCategory(const unsigned short type);
 };
 
 XERCES_CPP_NAMESPACE_END
@@ -67,5 +84,5 @@
 #endif
 
 /**
-  *    End file UnicodeRangeFactory.hpp
+  *    End file UnicodeRangeFactory.hpp
   */

Modified: xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp
URL: http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp Thu Jun 26 05:19:31 2008
@@ -28,6 +28,7 @@
 #include <xercesc/util/regx/TokenFactory.hpp>
 #include <xercesc/util/regx/RangeToken.hpp>
 #include <xercesc/util/regx/RangeTokenMap.hpp>
+#include <xercesc/util/regx/UnicodeRangeFactory.hpp>
 #include <xercesc/util/Janitor.hpp>
 #include <string.h>
 
@@ -204,19 +205,27 @@
     rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok , true);
 
     // Create word range
+    // \w = [#x0000-#x10FFFF]-[\p{P}\p{Z}\p{C}] (all characters except the set of "punctuation", "separator" and "other" characters)
     tok = tokFactory->createRange();
-    tok->setRangeValues(wordRange, wordRangeLen);
-    janWordRange.orphan();
+    for(int i=0; i<=0xFFFF; i++)
+    {
+        unsigned short chType=UnicodeRangeFactory::getUniCategory(XMLUniCharacter::getType(i));
+        if(chType == UnicodeRangeFactory::CHAR_PUNCTUATION || 
+           chType == UnicodeRangeFactory::CHAR_SEPARATOR || 
+           chType == UnicodeRangeFactory::CHAR_OTHER)
+            tok->addRange(i, i);
+    }
     tok->sortRanges();
     tok->compactRanges();
     // Build the internal map.
     tok->createMap();
-    rangeTokMap->setRangeToken(fgXMLWord, tok);
+    rangeTokMap->setRangeToken(fgXMLWord, tok , true);
 
     tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
     // Build the internal map.
     tok->createMap();
-    rangeTokMap->setRangeToken(fgXMLWord, tok , true);
+    rangeTokMap->setRangeToken(fgXMLWord, tok);
+
 
     fRangesCreated = true;
 }

Modified: xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp
URL: http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp Thu Jun 26 05:19:31 2008
@@ -35,7 +35,7 @@
     // -----------------------------------------------------------------------
     //  Public Constants
     // -----------------------------------------------------------------------
-    // Unicode chara types
+    // Unicode char types
     enum {
         UNASSIGNED              = 0,
         UPPERCASE_LETTER        = 1,
@@ -59,17 +59,17 @@
         DASH_PUNCTUATION        = 19,
         START_PUNCTUATION       = 20,
         END_PUNCTUATION         = 21,
-               CONNECTOR_PUNCTUATION   = 22,
+        CONNECTOR_PUNCTUATION   = 22,
         OTHER_PUNCTUATION       = 23,
         MATH_SYMBOL             = 24,
         CURRENCY_SYMBOL         = 25,
         MODIFIER_SYMBOL         = 26,
         OTHER_SYMBOL            = 27,
-               INITIAL_PUNCTUATION     = 28,
-               FINAL_PUNCTUATION       = 29
-       };
+        INITIAL_PUNCTUATION     = 28,
+        FINAL_PUNCTUATION       = 29
+    };
 
-       /** destructor */
+    /** destructor */
     ~XMLUniCharacter() {}
 
     /* Static methods for getting unicode character type */
@@ -81,7 +81,7 @@
       * @param ch The character we want to get its unicode type
       */
     static unsigned short getType(const XMLCh ch);
-       //@}
+    //@}
 
 private :
 

Modified: xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp
URL: http://svn.apache.org/viewvc/xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp (original)
+++ xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp Thu Jun 26 05:19:31 2008
@@ -399,6 +399,18 @@
     while((nRead=stream->readBytes(buffer, 255)) >0)
     {
         buffer[nRead]=0;
+        // sending data containing \n\r to cout generates \n\n\r, so strip any \r
+        XMLSize_t idx=0;
+        while(true)
+        {
+            int cr=XMLString::indexOf((const char*)buffer, '\r', idx);
+            if(cr==-1)
+                break;
+            memmove(&buffer[cr], &buffer[cr+1], XMLString::stringLen((const char*)&buffer[cr+1])+1);
+            idx=cr;
+            if(buffer[idx]==0)
+                break;
+        }
         XERCES_STD_QUALIFIER cout << (const char*)buffer;
     }
     XERCES_STD_QUALIFIER cout << XERCES_STD_QUALIFIER endl;



---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to