Author: amassari
Date: Thu Jun 26 05:19:31 2008
New Revision: 671870
URL: http://svn.apache.org/viewvc?rev=671870&view=rev
Log:
Define regular expression category \w as defined by the XMLSchema specs
(XERCESC-541)
Modified:
xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp
xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp
xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp
xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp
xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp
Modified: xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.cpp Thu Jun 26 05:19:31 2008
@@ -34,54 +34,46 @@
// ---------------------------------------------------------------------------
// Local data
// ---------------------------------------------------------------------------
-const int UNICATEGSIZE = 37;
-const unsigned short CHAR_LETTER = 30;
-const unsigned short CHAR_MARK = 31;
-const unsigned short CHAR_NUMBER = 32;
-const unsigned short CHAR_SEPARATOR = 33;
-const unsigned short CHAR_OTHER = 34;
-const unsigned short CHAR_PUNCTUATION = 35;
-const unsigned short CHAR_SYMBOL = 36;
const XMLCh uniCategNames[][3] =
{
- {chLatin_C, chLatin_n, chNull},
- {chLatin_L, chLatin_u, chNull},
- {chLatin_L, chLatin_l, chNull},
- {chLatin_L, chLatin_t, chNull},
- {chLatin_L, chLatin_m, chNull},
- {chLatin_L, chLatin_o, chNull},
- {chLatin_M, chLatin_n, chNull},
- {chLatin_M, chLatin_e, chNull},
- {chLatin_M, chLatin_c, chNull},
- {chLatin_N, chLatin_d, chNull},
- {chLatin_N, chLatin_l, chNull},
- {chLatin_N, chLatin_o, chNull},
- {chLatin_Z, chLatin_s, chNull},
- {chLatin_Z, chLatin_l, chNull},
- {chLatin_Z, chLatin_p, chNull},
- {chLatin_C, chLatin_c, chNull},
- {chLatin_C, chLatin_f, chNull},
- {chLatin_C, chLatin_o, chNull},
- {chLatin_C, chLatin_s, chNull},
- {chLatin_P, chLatin_d, chNull},
- {chLatin_P, chLatin_s, chNull},
- {chLatin_P, chLatin_e, chNull},
- {chLatin_P, chLatin_c, chNull},
- {chLatin_P, chLatin_o, chNull},
- {chLatin_S, chLatin_m, chNull},
- {chLatin_S, chLatin_c, chNull},
- {chLatin_S, chLatin_k, chNull},
- {chLatin_S, chLatin_o, chNull},
- {chLatin_P, chLatin_i, chNull},
- {chLatin_P, chLatin_f, chNull},
- {chLatin_L, chNull},
- {chLatin_M, chNull},
- {chLatin_N, chNull},
- {chLatin_Z, chNull},
- {chLatin_C, chNull},
- {chLatin_P, chNull},
- {chLatin_S, chNull},
+ {chLatin_C, chLatin_n, chNull}, // UNASSIGNED
+ {chLatin_L, chLatin_u, chNull}, // UPPERCASE_LETTER
+ {chLatin_L, chLatin_l, chNull}, // LOWERCASE_LETTER
+ {chLatin_L, chLatin_t, chNull}, // TITLECASE_LETTER
+ {chLatin_L, chLatin_m, chNull}, // MODIFIER_LETTER
+ {chLatin_L, chLatin_o, chNull}, // OTHER_LETTER
+ {chLatin_M, chLatin_n, chNull}, // NON_SPACING_MARK
+ {chLatin_M, chLatin_e, chNull}, // ENCLOSING_MARK
+ {chLatin_M, chLatin_c, chNull}, // COMBINING_SPACING_MARK
+ {chLatin_N, chLatin_d, chNull}, // DECIMAL_DIGIT_NUMBER
+ {chLatin_N, chLatin_l, chNull}, // LETTER_NUMBER
+ {chLatin_N, chLatin_o, chNull}, // OTHER_NUMBER
+ {chLatin_Z, chLatin_s, chNull}, // SPACE_SEPARATOR
+ {chLatin_Z, chLatin_l, chNull}, // LINE_SEPARATOR
+ {chLatin_Z, chLatin_p, chNull}, // PARAGRAPH_SEPARATOR
+ {chLatin_C, chLatin_c, chNull}, // CONTROL
+ {chLatin_C, chLatin_f, chNull}, // FORMAT
+ {chLatin_C, chLatin_o, chNull}, // PRIVATE_USE
+ {chLatin_C, chLatin_s, chNull}, // SURROGATE
+ {chLatin_P, chLatin_d, chNull}, // DASH_PUNCTUATION
+ {chLatin_P, chLatin_s, chNull}, // START_PUNCTUATION
+ {chLatin_P, chLatin_e, chNull}, // END_PUNCTUATION
+ {chLatin_P, chLatin_c, chNull}, // CONNECTOR_PUNCTUATION
+ {chLatin_P, chLatin_o, chNull}, // OTHER_PUNCTUATION
+ {chLatin_S, chLatin_m, chNull}, // MATH_SYMBOL
+ {chLatin_S, chLatin_c, chNull}, // CURRENCY_SYMBOL
+ {chLatin_S, chLatin_k, chNull}, // MODIFIER_SYMBOL
+ {chLatin_S, chLatin_o, chNull}, // OTHER_SYMBOL
+ {chLatin_P, chLatin_i, chNull}, // INITIAL_PUNCTUATION
+ {chLatin_P, chLatin_f, chNull}, // FINAL_PUNCTUATION
+ {chLatin_L, chNull}, // CHAR_LETTER
+ {chLatin_M, chNull}, // CHAR_MARK
+ {chLatin_N, chNull}, // CHAR_NUMBER
+ {chLatin_Z, chNull}, // CHAR_SEPARATOR
+ {chLatin_C, chNull}, // CHAR_OTHER
+ {chLatin_P, chNull}, // CHAR_PUNCTUATION
+ {chLatin_S, chNull}, // CHAR_SYMBOL
};
// ---------------------------------------------------------------------------
@@ -108,7 +100,7 @@
}
TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
- RangeToken* ranges[UNICATEGSIZE];
+ RangeToken* ranges[UNICATEGSIZE];
RangeToken* tok;
for (int i=0; i < UNICATEGSIZE; i++) {
@@ -174,7 +166,7 @@
// Create assigned range
tok = (RangeToken*)RangeToken::complementRanges(
ranges[XMLUniCharacter::UNASSIGNED],
- tokFactory,
+ tokFactory,
tokFactory->getMemoryManager());
// build the internal map.
tok->createMap();
@@ -217,7 +209,7 @@
if (fKeywordsInitialized)
return;
- for (int k=0; k < UNICATEGSIZE; k++) {
+ for (int k=0; k < UNICATEGSIZE; k++) {
rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory);
}
Modified: xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/UnicodeRangeFactory.hpp Thu Jun 26 05:19:31 2008
@@ -26,6 +26,7 @@
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/regx/RangeFactory.hpp>
+#include <xercesc/util/regx/XMLUniCharacter.hpp>
XERCES_CPP_NAMESPACE_BEGIN
@@ -33,6 +34,21 @@
public:
// -----------------------------------------------------------------------
+ // Public Constants
+ // -----------------------------------------------------------------------
+ // Unicode categories
+ enum {
+ CHAR_LETTER = XMLUniCharacter::FINAL_PUNCTUATION+1,
+ CHAR_MARK,
+ CHAR_NUMBER,
+ CHAR_SEPARATOR,
+ CHAR_OTHER,
+ CHAR_PUNCTUATION,
+ CHAR_SYMBOL,
+ UNICATEGSIZE
+ };
+
+ // -----------------------------------------------------------------------
// Constructors and operators
// -----------------------------------------------------------------------
UnicodeRangeFactory();
@@ -41,25 +57,26 @@
// -----------------------------------------------------------------------
// Initialization methods
// -----------------------------------------------------------------------
- void initializeKeywordMap(RangeTokenMap *rangeTokMap = 0);
+ void initializeKeywordMap(RangeTokenMap *rangeTokMap = 0);
+
+ // -----------------------------------------------------------------------
+ // Helper methods
+ // -----------------------------------------------------------------------
+ static unsigned short getUniCategory(const unsigned short type);
protected:
// -----------------------------------------------------------------------
// Private Helper methods
// -----------------------------------------------------------------------
- void buildRanges(RangeTokenMap *rangeTokMap = 0);
+ void buildRanges(RangeTokenMap *rangeTokMap = 0);
private:
- // -----------------------------------------------------------------------
+ // -----------------------------------------------------------------------
// Unimplemented constructors and operators
// -----------------------------------------------------------------------
UnicodeRangeFactory(const UnicodeRangeFactory&);
UnicodeRangeFactory& operator=(const UnicodeRangeFactory&);
- // -----------------------------------------------------------------------
- // Helper methods
- // -----------------------------------------------------------------------
- unsigned short getUniCategory(const unsigned short type);
};
XERCES_CPP_NAMESPACE_END
@@ -67,5 +84,5 @@
#endif
/**
- * End file UnicodeRangeFactory.hpp
+ * End file UnicodeRangeFactory.hpp
*/
Modified: xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/XMLRangeFactory.cpp Thu Jun 26 05:19:31 2008
@@ -28,6 +28,7 @@
#include <xercesc/util/regx/TokenFactory.hpp>
#include <xercesc/util/regx/RangeToken.hpp>
#include <xercesc/util/regx/RangeTokenMap.hpp>
+#include <xercesc/util/regx/UnicodeRangeFactory.hpp>
#include <xercesc/util/Janitor.hpp>
#include <string.h>
@@ -204,19 +205,27 @@
rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok , true);
// Create word range
+ // \w = [#x0000-#x10FFFF]-[\p{P}\p{Z}\p{C}] (all characters except the set of "punctuation", "separator" and "other" characters)
tok = tokFactory->createRange();
- tok->setRangeValues(wordRange, wordRangeLen);
- janWordRange.orphan();
+ for(int i=0; i<=0xFFFF; i++)
+ {
+ unsigned short chType=UnicodeRangeFactory::getUniCategory(XMLUniCharacter::getType(i));
+ if(chType == UnicodeRangeFactory::CHAR_PUNCTUATION ||
+ chType == UnicodeRangeFactory::CHAR_SEPARATOR ||
+ chType == UnicodeRangeFactory::CHAR_OTHER)
+ tok->addRange(i, i);
+ }
tok->sortRanges();
tok->compactRanges();
// Build the internal map.
tok->createMap();
- rangeTokMap->setRangeToken(fgXMLWord, tok);
+ rangeTokMap->setRangeToken(fgXMLWord, tok , true);
tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
// Build the internal map.
tok->createMap();
- rangeTokMap->setRangeToken(fgXMLWord, tok , true);
+ rangeTokMap->setRangeToken(fgXMLWord, tok);
+
fRangesCreated = true;
}
Modified: xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/XMLUniCharacter.hpp Thu Jun 26 05:19:31 2008
@@ -35,7 +35,7 @@
// -----------------------------------------------------------------------
// Public Constants
// -----------------------------------------------------------------------
- // Unicode chara types
+ // Unicode char types
enum {
UNASSIGNED = 0,
UPPERCASE_LETTER = 1,
@@ -59,17 +59,17 @@
DASH_PUNCTUATION = 19,
START_PUNCTUATION = 20,
END_PUNCTUATION = 21,
- CONNECTOR_PUNCTUATION = 22,
+ CONNECTOR_PUNCTUATION = 22,
OTHER_PUNCTUATION = 23,
MATH_SYMBOL = 24,
CURRENCY_SYMBOL = 25,
MODIFIER_SYMBOL = 26,
OTHER_SYMBOL = 27,
- INITIAL_PUNCTUATION = 28,
- FINAL_PUNCTUATION = 29
- };
+ INITIAL_PUNCTUATION = 28,
+ FINAL_PUNCTUATION = 29
+ };
- /** destructor */
+ /** destructor */
~XMLUniCharacter() {}
/* Static methods for getting unicode character type */
@@ -81,7 +81,7 @@
* @param ch The character we want to get its unicode type
*/
static unsigned short getType(const XMLCh ch);
- //@}
+ //@}
private :
Modified: xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp?rev=671870&r1=671869&r2=671870&view=diff
==============================================================================
--- xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp (original)
+++ xerces/c/trunk/tests/src/XSTSHarness/XSTSHarnessHandlers.cpp Thu Jun 26 05:19:31 2008
@@ -399,6 +399,18 @@
while((nRead=stream->readBytes(buffer, 255)) >0)
{
buffer[nRead]=0;
+ // sending data containing \n\r to cout generates \n\n\r, so strip any \r
+ XMLSize_t idx=0;
+ while(true)
+ {
+ int cr=XMLString::indexOf((const char*)buffer, '\r', idx);
+ if(cr==-1)
+ break;
+ memmove(&buffer[cr], &buffer[cr+1], XMLString::stringLen((const char*)&buffer[cr+1])+1);
+ idx=cr;
+ if(buffer[idx]==0)
+ break;
+ }
XERCES_STD_QUALIFIER cout << (const char*)buffer;
}
XERCES_STD_QUALIFIER cout << XERCES_STD_QUALIFIER endl;
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]