knoaman 2003/09/30 17:20:41 Modified: c/src/xercesc/util XMLUri.hpp XMLUri.cpp Log: Add a static method to check whether a given string is a valid URI. Revision Changes Path 1.12 +21 -4 xml-xerces/c/src/xercesc/util/XMLUri.hpp Index: XMLUri.hpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUri.hpp,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- XMLUri.hpp 25 Sep 2003 22:23:25 -0000 1.11 +++ XMLUri.hpp 1 Oct 2003 00:20:41 -0000 1.12 @@ -57,6 +57,9 @@ /* * $Id$ * $Log$ + * Revision 1.12 2003/10/01 00:20:41 knoaman + * Add a static method to check whether a given string is a valid URI. + * * Revision 1.11 2003/09/25 22:23:25 peiyongz * Implementation of Serialization/Deserialization * @@ -144,7 +147,7 @@ * */ - class XMLUTIL_EXPORT XMLUri : public XSerializable, public XMemory +class XMLUTIL_EXPORT XMLUri : public XSerializable, public XMemory { public: @@ -366,6 +369,12 @@ */ static bool isURIString(const XMLCh* const uric); + /** + * Determine whether a given string is a valid URI + */ + static bool isValidURI( const XMLUri* const baseURI + , const XMLCh* const uriStr); + /*** * Support for Serialization/De-serialization ***/ @@ -452,7 +461,7 @@ * * @return true if the string is a syntactically valid IPv4 address */ - static bool isWellFormedIPv4Address(const XMLCh* const addr, const int& length); + static bool isWellFormedIPv4Address(const XMLCh* const addr, const int length); /** * Determines whether a string is an IPv6 reference as defined @@ -467,7 +476,7 @@ * * @return true if the string is a syntactically valid IPv6 reference */ - static bool isWellFormedIPv6Reference(const XMLCh* const addr, const int& length); + static bool isWellFormedIPv6Reference(const XMLCh* const addr, const int length); /** * Helper function for isWellFormedIPv6Reference which scans the @@ -553,6 +562,14 @@ * */ void cleanUp(); + + static bool isConformantSchemeName(const XMLCh* 
const scheme, + const int schemeLen); + static bool processScheme(const XMLCh* const uriStr, int& index); + static bool processAuthority(const XMLCh* const uriStr, const int authLen); + static bool isWellFormedAddress(const XMLCh* const addr, const int addrLen); + static bool processPath(const XMLCh* const pathStr, const int pathStrLen, + const bool isSchemePresent); // ----------------------------------------------------------------------- // Data members 1.16 +412 -32 xml-xerces/c/src/xercesc/util/XMLUri.cpp Index: XMLUri.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUri.cpp,v retrieving revision 1.15 retrieving revision 1.16 diff -u -r1.15 -r1.16 --- XMLUri.cpp 25 Sep 2003 22:23:25 -0000 1.15 +++ XMLUri.cpp 1 Oct 2003 00:20:41 -0000 1.16 @@ -61,13 +61,10 @@ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- -#include <xercesc/util/PlatformUtils.hpp> #include <xercesc/util/Janitor.hpp> #include <xercesc/util/XMLURL.hpp> #include <xercesc/util/XMLUri.hpp> -#include <xercesc/util/XMLString.hpp> -#include <xercesc/util/XMLUniDefs.hpp> -#include <xercesc/util/XMLUni.hpp> +#include <xercesc/util/XMLChar.hpp> XERCES_CPP_NAMESPACE_BEGIN @@ -1512,12 +1509,11 @@ // // IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT // -bool XMLUri::isWellFormedIPv4Address(const XMLCh* const addr, const int& length) +bool XMLUri::isWellFormedIPv4Address(const XMLCh* const addr, const int length) { int numDots = 0; int numDigits = 0; - // // IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT // // make sure that @@ -1525,24 +1521,20 @@ // 2) that any dot separator is preceded and followed by a digit // 3) that we find 3 dots // 4) that each segment contains 1 to 3 digits. - // 5) that each segment is not greater than 255. 
- + // 5) that each segment is not greater than 255. for (int i = 0; i < length; ++i) { if (addr[i] == chPeriod) { - if (((i > 0) && - (!XMLString::isDigit(addr[i-1]))) || - ((i + 1 < length) && - (!XMLString::isDigit(addr[i+1]))) ) + if ((i == 0) || + (i+1 == length) || + !XMLString::isDigit(addr[i+1])) { return false; } numDigits = 0; if (++numDots > 3) - { return false; - } } else if (!XMLString::isDigit(addr[i])) { @@ -1575,7 +1567,7 @@ // // IPv6reference = "[" IPv6address "]" // -bool XMLUri::isWellFormedIPv6Reference(const XMLCh* const addr, const int& length) +bool XMLUri::isWellFormedIPv6Reference(const XMLCh* const addr, const int length) { int index = 1; int end = length-1; @@ -1623,18 +1615,9 @@ else { if (counter == 6) - { - XMLCh* ipVfour = (XMLCh*) - XMLPlatformUtils::fgMemoryManager->allocate((length+1) * sizeof(XMLCh)); - XMLString::subString(ipVfour, addr, index+1, end); - ArrayJanitor<XMLCh> janitor(ipVfour, XMLPlatformUtils::fgMemoryManager); - int newLength = XMLString::stringLen(ipVfour); - return isWellFormedIPv4Address(ipVfour, newLength); - } + return isWellFormedIPv4Address(addr+index+1, end-index-1); else - { return false; - } } } else @@ -1658,12 +1641,8 @@ // The address ends in an IPv4 address, or it is invalid. // scanHexSequence has already made sure that we have the right number of bits. - XMLCh* ipVfour = - (XMLCh*) XMLPlatformUtils::fgMemoryManager->allocate((length+1) * sizeof(XMLCh)); - XMLString::subString(ipVfour, addr, (counter > prevCount) ? index+1 : index, end); - ArrayJanitor<XMLCh> janitor(ipVfour, XMLPlatformUtils::fgMemoryManager); - int newLength = XMLString::stringLen(ipVfour); - return isWellFormedIPv4Address(ipVfour, newLength); + int shiftCount = (counter > prevCount) ? 
index+1 : index; + return isWellFormedIPv4Address(addr + shiftCount, end - shiftCount); } // @@ -1811,6 +1790,407 @@ // Cap it off in case the last op was not a string copy *outPtr = 0; +} + +// NOTE: no check for NULL value of uriStr (caller responsibility) +bool XMLUri::isValidURI(const XMLUri* const baseURI + , const XMLCh* const uriStr) +{ + // get a trimmed version of uriStr + // uriStr will NO LONGER be used in this function. + const XMLCh* trimedUriSpec = uriStr; + + while (XMLChar1_0::isWhitespace(*trimedUriSpec)) + trimedUriSpec++; + + int trimedUriSpecLen = XMLString::stringLen(trimedUriSpec); + + while (trimedUriSpecLen) { + if (XMLChar1_0::isWhitespace(trimedUriSpec[trimedUriSpecLen-1])) + trimedUriSpecLen--; + else + break; + } + + if (trimedUriSpecLen == 0) + { + if (!baseURI) + return false; + else + return true; + } + + int index = 0; + bool foundScheme = false; + + // Check for scheme, which must be before `/', '?' or '#'. + // Also handle names with DOS drive letters ('D:'), + // so 1-character schemes are not allowed. 
+ int colonIdx = XMLString::indexOf(trimedUriSpec, chColon); + int slashIdx = XMLString::indexOf(trimedUriSpec, chForwardSlash); + int queryIdx = XMLString::indexOf(trimedUriSpec, chQuestion); + int fragmentIdx = XMLString::indexOf(trimedUriSpec, chPound); + + if ((colonIdx < 2) || + (colonIdx > slashIdx && slashIdx != -1) || + (colonIdx > queryIdx && queryIdx != -1) || + (colonIdx > fragmentIdx && fragmentIdx != -1)) + { + // A standalone base is a valid URI according to spec + if (colonIdx == 0 || (!baseURI && fragmentIdx != 0)) + return false; + } + else + { + if (!processScheme(trimedUriSpec, index)) + return false; + foundScheme = true; + ++index; + } + + // It's an error if we stop here + if (index == trimedUriSpecLen || (foundScheme && (trimedUriSpec[index] == chPound))) + return false; + + // two slashes means generic URI syntax, so we get the authority + const XMLCh* authUriSpec = trimedUriSpec + index; + if (((index+1) < trimedUriSpecLen) && + XMLString::startsWith(authUriSpec, DOUBLE_SLASH)) + { + index += 2; + int startPos = index; + + // get authority - everything up to path, query or fragment + XMLCh testChar; + while (index < trimedUriSpecLen) + { + testChar = trimedUriSpec[index]; + if (testChar == chForwardSlash || + testChar == chQuestion || + testChar == chPound ) + { + break; + } + + index++; + } + + // if we found authority, parse it out, otherwise we set the + // host to empty string + if (index > startPos) + { + if (!processAuthority(trimedUriSpec + startPos, index - startPos)) + return false; + } + } + + // we need to check if index has exceeded the length or not + if (index < trimedUriSpecLen) + { + if (!processPath(trimedUriSpec + index, trimedUriSpecLen - index, foundScheme)) + return false; + } + + return true; +} + +bool XMLUri::isWellFormedAddress(const XMLCh* const addrString, + const int addrStrLen) +{ + // Check that we have a non-zero length string. 
+ if (addrStrLen == 0) + return false; + + // Check if the host is a valid IPv6reference. + if (*addrString == chOpenSquare) + { + return isWellFormedIPv6Reference(addrString, addrStrLen); + } + + // + // Cannot start with a '.', '-', or end with a '-'. + // + if (*addrString == chPeriod || + *addrString == chDash || + addrString[addrStrLen-1] == chDash) + return false; + + // rightmost domain label starting with digit indicates IP address + // since top level domain label can only start with an alpha + // see RFC 2396 Section 3.2.2 + + int lastPeriodPos = XMLString::lastIndexOf(chPeriod, addrString, addrStrLen); + + // if the string ends with "." + // get the second last "." + if (lastPeriodPos + 1 == addrStrLen) + { + lastPeriodPos = XMLString::lastIndexOf(chPeriod, addrString, lastPeriodPos); + + if ( XMLString::isDigit(addrString[lastPeriodPos + 1])) + return false; + } + + if (XMLString::isDigit(addrString[lastPeriodPos + 1])) + { + return isWellFormedIPv4Address(addrString, addrStrLen); + } // end of IPv4address + else + { + // + // hostname = *( domainlabel "." ) toplabel [ "." ] + // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum + // toplabel = alpha | alpha *( alphanum | "-" ) alphanum + + // RFC 2396 states that hostnames take the form described in + // RFC 1034 (Section 3) and RFC 1123 (Section 2.1). According + // to RFC 1034, hostnames are limited to 255 characters. 
+ if (addrStrLen > 255) { + return false; + } + + unsigned int labelCharCount = 0; + + // domain labels can contain alphanumerics and '-' + // but must start and end with an alphanumeric + for (int i = 0; i < addrStrLen; i++) + { + if (addrString[i] == chPeriod) + { + if (((i > 0) && + (!XMLString::isAlphaNum(addrString[i-1]))) || + ((i + 1 < addrStrLen) && + (!XMLString::isAlphaNum(addrString[i+1]))) ) + { + return false; + } + labelCharCount = 0; + } + else if (!XMLString::isAlphaNum(addrString[i]) && + addrString[i] != chDash) + { + return false; + } + // RFC 1034: Labels must be 63 characters or less. + else if (++labelCharCount > 63) { + return false; + } + } //for + } + + return true; +} + +bool XMLUri::processScheme(const XMLCh* const schemeStr, int& index) +{ + const XMLCh* tmpPtr = XMLString::findAny(schemeStr, SCHEME_SEPARATORS); + + if (tmpPtr) { + index = tmpPtr - schemeStr; + return isConformantSchemeName(schemeStr, index); + } + else { + return false; + } +} + + +bool XMLUri::isConformantSchemeName( const XMLCh* const scheme + , const int schemeLen) +{ + if (!XMLString::isAlpha(*scheme)) // first: alpha + return false; + + // second onwards: ( alpha | digit | "+" | "-" | "." ) + for (int i=1; i<schemeLen; i++) + { + if ( !XMLString::isAlphaNum(scheme[i]) && + (XMLString::indexOf(SCHEME_CHARACTERS, scheme[i]) == -1)) + return false; + } + + return true; +} + +bool XMLUri::processAuthority( const XMLCh* const authSpec + , const int authLen) +{ + int index = XMLString::indexOf(authSpec, chAt); + int start = 0; + + // server = [ [ userinfo "@" ] hostport ] + // userinfo is everything up to @, + const XMLCh* userinfo; + int userInfoLen = 0; + if ((index != -1) && (index < authLen)) + { + userinfo = authSpec; + userInfoLen = index; + start = index + 1; + } + else + { + userinfo = XMLUni::fgZeroLenString; + } + + // hostport = host [ ":" port ] + // host is everything up to ':', or up to + // and including ']' if followed by ':'. 
+ // + // Search for port boundary. + const XMLCh* host; + int hostLen = 0; + if ((start < authLen) && (authSpec[start] == chOpenSquare)) + { + index = XMLString::indexOf(&(authSpec[start]), chCloseSquare); + if ((index != -1) && (index < authLen)) + { + // skip the ']' + index = ((start + index + 1) < authLen + && authSpec[start + index + 1] == chColon) ? index+1 : -1; + } + } + else + { + index = XMLString::indexOf(&(authSpec[start]), chColon); + if (index >= authLen) + index = -1; + } + + host = &(authSpec[start]); + if (index != -1) + { + hostLen = index; + start += index + 1; // skip the : + } + else + { + hostLen = authLen - start; + start = authLen; + } + + // port is everything after ":" + int port = -1; + if ((hostLen) && // non empty host + (index != -1) && // ":" found + (start < authLen) ) // ":" is not the last + { + const XMLCh* portStr = &(authSpec[start]); + if (*portStr) + { + port = 0; + for (int i=0; i<(authLen - start); i++) + { + if (portStr[i] < chDigit_0 || portStr[i] > chDigit_9) + return false; + + port = (port * 10) + (int) (portStr[i] - chDigit_0); + } + + } + } + + // The order is important, do not change + if (!isWellFormedAddress(host, hostLen)) + return false; + + // check port number + if ((port > 65535) || (port < 0 && port != -1)) + return false; + + // check userinfo + index = 0; + while (index < userInfoLen) + { + if (isUnreservedCharacter(userinfo[index]) || + (XMLString::indexOf(USERINFO_CHARACTERS, userinfo[index]) != -1)) + { + index++; + } + else if (userinfo[index] == chPercent) // '%' + { + if (XMLString::isHex(userinfo[index+1]) && // 1st hex + XMLString::isHex(userinfo[index+2]) ) // 2nd hex + index +=3; + else + return false; + } + else + return false; + } //while + + return true; +} + +bool XMLUri::processPath(const XMLCh* const pathStr, + const int pathStrLen, + const bool isSchemePresent) +{ + if (pathStrLen != 0) + { + int index = 0; + XMLCh testChar = chNull; + bool isOpaque = (!isSchemePresent || *pathStr == 
chForwardSlash); + + // path - everything up to query string or fragment + // + // RFC 2732 only allows '[' and ']' to appear in the opaque part. + while (index < pathStrLen) + { + testChar = pathStr[index]; + if (testChar == chQuestion || testChar == chPound) + break; + + if (testChar == chPercent) + { + if (index+2 >= pathStrLen || + !XMLString::isHex(pathStr[index+1]) || + !XMLString::isHex(pathStr[index+2])) + return false; + } + else if (!isUnreservedCharacter(testChar) && + ((isOpaque && !isPathCharacter(testChar)) || + (!isOpaque && !isReservedCharacter(testChar)))) + { + return false; + } + + index++; + } + + // query - starts with ? and up to fragment or end + // fragment - starts with # + bool isQuery = (testChar == chQuestion); + if (isQuery || testChar == chPound) + { + index++; + while (index < pathStrLen) + { + testChar = pathStr[index]; + if (testChar == chPound && isQuery) { + isQuery = false; + index++; + continue; + } + + if (testChar == chPercent) + { + if (index+2 >= pathStrLen || + !XMLString::isHex(pathStr[index+1]) || + !XMLString::isHex(pathStr[index+2])) + return false; + } + else if (!isUnreservedCharacter(testChar) && + !isReservedCharacter(testChar)) + { + return false; + } + index++; + } + } + } //if (pathStrLen...) + + return true; } /***
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]