mrglavas 2003/07/14 21:09:45 Modified: java/src/org/apache/xerces/util URI.java Log: Fixed 16566, 18320, 18344, 18345, 18776, 18780, 18782, 18785 (partial). We're much closer to meeting RFC 2396, though a few other fixes remain. Support for IPv6 was the largest change. Revision Changes Path 1.8 +335 -87 xml-xerces/java/src/org/apache/xerces/util/URI.java Index: URI.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/URI.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- URI.java 13 Jan 2003 17:12:54 -0000 1.7 +++ URI.java 15 Jul 2003 04:09:45 -0000 1.8 @@ -2,7 +2,7 @@ * The Apache Software License, Version 1.1 * * - * Copyright (c) 1999-2002 The Apache Software Foundation. All rights + * Copyright (c) 1999-2003 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without @@ -67,16 +67,18 @@ * <p> * Parsing of a URI specification is done according to the URI * syntax described in RFC 2396 -* <http://www.ietf.org/rfc/rfc2396.txt?number=2396>. Every URI consists -* of a scheme, followed by a colon (':'), followed by a scheme-specific -* part. For URIs that follow the "generic URI" syntax, the scheme- -* specific part begins with two slashes ("//") and may be followed -* by an authority segment (comprised of user information, host, and -* port), path segment, query segment and fragment. Note that RFC 2396 -* no longer specifies the use of the parameters segment and excludes -* the "user:password" syntax as part of the authority segment. If -* "user:password" appears in a URI, the entire user/password string -* is stored as userinfo. +* <http://www.ietf.org/rfc/rfc2396.txt?number=2396>, and amended by +* RFC 2732 <http://www.ietf.org/rfc/rfc2732.txt?number=2732>. +* <p> +* Every absolute URI consists of a scheme, followed by a colon (':'), +* followed by a scheme-specific part. 
For URIs that follow the +* "generic URI" syntax, the scheme-specific part begins with two +* slashes ("//") and may be followed by an authority segment (comprised +* of user information, host, and port), path segment, query segment +* and fragment. Note that RFC 2396 no longer specifies the use of the +* parameters segment and excludes the "user:password" syntax as part of +* the authority segment. If "user:password" appears in a URI, the entire +* user/password string is stored as userinfo. * <p> * For URIs that do not follow the "generic URI" syntax (e.g. mailto), * the entire scheme-specific part is treated as the "path" portion @@ -149,6 +151,9 @@ /** ASCII hex characters */ private static final int ASCII_HEX_CHARACTERS = 0x40; + + /** Path characters */ + private static final int PATH_CHARACTERS = 0x80; /** Mask for alpha-numeric characters */ private static final int MASK_ALPHA_NUMERIC = ASCII_ALPHA_CHARACTERS | ASCII_DIGIT_CHARACTERS; @@ -165,6 +170,9 @@ /** Mask for userinfo characters */ private static final int MASK_USERINFO_CHARACTER = MASK_UNRESERVED_MASK | USERINFO_CHARACTERS; + /** Mask for path characters */ + private static final int MASK_PATH_CHARACTER = MASK_UNRESERVED_MASK | PATH_CHARACTERS; + static { // Add ASCII Digits and ASCII Hex Numbers for (int i = '0'; i <= '9'; ++i) { @@ -221,6 +229,17 @@ fgLookupTable['+'] |= USERINFO_CHARACTERS; fgLookupTable['$'] |= USERINFO_CHARACTERS; fgLookupTable[','] |= USERINFO_CHARACTERS; + + // Add Path Characters + fgLookupTable[';'] |= PATH_CHARACTERS; + fgLookupTable['/'] |= PATH_CHARACTERS; + fgLookupTable[':'] |= PATH_CHARACTERS; + fgLookupTable['@'] |= PATH_CHARACTERS; + fgLookupTable['&'] |= PATH_CHARACTERS; + fgLookupTable['='] |= PATH_CHARACTERS; + fgLookupTable['+'] |= PATH_CHARACTERS; + fgLookupTable['$'] |= PATH_CHARACTERS; + fgLookupTable[','] |= PATH_CHARACTERS; } /** Stores the scheme (usually the protocol) for this URI. 
*/ @@ -451,10 +470,10 @@ private void initialize(URI p_base, String p_uriSpec) throws MalformedURIException { - String uriSpec = (p_uriSpec != null) ? p_uriSpec.trim() : null; - int uriSpecLen = (uriSpec != null) ? uriSpec.length() : 0; + String uriSpec = (p_uriSpec != null) ? p_uriSpec.trim() : null; + int uriSpecLen = (uriSpec != null) ? uriSpec.length() : 0; - if (p_base == null && uriSpecLen == 0) { + if (p_base == null && uriSpecLen == 0) { throw new MalformedURIException( "Cannot initialize URI with empty parameters."); } @@ -480,13 +499,18 @@ (colonIdx > queryIdx && queryIdx != -1) || (colonIdx > fragmentIdx && fragmentIdx != -1)) { // A standalone base is a valid URI according to spec - if (p_base == null && fragmentIdx != 0 ) { + if (colonIdx == 0 || (p_base == null && fragmentIdx != 0)) { throw new MalformedURIException("No scheme found in URI."); } } else { initializeScheme(uriSpec); index = m_scheme.length()+1; + + // Neither 'scheme:' or 'scheme:#fragment' are valid URIs. + if (colonIdx == uriSpecLen - 1 || uriSpec.charAt(colonIdx+1) == '#') { + throw new MalformedURIException("Scheme specific part cannot be empty."); + } } // two slashes means generic URI syntax, so we get the authority @@ -508,7 +532,7 @@ // if we found authority, parse it out, otherwise we set the // host to empty string if (index > startPos) { - initializeAuthority(uriSpec, startPos, index); + initializeAuthority(uriSpec.substring(startPos, index)); } else { m_host = ""; @@ -573,16 +597,19 @@ // if we get to this point, we need to resolve relative path // RFC 2396 5.2 #6 - String path = new String(); + String path = ""; String basePath = p_base.getPath(); // 6a - get all but the last segment of the base URI path - if (basePath != null) { + if (basePath != null && basePath.length() > 0) { int lastSlash = basePath.lastIndexOf('/'); if (lastSlash != -1) { path = basePath.substring(0, lastSlash+1); } } + else if (m_path.length() > 0) { + path = "/"; + } // 6b - append the relative URI 
path path = path.concat(m_path); @@ -670,17 +697,16 @@ * URI from a URI string spec. * * @param p_uriSpec the URI specification (cannot be null) - * @param p_nStartIndex the index to begin scanning from - * @param p_nEndIndex the index to end scanning at * * @exception MalformedURIException if p_uriSpec violates syntax rules */ - private void initializeAuthority(String p_uriSpec, int p_nStartIndex, int p_nEndIndex) + private void initializeAuthority(String p_uriSpec) throws MalformedURIException { - int index = p_nStartIndex; - int start = p_nStartIndex; - int end = p_nEndIndex; + int index = 0; + int start = 0; + int end = p_uriSpec.length(); + char testChar = '\0'; String userinfo = null; @@ -697,21 +723,34 @@ index++; } - // host is everything up to ':' + // host is everything up to last ':', or up to + // and including ']' if followed by ':'. String host = null; start = index; - while (index < end) { - testChar = p_uriSpec.charAt(index); - if (testChar == ':') { - break; + boolean hasPort = false; + if (index < end) { + if (p_uriSpec.charAt(start) == '[') { + int bracketIndex = p_uriSpec.indexOf(']', start); + index = (bracketIndex != -1) ? bracketIndex : end; + if (index+1 < end && p_uriSpec.charAt(index+1) == ':') { + ++index; + hasPort = true; + } + else { + index = end; + } + } + else { + int colonIndex = p_uriSpec.lastIndexOf(':', end); + index = (colonIndex != -1) ? colonIndex : end; + hasPort = (index != end); } - index++; } host = p_uriSpec.substring(start, index); int port = -1; if (host.length() > 0) { // port - if (testChar == ':') { + if (hasPort) { index++; start = index; while (index < end) { @@ -761,25 +800,69 @@ char testChar = '\0'; // path - everything up to query string or fragment - while (index < end) { - testChar = p_uriSpec.charAt(index); - if (testChar == '?' 
|| testChar == '#') { - break; - } - // check for valid escape sequence - if (testChar == '%') { - if (index+2 >= end || - !isHex(p_uriSpec.charAt(index+1)) || - !isHex(p_uriSpec.charAt(index+2))) { - throw new MalformedURIException( - "Path contains invalid escape sequence!"); - } - } - else if (!isURICharacter(testChar)) { - throw new MalformedURIException( - "Path contains invalid character: " + testChar); - } - index++; + if (start < end) { + // RFC 2732 only allows '[' and ']' to appear in the opaque part. + if (getScheme() == null || p_uriSpec.charAt(start) == '/') { + + // Scan path. + // abs_path = "/" path_segments + // rel_path = rel_segment [ abs_path ] + while (index < end) { + testChar = p_uriSpec.charAt(index); + + // check for valid escape sequence + if (testChar == '%') { + if (index+2 >= end || + !isHex(p_uriSpec.charAt(index+1)) || + !isHex(p_uriSpec.charAt(index+2))) { + throw new MalformedURIException( + "Path contains invalid escape sequence!"); + } + } + // Path segments cannot contain '[' or ']' since pchar + // production was not changed by RFC 2732. + else if (!isPathCharacter(testChar)) { + if (testChar == '?' || testChar == '#') { + break; + } + throw new MalformedURIException( + "Path contains invalid character: " + testChar); + } + ++index; + } + } + else { + + // Scan opaque part. + // opaque_part = uric_no_slash *uric + while (index < end) { + testChar = p_uriSpec.charAt(index); + + if (testChar == '?' || testChar == '#') { + break; + } + + // check for valid escape sequence + if (testChar == '%') { + if (index+2 >= end || + !isHex(p_uriSpec.charAt(index+1)) || + !isHex(p_uriSpec.charAt(index+2))) { + throw new MalformedURIException( + "Opaque part contains invalid escape sequence!"); + } + } + // If the scheme specific part is opaque, it can contain '[' + // and ']'. 
uric_no_slash wasn't modified by RFC 2732, which + // I've interpreted as an error in the spec, since the + // production should be equivalent to (uric - '/'), and uric + // contains '[' and ']'. - mrglavas + else if (!isURICharacter(testChar)) { + throw new MalformedURIException( + "Opaque part contains invalid character: " + testChar); + } + ++index; + } + } } m_path = p_uriSpec.substring(start, index); @@ -802,7 +885,7 @@ } else if (!isURICharacter(testChar)) { throw new MalformedURIException( - "Query string contains invalid character:" + testChar); + "Query string contains invalid character: " + testChar); } index++; } @@ -826,7 +909,7 @@ } else if (!isURICharacter(testChar)) { throw new MalformedURIException( - "Fragment contains invalid character:"+testChar); + "Fragment contains invalid character: "+testChar); } index++; } @@ -1051,7 +1134,7 @@ * address or DNS hostname. */ public void setHost(String p_host) throws MalformedURIException { - if (p_host == null || p_host.trim().length() == 0) { + if (p_host == null || p_host.length() == 0) { m_host = p_host; m_userinfo = null; m_port = -1; @@ -1319,25 +1402,30 @@ /** * Determine whether a string is syntactically capable of representing - * a valid IPv4 address or the domain name of a network host. A valid - * IPv4 address consists of four decimal digit groups separated by a - * '.'. A hostname consists of domain labels (each of which must - * begin and end with an alphanumeric but may contain '-') separated - & by a '.'. See RFC 2396 Section 3.2.2. + * a valid IPv4 address, IPv6 reference or the domain name of a network host. + * A valid IPv4 address consists of four decimal digit groups separated by a + * '.'. Each group must consist of one to three digits. See RFC 2732 Section 3, + * and RFC 2373 Appendix B, for the definition of IPv6 references. A hostname + * consists of domain labels (each of which must begin and end with an alphanumeric + * but may contain '-') separated & by a '.'. 
See RFC 2396 Section 3.2.2. * - * @return true if the string is a syntactically valid IPv4 address - * or hostname + * @return true if the string is a syntactically valid IPv4 address, + * IPv6 reference or hostname */ - public static boolean isWellFormedAddress(String p_address) { - if (p_address == null) { + public static boolean isWellFormedAddress(String address) { + if (address == null) { return false; } - String address = p_address.trim(); int addrLength = address.length(); - if (addrLength == 0 || addrLength > 255) { + if (addrLength == 0) { return false; } + + // Check if the host is a valid IPv6reference. + if (address.startsWith("[")) { + return isWellFormedIPv6Reference(address); + } if (address.startsWith(".") || address.startsWith("-")) { return false; @@ -1351,52 +1439,202 @@ index = address.substring(0, index).lastIndexOf('.'); } - if (index+1 < addrLength && isDigit(p_address.charAt(index+1))) { + if (index+1 < addrLength && isDigit(address.charAt(index+1))) { + return isWellFormedIPv4Address(address); + } + else { + // domain labels can contain alphanumerics and '-" + // but must start and end with an alphanumeric char testChar; - int numDots = 0; - // make sure that 1) we see only digits and dot separators, 2) that - // any dot separator is preceded and followed by a digit and - // 3) that we find 3 dots for (int i = 0; i < addrLength; i++) { testChar = address.charAt(i); if (testChar == '.') { - if (!isDigit(address.charAt(i-1)) || - (i+1 < addrLength && !isDigit(address.charAt(i+1)))) { + if (!isAlphanum(address.charAt(i-1))) { + return false; + } + if (i+1 < addrLength && !isAlphanum(address.charAt(i+1))) { return false; } - numDots++; } - else if (!isDigit(testChar)) { + else if (!isAlphanum(testChar) && testChar != '-') { return false; } } - if (numDots != 3) { - return false; - } } - else { - // domain labels can contain alphanumerics and '-" - // but must start and end with an alphanumeric + return true; + } + + /** + * <p>Determines 
whether a string is an IPv4 address + * as defined by RFC 2373.</p> + * + * <p>IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT</p> + * + * @return true if the string is a syntactically valid IPv4 address + */ + public static boolean isWellFormedIPv4Address(String address) { + + int addrLength = address.length(); char testChar; + int numDots = 0; + int numDigits = 0; + // make sure that 1) we see only digits and dot separators, 2) that + // any dot separator is preceded and followed by a digit and + // 3) that we find 3 dots + // + // RFC 2732 amended RFC 2396 by replacing the definition + // of IPv4address with the one defined by RFC 2373. - mrglavas + // + // IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT + // + // One to three digits must be in each segment. for (int i = 0; i < addrLength; i++) { testChar = address.charAt(i); if (testChar == '.') { - if (!isAlphanum(address.charAt(i-1))) { - return false; - } - if (i+1 < addrLength && !isAlphanum(address.charAt(i+1))) { + if ((i > 0 && !isDigit(address.charAt(i-1))) || + (i+1 < addrLength && !isDigit(address.charAt(i+1)))) { return false; } + numDigits = 0; + numDots++; } - else if (!isAlphanum(testChar) && testChar != '-') { + else if (!isDigit(testChar)) { + return false; + } + else if (++numDigits > 3) { return false; } } - } - return true; + return (numDots == 3); + } + + /** + * <p>Determines whether a string is an IPv6 reference.</p> + * + * <p>IPv6reference = "[" IPv6address "]" <br> + * IPv6address = hexpart [ ":" IPv4address ] <br> + * IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 
1*3DIGIT <br> + * hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] <br> + * hexseq = hex4 *( ":" hex4) <br> + * hex4 = 1*4HEXDIG</p> + * + * @return true if the string is a syntactically valid IPv6 reference + */ + public static boolean isWellFormedIPv6Reference(String address) { + int addrLength = address.length(); + int start = 1; + int end = addrLength-1; + int index = start; + + // Check if string is a potential match for IPv6reference. + if (!(addrLength > 2 && address.charAt(0) == '[' + && address.charAt(end) == ']')) { + return false; + } + + // The production hexpart can be rewritten as: + // hexpart = hexseq | [hexseq] "::" [hexseq] + // which means as long as we see one of the following + // three groups, then we have a match. + + // 1. Scan hex sequence before possible '::'. + index = scanHexSequence(address, index, end); + if (index == -1) { + return false; + } + else if (index == end) { + return true; + } + + // 2. Skip '::' if present. + if (index + 1 < end && address.charAt(index) == ':') { + if (address.charAt(index+1) == ':') { + index += 2; + if (index == end) { + return true; + } + } + // If the second character wasn't ':', the remainder of the + // string must match IPv4Address. IPv6Address cannot + // start with [":" IPv4Address]. + else { + return (index > start) && + isWellFormedIPv4Address(address.substring(index+1, end)); + } + } + + // 3. Scan hex sequence after '::'. + index = scanHexSequence(address, index, end); + if (index == -1) { + return false; + } + else if (index == end) { + return true; + } + + // If we've gotten this far then the string is a valid + // IPv6 reference only if it contained a valid hexpart, + // and it has an IPv4 address. + // + // REVISIT: The example given for an IPv6 reference + // http://[::192.9.5.5]/ipng in RFC 2732 is an error, or + // the BNF for IPv6address is incorrect. 
In order to be + // valid for the grammar defined in RFC 2373, if the hexpart + // is only '::', and if the address contains an IPv4 address, + // '::' must be followed by another ':'. Going with the BNF + // from RFC 2373 for now. - mrglavas + if (index > start && index+1 < end && address.charAt(index) == ':') { + return isWellFormedIPv4Address(address.substring(index+1, end)); + } + + return false; } + + /** + * Helper method for isWellFormedIPv6Reference which scans hex sequences. + * It returns the index of the next character to scan, or -1 if the + * string region cannot match a valid IPv6 address. + * + * @param sequence the string to be scanned + * @param index the beginning index (inclusive) + * @param end the ending index (exclusive) + * + * @return the index of the next character to scan, or -1 if the + * string region cannot match a valid IPv6 address + */ + private static int scanHexSequence (String sequence, int index, int end) { + + char testChar; + int numDigits = 0; + int start = index; + + // Trying to match the following productions: + // hexseq = hex4 *( ":" hex4) + // hex4 = 1*4HEXDIG + for (; index < end; ++index) { + testChar = sequence.charAt(index); + if (testChar == ':') { + if (numDigits == 0 || ((index+1 < end) && sequence.charAt(index+1) == ':')) { + return index; + } + numDigits = 0; + } + // This might be invalid or an IPv4address. If it's potentially an IPv4address, + // backup to the ':' before the first hex digit in this group. + else if (!isHex(testChar)) { + int back = index - numDigits - 1; + return (testChar == '.' && numDigits < 4 && numDigits > 0 + && back >= start && sequence.charAt(back) == ':') ? back : -1; + } + // There can be at most 4 hex digits per group. + else if (++numDigits > 4) { + return -1; + } + } + return (numDigits > 0) ? 
end : -1; + } /** @@ -1482,6 +1720,16 @@ private static boolean isUserinfoCharacter (char p_char) { return (p_char <= 'z' && (fgLookupTable[p_char] & MASK_USERINFO_CHARACTER) != 0); } + + /** + * Determine whether a char is a path character. + * + * @return true if the char is a path character, false otherwise + */ + private static boolean isPathCharacter (char p_char) { + return (p_char <= '~' && (fgLookupTable[p_char] & MASK_PATH_CHARACTER) != 0); + } + /** * Determine whether a given string contains only URI characters (also
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]