util URI.java

mrglavas Mon, 14 Jul 2003 21:08:33 -0700

mrglavas    2003/07/14 21:09:45

  Modified:    java/src/org/apache/xerces/util URI.java
  Log:
  Fixed 16566, 18320, 18344, 18345, 18776, 18780, 18782, 18785 (partial).
  
  We're much closer to meeting RFC 2396, though a few other fixes remain.
  Support for IPv6 was the largest change.
  
  Revision  Changes    Path
  1.8       +335 -87   xml-xerces/java/src/org/apache/xerces/util/URI.java
  
  Index: URI.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/URI.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- URI.java  13 Jan 2003 17:12:54 -0000      1.7
  +++ URI.java  15 Jul 2003 04:09:45 -0000      1.8
  @@ -2,7 +2,7 @@
    * The Apache Software License, Version 1.1
    *
    *
  - * Copyright (c) 1999-2002 The Apache Software Foundation.  All rights
  + * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
    * reserved.
    *
    * Redistribution and use in source and binary forms, with or without
  @@ -67,16 +67,18 @@
   * <p>
   * Parsing of a URI specification is done according to the URI
   * syntax described in RFC 2396
  -* <http://www.ietf.org/rfc/rfc2396.txt?number=2396>. Every URI consists
  -* of a scheme, followed by a colon (':'), followed by a scheme-specific
  -* part. For URIs that follow the "generic URI" syntax, the scheme-
  -* specific part begins with two slashes ("//") and may be followed
  -* by an authority segment (comprised of user information, host, and
  -* port), path segment, query segment and fragment. Note that RFC 2396
  -* no longer specifies the use of the parameters segment and excludes
  -* the "user:password" syntax as part of the authority segment. If
  -* "user:password" appears in a URI, the entire user/password string
  -* is stored as userinfo.
  +* <http://www.ietf.org/rfc/rfc2396.txt?number=2396>, and amended by
  +* RFC 2732 <http://www.ietf.org/rfc/rfc2732.txt?number=2732>. 
  +* <p>
  +* Every absolute URI consists of a scheme, followed by a colon (':'), 
  +* followed by a scheme-specific part. For URIs that follow the 
  +* "generic URI" syntax, the scheme-specific part begins with two 
  +* slashes ("//") and may be followed by an authority segment (comprised 
  +* of user information, host, and port), path segment, query segment 
  +* and fragment. Note that RFC 2396 no longer specifies the use of the 
  +* parameters segment and excludes the "user:password" syntax as part of 
  +* the authority segment. If "user:password" appears in a URI, the entire 
  +* user/password string is stored as userinfo.
   * <p>
   * For URIs that do not follow the "generic URI" syntax (e.g. mailto),
   * the entire scheme-specific part is treated as the "path" portion
  @@ -149,6 +151,9 @@
     
     /** ASCII hex characters */
     private static final int ASCII_HEX_CHARACTERS = 0x40;
  +  
  +  /** Path characters */
  +  private static final int PATH_CHARACTERS = 0x80;
   
     /** Mask for alpha-numeric characters */
     private static final int MASK_ALPHA_NUMERIC = ASCII_ALPHA_CHARACTERS | 
ASCII_DIGIT_CHARACTERS;
  @@ -165,6 +170,9 @@
     /** Mask for userinfo characters */
     private static final int MASK_USERINFO_CHARACTER = MASK_UNRESERVED_MASK | 
USERINFO_CHARACTERS;
     
  +  /** Mask for path characters */
  +  private static final int MASK_PATH_CHARACTER = MASK_UNRESERVED_MASK | 
PATH_CHARACTERS; 
  +
     static {
         // Add ASCII Digits and ASCII Hex Numbers
         for (int i = '0'; i <= '9'; ++i) {
  @@ -221,6 +229,17 @@
         fgLookupTable['+'] |= USERINFO_CHARACTERS;
         fgLookupTable['$'] |= USERINFO_CHARACTERS;
         fgLookupTable[','] |= USERINFO_CHARACTERS;
  +      
  +      // Add Path Characters
  +      fgLookupTable[';'] |= PATH_CHARACTERS;
  +      fgLookupTable['/'] |= PATH_CHARACTERS;
  +      fgLookupTable[':'] |= PATH_CHARACTERS;
  +      fgLookupTable['@'] |= PATH_CHARACTERS;
  +      fgLookupTable['&'] |= PATH_CHARACTERS;
  +      fgLookupTable['='] |= PATH_CHARACTERS;
  +      fgLookupTable['+'] |= PATH_CHARACTERS;
  +      fgLookupTable['$'] |= PATH_CHARACTERS;
  +      fgLookupTable[','] |= PATH_CHARACTERS;
     }
   
     /** Stores the scheme (usually the protocol) for this URI. */
  @@ -451,10 +470,10 @@
     private void initialize(URI p_base, String p_uriSpec)
                            throws MalformedURIException {
          
  -     String uriSpec = (p_uriSpec != null) ? p_uriSpec.trim() : null;
  -     int uriSpecLen = (uriSpec != null) ? uriSpec.length() : 0;
  +    String uriSpec = (p_uriSpec != null) ? p_uriSpec.trim() : null;
  +    int uriSpecLen = (uriSpec != null) ? uriSpec.length() : 0;
        
  -     if (p_base == null && uriSpecLen == 0) {
  +    if (p_base == null && uriSpecLen == 0) {
         throw new MalformedURIException(
                     "Cannot initialize URI with empty parameters.");
       }
  @@ -480,13 +499,18 @@
           (colonIdx > queryIdx && queryIdx != -1) ||
           (colonIdx > fragmentIdx && fragmentIdx != -1)) {
         // A standalone base is a valid URI according to spec
  -      if (p_base == null && fragmentIdx != 0 ) {
  +      if (colonIdx == 0 || (p_base == null && fragmentIdx != 0)) {
           throw new MalformedURIException("No scheme found in URI.");
         }
       }
       else {
         initializeScheme(uriSpec);
         index = m_scheme.length()+1;
  +      
  +      // Neither 'scheme:' or 'scheme:#fragment' are valid URIs.
  +      if (colonIdx == uriSpecLen - 1 || uriSpec.charAt(colonIdx+1) == '#') {
  +             throw new MalformedURIException("Scheme specific part cannot be 
empty.");       
  +      }
       }
   
       // two slashes means generic URI syntax, so we get the authority
  @@ -508,7 +532,7 @@
         // if we found authority, parse it out, otherwise we set the
         // host to empty string
         if (index > startPos) {
  -        initializeAuthority(uriSpec, startPos, index);
  +        initializeAuthority(uriSpec.substring(startPos, index));
         }
         else {
           m_host = "";
  @@ -573,16 +597,19 @@
   
         // if we get to this point, we need to resolve relative path
         // RFC 2396 5.2 #6
  -      String path = new String();
  +      String path = "";
         String basePath = p_base.getPath();
   
         // 6a - get all but the last segment of the base URI path
  -      if (basePath != null) {
  +      if (basePath != null && basePath.length() > 0) {
           int lastSlash = basePath.lastIndexOf('/');
           if (lastSlash != -1) {
             path = basePath.substring(0, lastSlash+1);
           }
         }
  +      else if (m_path.length() > 0) {
  +             path = "/";
  +      }
   
         // 6b - append the relative URI path
         path = path.concat(m_path);
  @@ -670,17 +697,16 @@
     * URI from a URI string spec.
     *
     * @param p_uriSpec the URI specification (cannot be null)
  -  * @param p_nStartIndex the index to begin scanning from
  -  * @param p_nEndIndex the index to end scanning at
     *
     * @exception MalformedURIException if p_uriSpec violates syntax rules
     */
  -  private void initializeAuthority(String p_uriSpec, int p_nStartIndex, int 
p_nEndIndex)
  +  private void initializeAuthority(String p_uriSpec)
                    throws MalformedURIException {
       
  -     int index = p_nStartIndex;
  -    int start = p_nStartIndex;
  -    int end = p_nEndIndex;
  +    int index = 0;
  +    int start = 0;
  +    int end = p_uriSpec.length();
  +
       char testChar = '\0';
       String userinfo = null;
   
  @@ -697,21 +723,34 @@
         index++;
       }
   
  -    // host is everything up to ':'
  +    // host is everything up to last ':', or up to 
  +    // and including ']' if followed by ':'.
       String host = null;
       start = index;
  -    while (index < end) {
  -      testChar = p_uriSpec.charAt(index);
  -      if (testChar == ':') {
  -        break;
  +    boolean hasPort = false;
  +    if (index < end) {
  +      if (p_uriSpec.charAt(start) == '[') {
  +             int bracketIndex = p_uriSpec.indexOf(']', start);
  +             index = (bracketIndex != -1) ? bracketIndex : end;
  +             if (index+1 < end && p_uriSpec.charAt(index+1) == ':') {
  +               ++index;
  +               hasPort = true;
  +             }
  +             else {
  +               index = end;
  +             }
  +      }
  +      else {
  +             int colonIndex = p_uriSpec.lastIndexOf(':', end);
  +             index = (colonIndex != -1) ? colonIndex : end;
  +             hasPort = (index != end);
         }
  -      index++;
       }
       host = p_uriSpec.substring(start, index);
       int port = -1;
       if (host.length() > 0) {
         // port
  -      if (testChar == ':') {
  +      if (hasPort) {
           index++;
           start = index;
           while (index < end) {
  @@ -761,25 +800,69 @@
       char testChar = '\0';
   
       // path - everything up to query string or fragment
  -    while (index < end) {
  -      testChar = p_uriSpec.charAt(index);
  -      if (testChar == '?' || testChar == '#') {
  -        break;
  -      }
  -      // check for valid escape sequence
  -      if (testChar == '%') {
  -         if (index+2 >= end ||
  -            !isHex(p_uriSpec.charAt(index+1)) ||
  -            !isHex(p_uriSpec.charAt(index+2))) {
  -          throw new MalformedURIException(
  -                "Path contains invalid escape sequence!");
  -         }
  -      }
  -      else if (!isURICharacter(testChar)) {
  -        throw new MalformedURIException(
  -                  "Path contains invalid character: " + testChar);
  -      }
  -      index++;
  +    if (start < end) {
  +     // RFC 2732 only allows '[' and ']' to appear in the opaque part.
  +     if (getScheme() == null || p_uriSpec.charAt(start) == '/') {
  +     
  +            // Scan path.
  +            // abs_path = "/"  path_segments
  +            // rel_path = rel_segment [ abs_path ]
  +            while (index < end) {
  +                testChar = p_uriSpec.charAt(index);
  +            
  +                // check for valid escape sequence
  +                if (testChar == '%') {
  +                    if (index+2 >= end ||
  +                    !isHex(p_uriSpec.charAt(index+1)) ||
  +                    !isHex(p_uriSpec.charAt(index+2))) {
  +                        throw new MalformedURIException(
  +                            "Path contains invalid escape sequence!");
  +                    }
  +                }
  +                // Path segments cannot contain '[' or ']' since pchar
  +                // production was not changed by RFC 2732.
  +                else if (!isPathCharacter(testChar)) {
  +                         if (testChar == '?' || testChar == '#') {
  +                             break;
  +                         }
  +                    throw new MalformedURIException(
  +                        "Path contains invalid character: " + testChar);
  +                }
  +                ++index;
  +            }
  +        }
  +        else {
  +            
  +            // Scan opaque part.
  +            // opaque_part = uric_no_slash *uric
  +            while (index < end) {
  +                testChar = p_uriSpec.charAt(index);
  +            
  +                if (testChar == '?' || testChar == '#') {
  +                    break;
  +                     }
  +                
  +                // check for valid escape sequence
  +                if (testChar == '%') {
  +                    if (index+2 >= end ||
  +                    !isHex(p_uriSpec.charAt(index+1)) ||
  +                    !isHex(p_uriSpec.charAt(index+2))) {
  +                        throw new MalformedURIException(
  +                            "Opaque part contains invalid escape sequence!");
  +                    }
  +                }
  +                // If the scheme specific part is opaque, it can contain '['
  +                // and ']'. uric_no_slash wasn't modified by RFC 2732, which
  +                // I've interpreted as an error in the spec, since the 
  +                // production should be equivalent to (uric - '/'), and uric
  +                // contains '[' and ']'. - mrglavas
  +                else if (!isURICharacter(testChar)) {
  +                    throw new MalformedURIException(
  +                        "Opaque part contains invalid character: " + testChar);
  +                }
  +                ++index;
  +            }
  +        }
       }
       m_path = p_uriSpec.substring(start, index);
   
  @@ -802,7 +885,7 @@
           }
           else if (!isURICharacter(testChar)) {
             throw new MalformedURIException(
  -                "Query string contains invalid character:" + testChar);
  +                "Query string contains invalid character: " + testChar);
           }
           index++;
         }
  @@ -826,7 +909,7 @@
           }
           else if (!isURICharacter(testChar)) {
             throw new MalformedURIException(
  -                "Fragment contains invalid character:"+testChar);
  +                "Fragment contains invalid character: "+testChar);
           }
           index++;
         }
  @@ -1051,7 +1134,7 @@
     *                                  address or DNS hostname.
     */
     public void setHost(String p_host) throws MalformedURIException {
  -    if (p_host == null || p_host.trim().length() == 0) {
  +    if (p_host == null || p_host.length() == 0) {
         m_host = p_host;
         m_userinfo = null;
         m_port = -1;
  @@ -1319,25 +1402,30 @@
   
    /**
     * Determine whether a string is syntactically capable of representing
  -  * a valid IPv4 address or the domain name of a network host. A valid
  -  * IPv4 address consists of four decimal digit groups separated by a
  -  * '.'. A hostname consists of domain labels (each of which must
  -  * begin and end with an alphanumeric but may contain '-') separated
  -  & by a '.'. See RFC 2396 Section 3.2.2.
  +  * a valid IPv4 address, IPv6 reference or the domain name of a network host. 
  +  * A valid IPv4 address consists of four decimal digit groups separated by a
  +  * '.'. Each group must consist of one to three digits. See RFC 2732 Section 3,
  +  * and RFC 2373 Appendix B, for the definition of IPv6 references. A hostname 
  +  * consists of domain labels (each of which must begin and end with an 
alphanumeric 
  +  * but may contain '-') separated & by a '.'. See RFC 2396 Section 3.2.2.
     *
  -  * @return true if the string is a syntactically valid IPv4 address
  -  *              or hostname
  +  * @return true if the string is a syntactically valid IPv4 address, 
  +  * IPv6 reference or hostname
     */
  -  public static boolean isWellFormedAddress(String p_address) {
  -    if (p_address == null) {
  +  public static boolean isWellFormedAddress(String address) {
  +    if (address == null) {
         return false;
       }
   
  -    String address = p_address.trim();
       int addrLength = address.length();
  -    if (addrLength == 0 || addrLength > 255) {
  +    if (addrLength == 0) {
         return false;
       }
  +    
  +    // Check if the host is a valid IPv6reference.
  +    if (address.startsWith("[")) {
  +      return isWellFormedIPv6Reference(address);
  +    }
   
       if (address.startsWith(".") || address.startsWith("-")) {
         return false;
  @@ -1351,52 +1439,202 @@
         index = address.substring(0, index).lastIndexOf('.');
       }
   
  -    if (index+1 < addrLength && isDigit(p_address.charAt(index+1))) {
  +    if (index+1 < addrLength && isDigit(address.charAt(index+1))) {
  +      return isWellFormedIPv4Address(address);
  +    }
  +    else {
  +      // domain labels can contain alphanumerics and '-"
  +      // but must start and end with an alphanumeric
         char testChar;
  -      int numDots = 0;
   
  -      // make sure that 1) we see only digits and dot separators, 2) that
  -      // any dot separator is preceded and followed by a digit and
  -      // 3) that we find 3 dots
         for (int i = 0; i < addrLength; i++) {
           testChar = address.charAt(i);
           if (testChar == '.') {
  -          if (!isDigit(address.charAt(i-1)) ||
  -              (i+1 < addrLength && !isDigit(address.charAt(i+1)))) {
  +          if (!isAlphanum(address.charAt(i-1))) {
  +            return false;
  +          }
  +          if (i+1 < addrLength && !isAlphanum(address.charAt(i+1))) {
               return false;
             }
  -          numDots++;
           }
  -        else if (!isDigit(testChar)) {
  +        else if (!isAlphanum(testChar) && testChar != '-') {
             return false;
           }
         }
  -      if (numDots != 3) {
  -        return false;
  -      }
       }
  -    else {
  -      // domain labels can contain alphanumerics and '-"
  -      // but must start and end with an alphanumeric
  +    return true;
  +  }
  +  
  +  /**
  +   * <p>Determines whether a string is an IPv4 address 
  +   * as defined by RFC 2373.</p>
  +   *
  +   * <p>IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT</p>
  +   *
  +   * @return true if the string is a syntactically valid IPv4 address
  +   */
  +  public static boolean isWellFormedIPv4Address(String address) {
  +      
  +      int addrLength = address.length();
         char testChar;
  +      int numDots = 0;
  +      int numDigits = 0;
   
  +      // make sure that 1) we see only digits and dot separators, 2) that
  +      // any dot separator is preceded and followed by a digit and
  +      // 3) that we find 3 dots
  +      //
  +      // RFC 2732 amended RFC 2396 by replacing the definition 
  +      // of IPv4address with the one defined by RFC 2373. - mrglavas
  +      //
  +      // IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
  +      //
  +      // One to three digits must be in each segment.
         for (int i = 0; i < addrLength; i++) {
           testChar = address.charAt(i);
           if (testChar == '.') {
  -          if (!isAlphanum(address.charAt(i-1))) {
  -            return false;
  -          }
  -          if (i+1 < addrLength && !isAlphanum(address.charAt(i+1))) {
  +          if ((i > 0 && !isDigit(address.charAt(i-1))) || 
  +              (i+1 < addrLength && !isDigit(address.charAt(i+1)))) {
               return false;
             }
  +          numDigits = 0;
  +          numDots++;
           }
  -        else if (!isAlphanum(testChar) && testChar != '-') {
  +        else if (!isDigit(testChar)) {
  +          return false;
  +        }
  +        else if (++numDigits > 3) {
             return false;
           }
         }
  -    }
  -    return true;
  +      return (numDots == 3);
  +  }
  +  
  +  /**
  +   * <p>Determines whether a string is an IPv6 reference.</p>
  +   *
  +   * <p>IPv6reference = "[" IPv6address "]" <br>
  +   * IPv6address   = hexpart [ ":" IPv4address ] <br>
  +   * IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT <br>
  +   * hexpart       = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] <br>
  +   * hexseq        = hex4 *( ":" hex4) <br>
  +   * hex4          = 1*4HEXDIG</p>
  +   *
  +   * @return true if the string is a syntactically valid IPv6 reference
  +   */
  +  public static boolean isWellFormedIPv6Reference(String address) {
  +      int addrLength = address.length();
  +      int start = 1;
  +      int end = addrLength-1;
  +      int index = start;
  +      
  +      // Check if string is a potential match for IPv6reference.
  +      if (!(addrLength > 2 && address.charAt(0) == '[' 
  +          && address.charAt(end) == ']')) {
  +          return false;
  +      }
  +      
  +      // The production hexpart can be rewritten as:
  +      // hexpart = hexseq | [hexseq] "::" [hexseq]
  +      // which means as long as we see one of the following
  +      // three groups, then we have a match.
  +      
  +      // 1. Scan hex sequence before possible '::'.
  +      index = scanHexSequence(address, index, end);
  +      if (index == -1) {
  +          return false;
  +      }
  +      else if (index == end) {
  +          return true;
  +      }
  +      
  +      // 2. Skip '::' if present.
  +      if (index + 1 < end && address.charAt(index) == ':') {
  +          if (address.charAt(index+1) == ':') {
  +              index += 2;
  +              if (index == end) {
  +                 return true;
  +              }
  +          }
  +          // If the second character wasn't ':', the remainder of the
  +          // string must match IPv4Address. IPv6Address cannot 
  +          // start with [":" IPv4Address].
  +          else {
  +              return (index > start) && 
  +                  isWellFormedIPv4Address(address.substring(index+1, end));
  +          }
  +      }
  +      
  +      // 3. Scan hex sequence after '::'.
  +      index = scanHexSequence(address, index, end);
  +      if (index == -1) {
  +          return false;
  +      }
  +      else if (index == end) {
  +               return true;
  +      }
  +      
  +      // If we've gotten this far then the string is a valid
  +      // IPv6 reference only if it contained a valid hexpart,
  +      // and it has an IPv4 address.
  +      //
  +      // REVISIT: The example given for an IPv6 reference
  +      // http://[::192.9.5.5]/ipng in RFC 2732 is an error, or
  +      // the BNF for IPv6address is incorrect. In order to be
  +      // valid for the grammar defined in RFC 2373, if the hexpart
  +      // is only '::', and if the address contains an IPv4 address,
  +      // '::' must be followed by another ':'. Going with the BNF
  +      // from RFC 2373 for now. - mrglavas
  +      if (index > start && index+1 < end && address.charAt(index) == ':') {
  +          return isWellFormedIPv4Address(address.substring(index+1, end));
  +      }
  +      
  +      return false;
     }
  +  
  +  /**
  +   * Helper method for isWellFormedIPv6Reference which scans hex sequeunces.
  +   * It returns the index of the next character to scan, or -1 if the
  +   * string region cannot match a valid IPv6 address. 
  +   *
  +   * @param sequence the string to be scanned
  +   * @param index the beginning index (inclusive)
  +   * @param end the ending index (exclusive)
  +   *
  +   * @return the index of the next character to scan, or -1 if the
  +   * string region cannot match a valid IPv6 address
  +   */
  +  private static int scanHexSequence (String sequence, int index, int end) {
  +     
  +      char testChar;
  +      int numDigits = 0;
  +      int start = index;
  +      
  +      // Trying to match the following productions:
  +      // hexseq = hex4 *( ":" hex4)
  +      // hex4   = 1*4HEXDIG
  +      for (; index < end; ++index) {
  +             testChar = sequence.charAt(index);
  +             if (testChar == ':') {
  +                 if (numDigits == 0 || ((index+1 < end) && sequence.charAt(index+1) 
== ':')) {
  +                     return index;
  +                 }
  +                 numDigits = 0;
  +        }
  +        // This might be invalid or an IPv4address. If it's potentially an 
IPv4address,
  +        // backup to the ':' before the first hex digit in this group.
  +        else if (!isHex(testChar)) {
  +            int back = index - numDigits - 1;
  +            return (testChar == '.' && numDigits < 4 && numDigits > 0
  +                    && back >= start && sequence.charAt(back) == ':') ? back : -1;
  +        }
  +        // There can be at most 4 hex digits per group.
  +        else if (++numDigits > 4) {
  +            return -1;
  +        }
  +      }
  +      return (numDigits > 0) ? end : -1;
  +  } 
   
   
    /**
  @@ -1482,6 +1720,16 @@
     private static boolean isUserinfoCharacter (char p_char) {
         return (p_char <= 'z' && (fgLookupTable[p_char] & MASK_USERINFO_CHARACTER) != 
0);
     }
  +  
  + /**
  +  * Determine whether a char is a path character.
  +  * 
  +  * @return true if the char is a path character, false otherwise
  +  */
  +  private static boolean isPathCharacter (char p_char) {
  +      return (p_char <= '~' && (fgLookupTable[p_char] & MASK_PATH_CHARACTER) != 0);
  +  }
  +
   
    /**
     * Determine whether a given string contains only URI characters (also


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/java/src/org/apache/xerces/util URI.java

Reply via email to