parsers ConvertToURI.java

neeraj Thu, 26 Sep 2002 01:39:46 -0700

neeraj      2002/09/26 02:04:57

  Added:       java/src/javax/xml/parsers ConvertToURI.java
  Log:
  fixing bug http://nagoya.apache.org/bugzilla/show_bug.cgi?id=12257
  Added a new class which escapes characters that cannot be represented literally in a
URI path: the US-ASCII control characters [00-1F and 7F hexadecimal] and any other
US-ASCII character that is disallowed, such as space ' '.
   As of right now this class does not handle non-US-ASCII characters, but it may
be extended in the future to handle those cases.
  
  Revision  Changes    Path
  1.1                  xml-xerces/java/src/javax/xml/parsers/ConvertToURI.java
  
  Index: ConvertToURI.java
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   *
   * Copyright (c) 2001, 2002 The Apache Software Foundation.
   * All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.apache.org.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  package javax.xml.parsers;
  
  import java.io.File;
  /**
   * Converts a file-system path into the escaped path component of a URI.
   *
   * <p>The path is first given forward slashes and a leading slash, then
   * normalized (empty, "." and ".." segments are removed where possible), and
   * finally every character that RFC 2396 does not permit in a URI path is
   * replaced by its "%XX" escape. This covers the US-ASCII control characters
   * (00-1F and 7F hexadecimal, including NUL) and disallowed US-ASCII characters
   * such as space ' ' and '%'.
   *
   * <p>Non-US-ASCII characters are only partially handled: space and control
   * characters outside US-ASCII are escaped as their UTF-8 bytes, every other
   * non-US-ASCII character is copied through unchanged. Most of the code in this
   * class has been extracted from java.net.URI.
   *
   * @author Neeraj Bajaj
   */
  class ConvertToURI {

      // RFC2396 precisely specifies which characters in the US-ASCII charset are
      // permissible in the various components of a URI reference.  We here
      // define a set of mask pairs to aid in enforcing these restrictions.  Each
      // mask pair consists of two longs, a low mask and a high mask.  Taken
      // together they represent a 128-bit mask, where bit i is set iff the
      // character with value i is permitted.
      //
      // This approach is more efficient than sequentially searching arrays of
      // permitted characters.  It could be made still more efficient by
      // precompiling the mask information so that a character's presence in a
      // given mask could be determined by a single table lookup.

      // Compute the low-order mask (characters 0..63) for the characters in the
      // given string; characters outside that range are ignored.
      private static long lowMask(String chars) {
          int n = chars.length();
          long m = 0;
          for (int i = 0; i < n; i++) {
              char c = chars.charAt(i);
              if (c < 64) {
                  m |= (1L << c);
              }
          }
          return m;
      }

      // Compute the high-order mask (characters 64..127) for the characters in
      // the given string; characters outside that range are ignored.
      private static long highMask(String chars) {
          int n = chars.length();
          long m = 0;
          for (int i = 0; i < n; i++) {
              char c = chars.charAt(i);
              if ((c >= 64) && (c < 128)) {
                  m |= (1L << (c - 64));
              }
          }
          return m;
      }

      // Compute a low-order mask for the characters
      // between first and last, inclusive
      private static long lowMask(char first, char last) {
          long m = 0;
          int f = Math.max(Math.min(first, 63), 0);
          int l = Math.max(Math.min(last, 63), 0);
          for (int i = f; i <= l; i++) {
              m |= 1L << i;
          }
          return m;
      }

      // Compute a high-order mask for the characters
      // between first and last, inclusive
      private static long highMask(char first, char last) {
          long m = 0;
          int f = Math.max(Math.min(first, 127), 64) - 64;
          int l = Math.max(Math.min(last, 127), 64) - 64;
          for (int i = f; i <= l; i++) {
              m |= 1L << i;
          }
          return m;
      }

      // Tell whether the given character is permitted by the given mask pair
      private static boolean match(char c, long lowMask, long highMask) {
          if (c == 0) {
              // Bit 0 of the low mask is not a character slot: it is the
              // L_ESCAPED flag.  NUL must therefore never match, otherwise it
              // would be emitted unescaped by every mask that allows escapes.
              return false;
          }
          if (c < 64) {
              return ((1L << c) & lowMask) != 0;
          }
          if (c < 128) {
              return ((1L << (c - 64)) & highMask) != 0;
          }
          return false;
      }

      // Character-class masks, in reverse order from RFC2396 because
      // initializers for static fields cannot make forward references.

      // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
      //            "8" | "9"
      private static final long L_DIGIT = lowMask('0', '9');
      private static final long H_DIGIT = 0L;

      // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
      //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
      //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
      private static final long L_UPALPHA = 0L;
      private static final long H_UPALPHA = highMask('A', 'Z');

      // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
      //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
      //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
      private static final long L_LOWALPHA = 0L;
      private static final long H_LOWALPHA = highMask('a', 'z');

      // alpha         = lowalpha | upalpha
      private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
      private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

      // alphanum      = alpha | digit
      private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
      private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

      // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      //                         "a" | "b" | "c" | "d" | "e" | "f"
      private static final long L_HEX = L_DIGIT;
      private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

      // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      //                 "(" | ")"
      private static final long L_MARK = lowMask("-_.!~*'()");
      private static final long H_MARK = highMask("-_.!~*'()");

      // unreserved    = alphanum | mark
      private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
      private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

      // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      //                 "$" | "," | "[" | "]"
      // Added per RFC2732: "[", "]"
      private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
      private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

      // The zero'th bit is used as a flag indicating that escape pairs and
      // non-US-ASCII characters are allowed; the quote method below reads it.
      // It never stands for the NUL character itself (see match).
      private static final long L_ESCAPED = 1L;
      private static final long H_ESCAPED = 0L;

      // uric          = reserved | unreserved | escaped
      private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
      private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

      // pchar         = unreserved | escaped |
      //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
      private static final long L_PCHAR
          = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
      private static final long H_PCHAR
          = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

      // All valid path characters
      private static final long L_PATH = L_PCHAR | lowMask(";/");
      private static final long H_PATH = H_PCHAR | highMask(";/");

      // Dash, for use in domainlabel and toplabel
      private static final long L_DASH = lowMask("-");
      private static final long H_DASH = highMask("-");

      // Dot, for use in hostnames
      private static final long L_DOT = lowMask(".");
      private static final long H_DOT = highMask(".");

      // userinfo      = *( unreserved | escaped |
      //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
      private static final long L_USERINFO
          = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
      private static final long H_USERINFO
          = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

      // reg_name      = 1*( unreserved | escaped | "$" | "," |
      //                     ";" | ":" | "@" | "&" | "=" | "+" )
      private static final long L_REG_NAME
          = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
      private static final long H_REG_NAME
          = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

      // All valid characters for server-based authorities
      private static final long L_SERVER
          = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
      private static final long H_SERVER
          = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

      // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
      private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
      private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

      // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
      //                 "&" | "=" | "+" | "$" | ","
      private static final long L_URIC_NO_SLASH
          = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
      private static final long H_URIC_NO_SLASH
          = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");


      /**
       * Turns a file-system path into an escaped, normalized URI path.
       *
       * <p>The platform separator is replaced by '/', a leading '/' is added
       * if missing, the path is normalized, and every character not allowed
       * in a URI path is percent-escaped. Allowed characters are:
       *
       * <pre>
       *   unreserved  = alphanum | mark
       *   mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
       *   plus          ":" | "@" | "&" | "=" | "+" | "$" | "," | ";" | "/"
       * </pre>
       *
       * <p>A '%' in the input is itself escaped (as "%25"), so the input is
       * always treated as a raw, unescaped path.
       *
       * @param path the path to be escaped; must not be null
       * @return the escaped path, always starting with '/'
       * @throws NullPointerException if path is null
       */
      public static String getEscapedURI(String path) {
          return quote(normalize(slashify(path)), L_PATH, H_PATH);
      }


      // -- Escaping and encoding --

      private static final char[] hexDigits = {
          '0', '1', '2', '3', '4', '5', '6', '7',
          '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
      };

      // Append the "%XX" escape for the given byte.  The masking makes this
      // correct for negative byte values (0x80..0xFF) as well.
      private static void appendEscape(StringBuffer sb, byte b) {
          sb.append('%');
          sb.append(hexDigits[(b >> 4) & 0x0f]);
          sb.append(hexDigits[(b >> 0) & 0x0f]);
      }

      // Append the escapes for the UTF-8 encoding of the given character.
      // Only one-, two- and three-byte sequences are produced; that is
      // sufficient because quote never passes a surrogate to this method.
      private static void appendEncoded(StringBuffer sb, char c) {
          if (c < 0x80) {
              appendEscape(sb, (byte) c);
          } else if (c < 0x800) {
              appendEscape(sb, (byte) (0xC0 | (c >> 6)));
              appendEscape(sb, (byte) (0x80 | (c & 0x3F)));
          } else {
              appendEscape(sb, (byte) (0xE0 | (c >> 12)));
              appendEscape(sb, (byte) (0x80 | ((c >> 6) & 0x3F)));
              appendEscape(sb, (byte) (0x80 | (c & 0x3F)));
          }
      }

      // Replace the platform separator by '/' and make the path absolute by
      // prepending a '/' when it does not already start with one.
      private static String slashify(String path) {
          String p = path;
          if (File.separatorChar != '/') {
              p = p.replace(File.separatorChar, '/');
          }
          if (!p.startsWith("/")) {
              p = "/" + p;
          }
          return p;
      }//slashify

      // Quote any characters in s that are not permitted
      // by the given mask pair.
      //
      // US-ASCII characters are escaped when the mask pair does not permit
      // them.  When the mask pair allows escapes (L_ESCAPED bit set),
      // non-US-ASCII space and control characters are escaped as their UTF-8
      // bytes; all other non-US-ASCII characters are copied unchanged.
      // The input string itself is returned when nothing needs quoting.
      //
      private static String quote(String s, long lowMask, long highMask) {
          int n = s.length();
          StringBuffer sb = null;     // Allocated lazily, on the first escape

          boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);

          for (int i = 0; i < n; i++) {
              char c = s.charAt(i);

              boolean escape;
              if (c < 0x80) {
                  escape = !match(c, lowMask, highMask);
              } else {
                  escape = allowNonASCII
                      && (Character.isSpaceChar(c) || Character.isISOControl(c));
              }

              if (escape) {
                  if (sb == null) {
                      // First offending character: copy the clean prefix
                      sb = new StringBuffer();
                      sb.append(s.substring(0, i));
                  }
                  appendEncoded(sb, c);
              } else if (sb != null) {
                  sb.append(c);
              }
          }
          return (sb == null) ? s : sb.toString();
      }//quote


      // Return the value of the given hexadecimal digit, or -1 if the
      // character is not a hexadecimal digit.
      private static int decode(char c) {
          if ((c >= '0') && (c <= '9')) {
              return c - '0';
          }
          if ((c >= 'a') && (c <= 'f')) {
              return c - 'a' + 10;
          }
          if ((c >= 'A') && (c <= 'F')) {
              return c - 'A' + 10;
          }
          return -1;
      }


      // Combine two hexadecimal digits (high digit first) into one byte.
      private static byte decode(char c1, char c2) {
          return (byte) (((decode(c1) & 0xf) << 4)
                       | ((decode(c2) & 0xf) << 0));
      }



      // Split the given path into segments, replacing slashes with nulls and
      // filling in the given segment-index array.
      //
      // Preconditions:
      //   segs.length == Number of segments in path
      //
      // Postconditions:
      //   All slashes in path replaced by '\0'
      //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
      //
      private static void split(char[] path, int[] segs) {

          int end = path.length - 1;      // Index of last char in path
          int p = 0;                      // Index of next char in path
          int i = 0;                      // Index of current segment

          // Skip initial slashes
          while (p <= end) {
              if (path[p] != '/') {
                  break;
              }
              path[p] = '\0';
              p++;
          }

          while (p <= end) {

              // Note start of segment
              segs[i++] = p++;

              // Find beginning of next segment
              while (p <= end) {
                  if (path[p++] != '/') {
                      continue;
                  }
                  path[p - 1] = '\0';

                  // Skip redundant slashes
                  while (p <= end) {
                      if (path[p] != '/') {
                          break;
                      }
                      path[p++] = '\0';
                  }

                  break;
              }
          }

          if (i != segs.length) {
              throw new InternalError();    // ASSERT
          }

      }//split()


      // Normalize the given path string.  A normal path string has no empty
      // segments (i.e., occurrences of "//"), no segments equal to ".", and no
      // segments equal to ".." that are preceded by a segment not equal to "..".
      // In contrast to Unix-style pathname normalization, for URI paths we
      // always retain trailing slashes.
      //
      private static String normalize(String ps) {

          // Does this path need normalization?
          int ns = needsNormalization(ps);        // Number of segments
          if (ns < 0) {
              // Nope -- just return it
              return ps;
          }

          char[] path = ps.toCharArray();         // Path in char-array form

          // Split path into segments
          int[] segs = new int[ns];               // Segment-index array
          split(path, segs);

          // Remove dots
          removeDots(path, segs);

          // Join the remaining segments and return the result
          return new String(path, 0, join(path, segs));
      }


      // Join the segments in the given path according to the given segment-index
      // array, ignoring those segments whose index entries have been set to -1,
      // and inserting slashes as needed.  Return the length of the resulting
      // path.
      //
      // Preconditions:
      //   segs[i] == -1 implies segment i is to be ignored
      //   path computed by split, as above, with '\0' having replaced '/'
      //
      // Postconditions:
      //   path[0] .. path[return value - 1] == Resulting path
      //
      private static int join(char[] path, int[] segs) {
          int ns = segs.length;           // Number of segments
          int end = path.length - 1;      // Index of last char in path
          int p = 0;                      // Index of next path char to write

          if (path[p] == '\0') {
              // Restore initial slash for absolute paths
              path[p++] = '/';
          }

          for (int i = 0; i < ns; i++) {
              int q = segs[i];            // Current segment
              if (q == -1) {
                  // Ignore this segment
                  continue;
              }

              if (p == q) {
                  // We're already at this segment, so just skip to its end
                  while ((p <= end) && (path[p] != '\0')) {
                      p++;
                  }
                  if (p <= end) {
                      // Preserve trailing slash
                      path[p++] = '/';
                  }
              } else if (p < q) {
                  // Copy q down to p
                  while ((q <= end) && (path[q] != '\0')) {
                      path[p++] = path[q++];
                  }
                  if (q <= end) {
                      // Preserve trailing slash
                      path[p++] = '/';
                  }
              } else {
                  throw new InternalError(); // ASSERT false
              }
          }

          return p;
      }


      // Remove "." segments from the given path, and remove segment pairs
      // consisting of a non-".." segment followed by a ".." segment.
      // Segments are removed by setting their entry in segs to -1.
      //
      private static void removeDots(char[] path, int[] segs) {
          int ns = segs.length;
          int end = path.length - 1;

          for (int i = 0; i < ns; i++) {
              int dots = 0;               // Number of dots found (0, 1, or 2)

              // Find next occurrence of "." or ".."
              do {
                  int p = segs[i];
                  if (path[p] == '.') {
                      if (p == end) {
                          dots = 1;
                          break;
                      } else if (path[p + 1] == '\0') {
                          dots = 1;
                          break;
                      } else if ((path[p + 1] == '.')
                                 && ((p + 1 == end)
                                     || (path[p + 2] == '\0'))) {
                          dots = 2;
                          break;
                      }
                  }
                  i++;
              } while (i < ns);
              if ((i > ns) || (dots == 0)) {
                  break;
              }

              if (dots == 1) {
                  // Remove this occurrence of "."
                  segs[i] = -1;
              } else {
                  // If there is a preceding non-".." segment, remove both that
                  // segment and this occurrence of ".."; otherwise, leave this
                  // ".." segment as-is.
                  int j;
                  for (j = i - 1; j >= 0; j--) {
                      if (segs[j] != -1) {
                          break;
                      }
                  }
                  if (j >= 0) {
                      int q = segs[j];
                      if (!((path[q] == '.')
                            && (path[q + 1] == '.')
                            && (path[q + 2] == '\0'))) {
                          segs[i] = -1;
                          segs[j] = -1;
                      }
                  }
              }
          }
      }


      // Check the given path to see if it might need normalization.  A path
      // might need normalization if it contains duplicate slashes, a "."
      // segment, or a ".." segment.  Return -1 if no further normalization is
      // possible, otherwise return the number of segments found.
      //
      // This method takes a string argument rather than a char array so that
      // this test can be performed without invoking path.toCharArray().
      //
      private static int needsNormalization(String path) {

          boolean normal = true;
          int ns = 0;                     // Number of segments
          int end = path.length() - 1;    // Index of last char in path
          int p = 0;                      // Index of next char in path

          // Skip initial slashes
          while (p <= end) {
              if (path.charAt(p) != '/') {
                  break;
              }
              p++;
          }
          if (p > 1) {
              normal = false;
          }

          // Scan segments
          while (p <= end) {

              // Looking at "." or ".." ?
              if ((path.charAt(p) == '.')
                  && ((p == end)
                      || ((path.charAt(p + 1) == '/')
                          || ((path.charAt(p + 1) == '.')
                              && ((p + 1 == end)
                                  || (path.charAt(p + 2) == '/')))))) {
                  normal = false;
              }
              ns++;

              // Find beginning of next segment
              while (p <= end) {
                  if (path.charAt(p++) != '/') {
                      continue;
                  }

                  // Skip redundant slashes
                  while (p <= end) {
                      if (path.charAt(p) != '/') {
                          break;
                      }
                      normal = false;
                      p++;
                  }

                  break;
              }
          }

          return normal ? -1 : ns;
      }


  }//ConvertToURI


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/java/src/javax/xml/parsers ConvertToURI.java

Reply via email to