Author: jnioche
Date: Tue Nov 10 11:16:49 2015
New Revision: 1713615

URL: http://svn.apache.org/viewvc?rev=1713615&view=rev
Log:
NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved 
chars

Modified:
    nutch/trunk/CHANGES.txt
    
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 10 11:16:49 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved chars (markus, snagel)
+
 * NUTCH-2159 Ensure that all WebApp files are copied into generated artifacts for 1.X Webapp (lewismc)
 
 * NUTCH-2154 Nutch REST API (DB) suffering NullPointerException (Aron Ahmadia, Sujen Shah via mattmann)

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Tue Nov 10 11:16:49 2015
@@ -23,6 +23,9 @@ import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Locale;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configured;
@@ -37,6 +40,9 @@ import org.slf4j.LoggerFactory;
  * <ul>
  * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
  * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
+ * <li>normalize <a href=
+ * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
+ * percent-encoding</a> in URL paths</li>
  * </ul>
  */
 public class BasicURLNormalizer extends Configured implements URLNormalizer {
@@ -50,8 +56,41 @@ public class BasicURLNormalizer extends
   private final static Pattern hasNormalizablePathPattern = Pattern
       .compile("/[./]|[.]/");
 
+  /**
+   * Nutch 1098 - finds URL encoded parts of the URL
+   */
+  private final static Pattern unescapeRulePattern = Pattern
+      .compile("%([0-9A-Fa-f]{2})");
+  
+  // charset used for encoding URLs before escaping
+  private final static Charset utf8 = Charset.forName("UTF-8");
+
+  /** look-up table for characters which should not be escaped in URL paths */
+  private final static boolean[] unescapedCharacters = new boolean[128];
+  static {
+    for (int c = 0; c < 128; c++) {
+      /* https://tools.ietf.org/html/rfc3986#section-2.3
+       * For consistency, percent-encoded octets in the ranges of ALPHA
+       * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+       * underscore (%5F), or tilde (%7E) should not be created by URI
+       * producers and, when found in a URI, should be decoded to their
+       * corresponding unreserved characters by URI normalizers.
+       */
+      if ((0x41 <= c && c <= 0x5A)
+        || (0x61 <= c && c <= 0x7A)
+        || (0x30 <= c && c <= 0x39)
+        || c == 0x2D || c == 0x2E
+        || c == 0x5F || c == 0x7E) {
+        unescapedCharacters[c] = true;
+      } else {
+        unescapedCharacters[c] = false;
+      }
+    }
+  }
+
   public String normalize(String urlString, String scope)
       throws MalformedURLException {
+    
     if ("".equals(urlString)) // permit empty
       return urlString;
 
@@ -100,7 +139,14 @@ public class BasicURLNormalizer extends
         changed = true;
         file = file2;
       }
+    }
 
+    // properly encode characters in path/file using percent-encoding
+    String file2 = unescapePath(file);
+    file2 = escapePath(file2);
+    if (!file.equals(file2)) {
+      changed = true;
+      file = file2;
     }
 
     if (changed)
@@ -141,6 +187,84 @@ public class BasicURLNormalizer extends
 
     return file;
   }
+  
+  /**
+   * Remove % encoding from path segment in URL for characters which should be
+   * unescaped according to <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+   */
+  private String unescapePath(String path) {
+    StringBuilder sb = new StringBuilder();
+    
+    Matcher matcher = unescapeRulePattern.matcher(path);
+    
+    int end = -1;
+    int letter;
+
+    // Traverse over all encoded groups
+    while (matcher.find()) {
+      // Append everything up to this group
+      sb.append(path.substring(end + 1, matcher.start()));
+      
+      // Get the integer representation of this hexadecimal encoded character
+      letter = Integer.valueOf(matcher.group().substring(1), 16);
+
+      if (letter < 128 && unescapedCharacters[letter]) {
+        // character should be unescaped in URLs
+        sb.append(new Character((char)letter));
+      } else {
+        // Append the encoded character as uppercase
+        sb.append(matcher.group().toUpperCase(Locale.ROOT));
+      }
+      
+      end = matcher.start() + 2;
+    }
+    
+    letter = path.length();
+    
+    // Append the rest if there's anything
+    if (end <= letter - 1) {
+      sb.append(path.substring(end + 1, letter));
+    }
+
+    // Ok!
+    return sb.toString();
+  }
+
+  /**
+   * Convert path segment of URL from Unicode to UTF-8 and escape all
+   * characters which should be escaped according to <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+   */
+  private String escapePath(String path) {
+    StringBuilder sb = new StringBuilder(path.length());
+
+    // Traverse over all bytes in this URL
+    for (byte b: path.getBytes(utf8)) {
+      // Is this a control character?
+      if (b < 33 || b == 91 || b == 93) {
+        // Start escape sequence 
+        sb.append('%');
+        
+        // Get this byte's hexadecimal representation 
+        String hex = Integer.toHexString(b & 0xFF).toUpperCase();
+        
+        // Do we need to prepend a zero?
+        if (hex.length() % 2 != 0 ) {
+          sb.append('0');
+          sb.append(hex);
+        } else {
+          // No, append this hexadecimal representation
+          sb.append(hex);
+        }
+      } else {
+        // No, just append this character as-is
+        sb.append((char)b);
+      }
+    }
+    
+    return sb.toString();
+  }
 
   public static void main(String args[]) throws IOException {
     BasicURLNormalizer normalizer = new BasicURLNormalizer();

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Tue Nov 10 11:16:49 2015
@@ -34,6 +34,72 @@ public class TestBasicURLNormalizer {
     conf = NutchConfiguration.create();
     normalizer.setConf(conf);
   }
+  
+  @Test
+  public void testNUTCH1098() throws Exception {
+    // check that % encoding is normalized
+    normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+
+    // check that % encoding works correctly at end of URL
+    normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html");
+
+    // check that % decoder do not overlap strings
+    normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html");
+    
+    // check that % decoder leaves high bit chars alone
+    normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0");
+
+    // check that % decoder leaves control chars alone
+    normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A");
+
+    // check that % decoder converts to upper case letters
+    normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0");
+
+    // check that % decoder leaves encoded spaces alone
+    normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html");
+
+    // check that spaces are encoded into %20
+    normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html");
+
+    // check that encoded # are not decoded
+    normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz");
+
+    // check that encoded / are not decoded
+    normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz");
+
+    // check that control chars are encoded
+    normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!");
+
+    // check that control chars are always encoded into 2 digits
+    normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
+
+    // check encoding of spanish chars
+    normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
+  }
+  
+  @Test
+  public void testNUTCH2064() throws Exception {
+    // Ampersand and colon and other punctuation characters are not to be unescaped
+    normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10");
+    normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb",
+        "http://x.com/show?http%3A%2F%2Fx.com%2Fb");
+    normalizeTest("http://google.com/search?q=c%2B%2B",
+        "http://google.com/search?q=c%2B%2B");
+    // do also not touch the query part which is application/x-www-form-urlencoded
+    normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b");
+    // and keep Internationalized domain names
+    // http://bücher.de/ may be http://xn--bcher-kva.de/
+    // but definitely not http://b%C3%BCcher.de/
+    normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/");
+    // test whether percent-encoding works together with other normalizations
+    normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html");
+    // [ and ] need escaping as well
+    normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1");
+    // boundary test for first character outside the ASCII range (U+0080)
+    normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80");
+    normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80");
+  }
 
   @Test
   public void testNormalizer() throws Exception {


Reply via email to