Author: jnioche
Date: Tue Nov 10 11:16:49 2015
New Revision: 1713615
URL: http://svn.apache.org/viewvc?rev=1713615&view=rev
Log:
NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved
chars
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 10 11:16:49 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved chars (markus, snagel)
+
* NUTCH-2159 Ensure that all WebApp files are copied into generated artifacts
for 1.X Webapp (lewismc)
* NUTCH-2154 Nutch REST API (DB) suffering NullPointerException (Aron Ahmadia,
Sujen Shah via mattmann)
Modified:
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
(original)
+++
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Tue Nov 10 11:16:49 2015
@@ -23,6 +23,9 @@ import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Locale;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configured;
@@ -37,6 +40,9 @@ import org.slf4j.LoggerFactory;
* <ul>
* <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
* <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
+ * <li>normalize <a href=
+ * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
+ * percent-encoding</a> in URL paths</li>
* </ul>
*/
public class BasicURLNormalizer extends Configured implements URLNormalizer {
@@ -50,8 +56,41 @@ public class BasicURLNormalizer extends
private final static Pattern hasNormalizablePathPattern = Pattern
.compile("/[./]|[.]/");
+ /**
+  * NUTCH-1098 - matches one percent-encoded octet (<code>%</code> followed by
+  * two hexadecimal digits, either case); group 1 holds the two digits.
+  */
+ private final static Pattern unescapeRulePattern = Pattern
+     .compile("%([0-9A-Fa-f]{2})");
+
+ /** Charset used to convert URL paths to bytes before escaping. */
+ private final static Charset utf8 = Charset.forName("UTF-8");
+
+ /**
+  * Look-up table indexed by ASCII code (0-127): <code>true</code> for the
+  * characters which should not be percent-encoded in URL paths.
+  */
+ private final static boolean[] unescapedCharacters = new boolean[128];
+ static {
+   /*
+    * https://tools.ietf.org/html/rfc3986#section-2.3
+    * For consistency, percent-encoded octets in the ranges of ALPHA
+    * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+    * underscore (%5F), or tilde (%7E) should not be created by URI
+    * producers and, when found in a URI, should be decoded to their
+    * corresponding unreserved characters by URI normalizers.
+    */
+   for (int c = 0; c < unescapedCharacters.length; c++) {
+     unescapedCharacters[c] = ('A' <= c && c <= 'Z')
+         || ('a' <= c && c <= 'z')
+         || ('0' <= c && c <= '9')
+         || c == '-' || c == '.'
+         || c == '_' || c == '~';
+   }
+ }
+
public String normalize(String urlString, String scope)
throws MalformedURLException {
+
if ("".equals(urlString)) // permit empty
return urlString;
@@ -100,7 +139,14 @@ public class BasicURLNormalizer extends
changed = true;
file = file2;
}
+ }
+ // properly encode characters in path/file using percent-encoding
+ String file2 = unescapePath(file);
+ file2 = escapePath(file2);
+ if (!file.equals(file2)) {
+ changed = true;
+ file = file2;
}
if (changed)
@@ -141,6 +187,84 @@ public class BasicURLNormalizer extends
return file;
}
+
+ /**
+  * Normalize the percent-encoded octets found in a URL path. Octets which
+  * encode an unreserved character according to <a
+  * href="https://tools.ietf.org/html/rfc3986#section-2.3">RFC3986</a>
+  * (letters, digits, <code>-</code>, <code>.</code>, <code>_</code>,
+  * <code>~</code>) are replaced by that character; every other octet stays
+  * encoded, with its hexadecimal digits converted to upper case.
+  *
+  * @param path
+  *          URL path, possibly containing <code>%XX</code> sequences
+  * @return the path with unreserved characters decoded and the remaining
+  *         escape sequences upper-cased
+  */
+ private String unescapePath(String path) {
+   StringBuilder sb = new StringBuilder(path.length());
+
+   Matcher matcher = unescapeRulePattern.matcher(path);
+
+   // index of the first character of path not yet copied to the output
+   int copied = 0;
+
+   // Traverse over all encoded groups
+   while (matcher.find()) {
+     // Append everything between the previous group and this one
+     sb.append(path, copied, matcher.start());
+
+     // Value of the octet given by the two hexadecimal digits
+     int octet = Integer.parseInt(matcher.group(1), 16);
+
+     if (octet < 128 && unescapedCharacters[octet]) {
+       // character should be unescaped in URLs
+       sb.append((char) octet);
+     } else {
+       // keep the escape sequence, but with upper-case hex digits
+       sb.append(matcher.group().toUpperCase(Locale.ROOT));
+     }
+
+     copied = matcher.end();
+   }
+
+   // Append whatever follows the last group (the whole path if none matched)
+   sb.append(path, copied, path.length());
+
+   return sb.toString();
+ }
+
+ /**
+  * Convert a URL path from Unicode to UTF-8 and percent-encode the bytes
+  * which must not appear literally: control characters and space (0x00-0x20),
+  * the square brackets <code>[</code> and <code>]</code>, and every byte of a
+  * non-ASCII character (0x80-0xFF). All other bytes are copied unchanged, so
+  * escape sequences already present in the path are left as they are.
+  *
+  * @param path
+  *          URL path to escape
+  * @return the path with the bytes listed above replaced by
+  *         <code>%XX</code> (two upper-case hexadecimal digits)
+  */
+ private String escapePath(String path) {
+   StringBuilder sb = new StringBuilder(path.length());
+
+   // Traverse over all bytes of the UTF-8 encoded path
+   for (byte b : path.getBytes(utf8)) {
+     // Java bytes are signed: "b < 33" is true for control characters and
+     // space, but also for all bytes >= 0x80 which are negative here
+     if (b < 33 || b == '[' || b == ']') {
+       int octet = b & 0xFF;
+
+       // Start escape sequence
+       sb.append('%');
+
+       // Always emit two hex digits: pad single-digit values with a zero
+       if (octet < 0x10) {
+         sb.append('0');
+       }
+       sb.append(Integer.toHexString(octet).toUpperCase(Locale.ROOT));
+     } else {
+       // Printable ASCII (0x21-0x7F except brackets): append as-is
+       sb.append((char) b);
+     }
+   }
+
+   return sb.toString();
+ }
public static void main(String args[]) throws IOException {
BasicURLNormalizer normalizer = new BasicURLNormalizer();
Modified:
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1713615&r1=1713614&r2=1713615&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
(original)
+++
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Tue Nov 10 11:16:49 2015
@@ -34,6 +34,72 @@ public class TestBasicURLNormalizer {
conf = NutchConfiguration.create();
normalizer.setConf(conf);
}
+
+ /**
+  * NUTCH-1098: percent-encoding in URL paths is normalized - unreserved
+  * characters are decoded, remaining escapes are upper-cased, and spaces,
+  * control characters and non-ASCII characters are encoded.
+  */
+ @Test
+ public void testNUTCH1098() throws Exception {
+   // check that % encoding is normalized
+   normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+
+   // check that % encoding works correctly at end of URL
+   normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html");
+   normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html");
+
+   // check that the % decoder does not overlap adjacent escape sequences
+   normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html");
+
+   // check that % decoder leaves high bit chars alone
+   normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0");
+
+   // check that % decoder leaves control chars alone
+   normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A");
+
+   // check that % decoder converts to upper case letters
+   normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0");
+
+   // check that % decoder leaves encoded spaces alone
+   normalizeTest("http://foo.com/you%20too.html",
+       "http://foo.com/you%20too.html");
+
+   // check that spaces are encoded into %20
+   normalizeTest("http://foo.com/you too.html",
+       "http://foo.com/you%20too.html");
+
+   // check that encoded # are not decoded
+   normalizeTest("http://foo.com/file.html%23cz",
+       "http://foo.com/file.html%23cz");
+
+   // check that encoded / are not decoded
+   normalizeTest("http://foo.com/fast/dir%2fcz",
+       "http://foo.com/fast/dir%2Fcz");
+
+   // check that control chars are encoded
+   normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!");
+
+   // check that control chars are always encoded into 2 digits
+   normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
+
+   // check encoding of spanish chars
+   normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx",
+       "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
+ }
+
+ /**
+  * NUTCH-2064: escaped reserved characters stay escaped, unescaped brackets
+  * and non-ASCII characters get encoded, and host names are left untouched.
+  */
+ @Test
+ public void testNUTCH2064() throws Exception {
+   // Ampersand and colon and other punctuation characters are not to be
+   // unescaped
+   normalizeTest("http://x.com/s?q=a%26b&m=10",
+       "http://x.com/s?q=a%26b&m=10");
+   normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb",
+       "http://x.com/show?http%3A%2F%2Fx.com%2Fb");
+   normalizeTest("http://google.com/search?q=c%2B%2B",
+       "http://google.com/search?q=c%2B%2B");
+   // do also not touch the query part which is
+   // application/x-www-form-urlencoded
+   normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b");
+   // and keep Internationalized domain names
+   // http://bücher.de/ may be http://xn--bcher-kva.de/
+   // but definitely not http://b%C3%BCcher.de/
+   normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/");
+   // test whether percent-encoding works together with other normalizations
+   normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html");
+   // [ and ] need escaping as well
+   normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1");
+   // boundary test for first character outside the ASCII range (U+0080)
+   normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80");
+   normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80");
+ }
@Test
public void testNormalizer() throws Exception {