http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java new file mode 100644 index 0000000..d815c45 --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.ajax; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** Unit tests for AjaxURLNormalizer. 
*/ +public class TestAjaxURLNormalizer extends TestCase { + private AjaxURLNormalizer normalizer; + private Configuration conf; + + public TestAjaxURLNormalizer(String name) { + super(name); + normalizer = new AjaxURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + } + + public void testNormalizer() throws Exception { + // check that AJAX URLs (#! fragments) are normalized to the _escaped_fragment_ form + normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v"); + + // Check with some escaped chars + normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong"); + + // Check with query string and multiple fragment params + normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2"); + } + + public void testNormalizerWhenIndexing() throws Exception { + // check the reverse mapping in indexer scope: _escaped_fragment_ form back to the #! form + normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER); + } + + /** Asserts that normalizing {@code weird} in the default scope yields {@code normal}. */ + private void normalizeTest(String weird, String normal) throws Exception { + assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + } + + /** Asserts that normalizing {@code weird} in the given {@code scope} yields {@code normal}. */ + private void normalizeTest(String weird, String normal, String scope) throws Exception { + assertEquals(normal, normalizer.normalize(weird, scope)); + } + + /** Runs testNormalizer() only; testNormalizerWhenIndexing() is not invoked from here. */ + public static void main(String[] args) throws Exception { + new TestAjaxURLNormalizer("test").testNormalizer(); + } +} 
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/build.xml b/nutch-plugins/urlnormalizer-basic/build.xml new file mode 100644 index 0000000..5a74bb0 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-basic" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/ivy.xml b/nutch-plugins/urlnormalizer-basic/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/plugin.xml b/nutch-plugins/urlnormalizer-basic/plugin.xml new file mode 100644 index 0000000..fb505aa --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-basic" + name="Basic URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-basic.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.basic" + name="Nutch Basic URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="BasicURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/pom.xml b/nutch-plugins/urlnormalizer-basic/pom.xml new file mode 100644 index 0000000..d87f112 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-basic</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-basic</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java new file mode 100644 index 0000000..3e00346 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -0,0 +1,290 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.basic; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Converts URLs to a normal form: + * <ul> + * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li> + * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li> + * <li>normalize <a href= + * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI"> + * percent-encoding</a> in URL paths</li> + * </ul> + */ +public class BasicURLNormalizer extends Configured implements URLNormalizer { + public static final Logger LOG = LoggerFactory + .getLogger(BasicURLNormalizer.class); + + /** + * Pattern to detect whether a URL path could be normalized. Contains one of + * /. or ./ /.. 
or ../ // + */ + private final static Pattern hasNormalizablePathPattern = Pattern + .compile("/[./]|[.]/"); + + /** + * Nutch 1098 - finds URL encoded parts of the URL + */ + private final static Pattern unescapeRulePattern = Pattern + .compile("%([0-9A-Fa-f]{2})"); + + // charset used for encoding URLs before escaping + private final static Charset utf8 = Charset.forName("UTF-8"); + + /** look-up table for characters which should not be escaped in URL paths */ + private final static boolean[] unescapedCharacters = new boolean[128]; + static { + for (int c = 0; c < 128; c++) { + /* https://tools.ietf.org/html/rfc3986#section-2.3 + * For consistency, percent-encoded octets in the ranges of ALPHA + * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), + * underscore (%5F), or tilde (%7E) should not be created by URI + * producers and, when found in a URI, should be decoded to their + * corresponding unreserved characters by URI normalizers. + */ + if ((0x41 <= c && c <= 0x5A) + || (0x61 <= c && c <= 0x7A) + || (0x30 <= c && c <= 0x39) + || c == 0x2D || c == 0x2E + || c == 0x5F || c == 0x7E) { + unescapedCharacters[c] = true; + } else { + unescapedCharacters[c] = false; + } + } + } + + /** + * Normalizes a URL string. An empty string is returned unchanged; any other + * input is trimmed and parsed with {@link URL}. For the protocols http, + * https and ftp the host is lowercased, a port equal to the protocol's + * default port is dropped, an empty file part is replaced by "/", the + * reference (fragment) is removed and unnecessary "/../", "/./" and "//" + * in the path are resolved. For every protocol the percent-encoding of the + * file part is normalized. The URL string is rebuilt only if one of these + * steps changed something; otherwise the trimmed input is returned. + * + * @param urlString + * the URL to normalize + * @param scope + * normalization scope; not evaluated by this implementation + * @return the normalized URL string + * @throws MalformedURLException + * if the trimmed string cannot be parsed as a URL + */ + public String normalize(String urlString, String scope) + throws MalformedURLException { + + if ("".equals(urlString)) // permit empty + return urlString; + + urlString = urlString.trim(); // remove extra spaces + + URL url = new URL(urlString); + + String protocol = url.getProtocol(); + String host = url.getHost(); + int port = url.getPort(); + String file = url.getFile(); + + boolean changed = false; + + if (!urlString.startsWith(protocol)) // protocol was lowercased + changed = true; + + if ("http".equals(protocol) || "https".equals(protocol) + || "ftp".equals(protocol)) { + + if (host != null) { + String newHost = host.toLowerCase(); // lowercase host + if (!host.equals(newHost)) { + host = newHost; + changed = true; + } + } + + if (port == url.getDefaultPort()) { 
// uses default port + port = -1; // so don't specify it + changed = true; + } + + if (file == null || "".equals(file)) { // add a slash + file = "/"; + changed = true; + } + + if (url.getRef() != null) { // remove the ref + changed = true; + } + + // check for unnecessary use of "/../", "/./", and "//" + String file2 = getFileWithNormalizedPath(url); + if (!file.equals(file2)) { + changed = true; + file = file2; + } + } + + // properly encode characters in path/file using percent-encoding + String file2 = unescapePath(file); + file2 = escapePath(file2); + if (!file.equals(file2)) { + changed = true; + file = file2; + } + + if (changed) + urlString = new URL(protocol, host, port, file).toString(); + + return urlString; + } + + /** + * Returns the file part of the URL (as given by {@link URL#getFile()}) with + * its path normalized. The path is only run through + * {@link java.net.URI#normalize()} when it matches + * {@code hasNormalizablePathPattern}; leading "/.." segments left over by + * that call are stripped afterwards. If the URL cannot be converted to a + * URI, the file part is returned unnormalized. An empty result is replaced + * by "/". + */ + private String getFileWithNormalizedPath(URL url) + throws MalformedURLException { + String file; + + if (hasNormalizablePathPattern.matcher(url.getPath()).find()) { + // only normalize the path if there is something to normalize + // to avoid needless work + try { + file = url.toURI().normalize().toURL().getFile(); + // URI.normalize() does not normalize leading dot segments, + // see also http://tools.ietf.org/html/rfc3986#section-5.2.4 + int start = 0; + while (file.startsWith("/../", start)) { + start += 3; + } + if (start > 0) { + file = file.substring(start); + } + } catch (URISyntaxException e) { + // not a valid URI: fall back to the unnormalized file part + file = url.getFile(); + } + } else { + file = url.getFile(); + } + + // if path is empty return a single slash + if (file.isEmpty()) { + file = "/"; + } + + return file; + } + + /** + * Remove % encoding from path segment in URL for characters which should be + * unescaped according to <a + * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>. 
+ */ + private String unescapePath(String path) { + StringBuilder sb = new StringBuilder(); + + Matcher matcher = unescapeRulePattern.matcher(path); + + int end = -1; + int letter; + + // Traverse over all encoded groups + while (matcher.find()) { + // Append everything up to this group + sb.append(path.substring(end + 1, matcher.start())); + + // Get the integer representation of this hexadecimal encoded character + letter = Integer.valueOf(matcher.group().substring(1), 16); + + if (letter < 128 && unescapedCharacters[letter]) { + // character should be unescaped in URLs + sb.append(new Character((char)letter)); + } else { + // Append the encoded character as uppercase + sb.append(matcher.group().toUpperCase(Locale.ROOT)); + } + + // index of the last character of this three-character %XX group + end = matcher.start() + 2; + } + + // 'letter' is reused here to hold the length of the path + letter = path.length(); + + // Append the rest if there's anything + if (end <= letter - 1) { + sb.append(path.substring(end + 1, letter)); + } + + // Ok! + return sb.toString(); + } + + /** + * Convert path segment of URL from Unicode to UTF-8 and escape all + * characters which should be escaped according to <a + * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>. + */ + private String escapePath(String path) { + StringBuilder sb = new StringBuilder(path.length()); + + // Traverse over all bytes in this URL + for (byte b: path.getBytes(utf8)) { + // Escape control characters and space (0-32), '[' (91) and ']' (93); + // Java bytes are signed, so every non-ASCII UTF-8 byte (>= 0x80) is + // negative and is escaped by the first condition as well + if (b < 33 || b == 91 || b == 93) { + // Start escape sequence + sb.append('%'); + + // Get this byte's hexadecimal representation + String hex = Integer.toHexString(b & 0xFF).toUpperCase(); + + // Do we need to prepend a zero? 
+ if (hex.length() % 2 != 0 ) { + sb.append('0'); + sb.append(hex); + } else { + // No, append this hexadecimal representation + sb.append(hex); + } + } else { + // No, just append this character as-is + sb.append((char)b); + } + } + + return sb.toString(); + } + + /** + * Command-line entry point: reads URLs from standard input, one per line, + * and prints the normalized form of each to standard output, or + * "failed: " followed by the input line if the URL is malformed. An + * optional first argument selects the normalization scope; the default is + * {@link URLNormalizers#SCOPE_DEFAULT}. + * NOTE(review): the InputStreamReader is created without an explicit + * charset, so the input is decoded with the platform default. + */ + public static void main(String args[]) throws IOException { + BasicURLNormalizer normalizer = new BasicURLNormalizer(); + normalizer.setConf(NutchConfiguration.create()); + String scope = URLNormalizers.SCOPE_DEFAULT; + if (args.length >= 1) { + scope = args[0]; + System.out.println("Scope: " + scope); + } + String line, normUrl; + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while ((line = in.readLine()) != null) { + try { + normUrl = normalizer.normalize(line, scope); + System.out.println(normUrl); + } catch (MalformedURLException e) { + System.out.println("failed: " + line); + } + } + System.exit(0); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java new file mode 100644 index 0000000..ae59a84 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer performing basic normalizations: remove default ports + * and dot segments in path. + */ +package org.apache.nutch.net.urlnormalizer.basic; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java new file mode 100644 index 0000000..9a0f8c4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.basic; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for BasicURLNormalizer. */ +public class TestBasicURLNormalizer { + private BasicURLNormalizer normalizer; + + private Configuration conf; + + public TestBasicURLNormalizer() { + normalizer = new BasicURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + } + + @Test + public void testNUTCH1098() throws Exception { + // check that % encoding is normalized + normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + + // check that % encoding works correctly at end of URL + normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html"); + + // check that % decoder do not overlap strings + normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html"); + + // check that % decoder leaves high bit chars alone + normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0"); + + // check that % decoder leaves control chars alone + normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A"); + + // check that % decoder converts to upper case letters + normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0"); + + // check that % decoder leaves encoded spaces alone + normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html"); + + // check that spaces are encoded into %20 + normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html"); + + // check that encoded # are not decoded + normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz"); + + // check that encoded / are 
not decoded + normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz"); + + // check that control chars are encoded + normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!"); + + // check that control chars are always encoded into 2 digits + normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!"); + + // check encoding of spanish chars + normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx"); + } + + @Test + public void testNUTCH2064() throws Exception { + // Ampersand and colon and other punctuation characters are not to be unescaped + normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10"); + normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb", + "http://x.com/show?http%3A%2F%2Fx.com%2Fb"); + normalizeTest("http://google.com/search?q=c%2B%2B", + "http://google.com/search?q=c%2B%2B"); + // do also not touch the query part which is application/x-www-form-urlencoded + normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b"); + // and keep Internationalized domain names + // http://bücher.de/ may be http://xn--bcher-kva.de/ + // but definitely not http://b%C3%BCcher.de/ + normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/"); + // test whether percent-encoding works together with other normalizations + normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html"); + // [ and ] need escaping as well + normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1"); + // boundary test for first character outside the ASCII range (U+0080) + normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80"); + normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80"); + } + + @Test + public void testNormalizer() throws Exception { + // check that leading and trailing spaces are removed + normalizeTest(" http://foo.com/ ", "http://foo.com/"); + + // check that protocol is lower cased + normalizeTest("HTTP://foo.com/", 
"http://foo.com/"); + + // check that host is lower cased + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + + // check that port number is normalized + normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); + normalizeTest("http://foo.com:81/", "http://foo.com:81/"); + + // check that null path is normalized + normalizeTest("http://foo.com", "http://foo.com/"); + + // check that references are removed + normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); + + // // check that encoding is normalized + // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + + // check that unnecessary "../" are removed + + normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/../", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/"); + normalizeTest("http://foo.com/aa/..", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", + "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", + "http://foo.com/aa/cc/ee/foo.html"); + normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", + "http://foo.com/"); + normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/a..a/foo.html", + "http://foo.com/a..a/foo.html"); + normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"); + 
normalizeTest("http://foo.com/foo.foo/../foo.html", + "http://foo.com/foo.html"); + normalizeTest("http://foo.com//aa/bb/foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa//bb/foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa/bb//foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com//aa//bb//foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com////aa////bb////foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa?referer=http://bar.com", + "http://foo.com/aa?referer=http://bar.com"); + } + + /** Asserts that normalizing {@code weird} in the default scope yields {@code normal}; the failure message names the input URL. */ + private void normalizeTest(String weird, String normal) throws Exception { + Assert.assertEquals("normalizing: " + weird, normal, + normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + } + + /** Runs testNormalizer() only; the other test methods of this class are not invoked from here. */ + public static void main(String[] args) throws Exception { + new TestBasicURLNormalizer().testNormalizer(); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/build.xml b/nutch-plugins/urlnormalizer-host/build.xml new file mode 100644 index 0000000..516596d --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-host" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/data/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/data/hosts.txt b/nutch-plugins/urlnormalizer-host/data/hosts.txt new file mode 100644 index 0000000..c7e0ccf --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/data/hosts.txt @@ -0,0 +1,8 @@ +# Force all sub domains to www. +*.example.com example.com + +# Force no sub domain to www. URL's +www.example.net example.net + +# Force www. sub domain when hitting link without sub domain +example.org www.example.org \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/ivy.xml b/nutch-plugins/urlnormalizer-host/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/plugin.xml b/nutch-plugins/urlnormalizer-host/plugin.xml new file mode 100644 index 0000000..f2b9615 --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-host" + name="Host URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-host.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.host" + name="Nutch Host URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="HostURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.host.HostURLNormalizer"> + <parameter name="file" value="host-urlnormalizer.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/pom.xml b/nutch-plugins/urlnormalizer-host/pom.xml new file mode 100644 index 0000000..217029e --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-host</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-host</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java new file mode 100644 index 0000000..8d5c110 --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java @@ -0,0 +1,198 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.host; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +/** + * URL normalizer for mapping hosts to their desired form. It takes a simple + * text file as source in the format: + * + * example.org www.example.org + * + * mapping all URL's of example.org the the www sub-domain. 
It also allows for + * wildcards to be used to map all sub-domains to another host: + * + * *.example.org www.example.org + */ +public class HostURLNormalizer implements URLNormalizer { + + private Configuration conf; + + private static final Logger LOG = LoggerFactory + .getLogger(HostURLNormalizer.class); + + private static String attributeFile = null; + private String hostsFile = null; + private static final HashMap<String, String> hostsMap = new HashMap<String, String>(); + + public HostURLNormalizer() { + } + + public HostURLNormalizer(String hostsFile) { + this.hostsFile = hostsFile; + } + + private synchronized void readConfiguration(Reader configReader) + throws IOException { + if (hostsMap.size() > 0) { + return; + } + + BufferedReader reader = new BufferedReader(configReader); + String line, host, target; + int delimiterIndex; + + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line.trim(); + delimiterIndex = line.indexOf(" "); + + host = line.substring(0, delimiterIndex); + target = line.substring(delimiterIndex + 1); + hostsMap.put(host, target); + } + } + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "urlnormalizer-host"; + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLNormalizer.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // handle blank non empty input + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } 
+ } else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // domain file and attribute "file" take precedence if defined + String file = conf.get("urlnormalizer.hosts.file"); + String stringRules = conf.get("urlnormalizer.hosts.rules"); + if (hostsFile != null) { + file = hostsFile; + } else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public String normalize(String urlString, String scope) + throws MalformedURLException { + String host = new URL(urlString).getHost(); + + // Test static hosts + if (hostsMap.containsKey(host)) { + return replaceHost(urlString, host, hostsMap.get(host)); + } + + // Test for wildcard in reverse order + String[] hostParts = host.split("\\."); + + // Use a buffer for our host parts + StringBuilder hostBuffer = new StringBuilder(); + + // This is our temp buffer keeping host parts with a wildcard + String wildCardHost = new String(); + + // Add the tld to the buffer + hostBuffer.append(hostParts[hostParts.length - 1]); + + for (int i = hostParts.length - 2; i > 0; i--) { + // Prepend another sub domain + hostBuffer.insert(0, hostParts[i] + "."); + + // Make a wildcarded sub domain + wildCardHost = "*." 
+ hostBuffer.toString(); + + // Check if this wildcard sub domain exists + if (hostsMap.containsKey(wildCardHost)) { + // Replace the original input host with the wildard replaced + return replaceHost(urlString, host, hostsMap.get(wildCardHost)); + } + } + + return urlString; + } + + protected String replaceHost(String urlString, String host, String target) { + int hostIndex = urlString.indexOf(host); + + StringBuilder buffer = new StringBuilder(); + + buffer.append(urlString.substring(0, hostIndex)); + buffer.append(target); + buffer.append(urlString.substring(hostIndex + host.length())); + + return buffer.toString(); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java new file mode 100644 index 0000000..62c97d7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer renaming hosts to a canonical form listed in the + * configuration file. + */ +package org.apache.nutch.net.urlnormalizer.host; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java new file mode 100644 index 0000000..c9e1a2c --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.net.urlnormalizer.host; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestHostURLNormalizer { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + @Test + public void testHostURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String hostsFile = SAMPLES + SEPARATOR + "hosts.txt"; + HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile); + normalizer.setConf(conf); + + // Force www. sub domain when hitting link without sub domain + Assert.assertEquals("http://www.example.org/page.html", + normalizer.normalize("http://example.org/page.html", + URLNormalizers.SCOPE_DEFAULT)); + + // Force no sub domain to www. URL's + Assert.assertEquals("http://example.net/path/to/something.html", normalizer + .normalize("http://www.example.net/path/to/something.html", + URLNormalizers.SCOPE_DEFAULT)); + + // Force all sub domains to www. 
+ Assert.assertEquals("http://example.com/?does=it&still=work", normalizer + .normalize("http://example.com/?does=it&still=work", + URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/buh", normalizer.normalize( + "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/blaat", normalizer.normalize( + "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/build.xml b/nutch-plugins/urlnormalizer-pass/build.xml new file mode 100644 index 0000000..b478e45 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlnormalizer-pass" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/ivy.xml b/nutch-plugins/urlnormalizer-pass/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/plugin.xml b/nutch-plugins/urlnormalizer-pass/plugin.xml new file mode 100644 index 0000000..31dcc70 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlnormalizer-pass" + name="Pass-through URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-pass.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.pass" + name="Nutch Pass-through URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="PassURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/pom.xml b/nutch-plugins/urlnormalizer-pass/pom.xml new file mode 100644 index 0000000..502d0d4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-pass</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-pass</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java new file mode 100644 index 0000000..03d510c --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java @@ -0,0 +1,49 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.pass; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizer; + +/** + * This URLNormalizer doesn't change urls. It is sometimes useful if for a given + * scope at least one normalizer must be defined but no transformations are + * required. + * + * @author Andrzej Bialecki + */ +public class PassURLNormalizer implements URLNormalizer { + + private Configuration conf; + + public String normalize(String urlString, String scope) + throws MalformedURLException { + return urlString; + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java new file mode 100644 index 0000000..eab6c2e --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer dummy which does not change URLs. Required because at least + * one URL normalizer must be defined in any scope. + */ +package org.apache.nutch.net.urlnormalizer.pass; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java new file mode 100644 index 0000000..f470c62 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.pass; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestPassURLNormalizer { + + @Test + public void testPassURLNormalizer() { + Configuration conf = NutchConfiguration.create(); + + PassURLNormalizer normalizer = new PassURLNormalizer(); + normalizer.setConf(conf); + String url = "http://www.example.com/test/..//"; + String result = null; + try { + result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT); + } catch (MalformedURLException mue) { + Assert.fail(mue.toString()); + } + + Assert.assertEquals(url, result); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/build.xml b/nutch-plugins/urlnormalizer-protocol/build.xml new file mode 100644 index 0000000..71df8e2 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-protocol" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/data/protocols.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt new file mode 100644 index 0000000..7091cd7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt @@ -0,0 +1,7 @@ +# format: host\tprotocol\n + +example.org http +example.net http + +example.io https +example.nl https http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/ivy.xml b/nutch-plugins/urlnormalizer-protocol/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/plugin.xml b/nutch-plugins/urlnormalizer-protocol/plugin.xml new file mode 100644 index 0000000..639b16a --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-protocol" + name="Protocol URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-protocol.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.protocol" + name="Nutch Protocol URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="ProtocolURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.protocol.ProtocolURLNormalizer"> + <parameter name="file" value="protocols.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/pom.xml b/nutch-plugins/urlnormalizer-protocol/pom.xml new file mode 100644 index 0000000..7c92a2c --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. 
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-protocol</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-protocol</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java new file mode 100644 index 0000000..4278325 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -0,0 
+1,190 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.protocol; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +
/**
 * URL normalizer that forces a configured protocol (scheme) onto URLs of known
 * hosts. Rules are "host protocol" pairs, one per line, separated by a space
 * or a tab; blank lines and lines starting with '#' are ignored.
 *
 * Rules are taken from, in order of precedence: the
 * "urlnormalizer.protocols.rules" property (inline rules), the file given to
 * the constructor, the "file" attribute declared for this plugin in
 * plugin.xml, and finally the "urlnormalizer.protocols.file" property.
 *
 * The rule map is static, so it is shared by every instance and is only
 * populated by the first successful load; later calls to setConf do not
 * replace rules that are already loaded.
 *
 * @author [email protected]
 */
public class ProtocolURLNormalizer implements URLNormalizer {

  private Configuration conf;

  private static final Logger LOG = LoggerFactory.getLogger(ProtocolURLNormalizer.class);

  private static final char QUESTION_MARK = '?';
  private static final String PROTOCOL_DELIMITER = "://";

  // Value of the "file" attribute declared for this plugin in plugin.xml
  private static String attributeFile = null;
  // Rules file passed to the constructor, if any
  private String protocolsFile = null;

  // Maps a host name to the protocol that URLs of that host must use,
  // e.g. "example.org" -> "http"
  private static final Map<String,String> protocolsMap = new HashMap<String,String>();

  public ProtocolURLNormalizer() {}

  /**
   * @param protocolsFile rules file to load; takes precedence over the
   *        plugin.xml attribute and the "urlnormalizer.protocols.file" property
   */
  public ProtocolURLNormalizer(String protocolsFile) {
    this.protocolsFile = protocolsFile;
  }

  /**
   * Loads host/protocol rules from the given reader into the shared map. Does
   * nothing if rules have already been loaded. The reader is not closed here;
   * that is left to the caller.
   *
   * @param configReader source of the rules, one "host protocol" pair per line
   * @throws IOException if reading from the reader fails
   */
  private void readConfiguration(Reader configReader) throws IOException {
    // Lock on the shared static map rather than on this instance: the map is
    // shared by all instances, so an instance lock would not keep two
    // instances from loading it at the same time.
    synchronized (protocolsMap) {
      if (protocolsMap.size() > 0) {
        return;
      }

      BufferedReader reader = new BufferedReader(configReader);
      String line;

      while ((line = reader.readLine()) != null) {
        if (StringUtils.isBlank(line)) {
          continue;
        }

        // Trim before looking for the comment marker or the delimiter so that
        // indented rules and indented comments are handled correctly
        line = line.trim();
        if (line.startsWith("#")) {
          continue;
        }

        int delimiterIndex = indexOfDelimiter(line);
        if (delimiterIndex == -1) {
          // A host without a protocol cannot be used; skip it instead of
          // failing on substring(0, -1)
          if (LOG.isWarnEnabled()) {
            LOG.warn("Ignoring protocol rule without host/protocol delimiter: " + line);
          }
          continue;
        }

        String host = line.substring(0, delimiterIndex);
        String protocol = line.substring(delimiterIndex + 1).trim();

        protocolsMap.put(host, protocol);
      }
    }
  }

  /**
   * Returns the index of the first space or tab in the line, or -1 if the
   * line contains neither.
   */
  private static int indexOfDelimiter(String line) {
    for (int i = 0; i < line.length(); i++) {
      char c = line.charAt(i);
      if (c == ' ' || c == '\t') {
        return i;
      }
    }
    return -1;
  }

  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration, resolves where the rules come from and loads
   * them. I/O failures while loading are logged, not thrown.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // look up the "file" attribute of this plugin's URLNormalizer extension
    String pluginName = "urlnormalizer-protocol";
    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
      URLNormalizer.class.getName()).getExtensions();
    for (int i = 0; i < extensions.length; i++) {
      Extension extension = extensions[i];
      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
        attributeFile = extension.getAttribute("file");
        break;
      }
    }

    // handle blank non empty input
    if (attributeFile != null && attributeFile.trim().equals("")) {
      attributeFile = null;
    }

    if (attributeFile != null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
          + " as " + attributeFile);
      }
    }
    else {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
          + pluginName);
      }
    }

    // constructor file and attribute "file" take precedence over the
    // configured file if defined
    String file = conf.get("urlnormalizer.protocols.file");
    String stringRules = conf.get("urlnormalizer.protocols.rules");
    if (protocolsFile != null) {
      file = protocolsFile;
    }
    else if (attributeFile != null) {
      file = attributeFile;
    }

    Reader reader = null;
    try {
      if (stringRules != null) { // takes precedence over files
        reader = new StringReader(stringRules);
      } else if (file != null) {
        reader = conf.getConfResourceAsReader(file);
        if (reader == null) {
          // not a configuration resource, try it as a plain file path
          reader = new FileReader(file);
        }
      } else {
        // nothing to load from; avoid passing null on as a resource name
        if (LOG.isWarnEnabled()) {
          LOG.warn("No protocol rules or protocols file configured for plugin "
            + pluginName);
        }
        return;
      }
      readConfiguration(reader);
    }
    catch (IOException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    finally {
      // release the underlying file/resource handle in every case
      if (reader != null) {
        try {
          reader.close();
        }
        catch (IOException e) {
          LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
      }
    }
  }

  public String normalize(String url, String scope) throws MalformedURLException {
    return normalize(url, null, scope);
  }

  /**
   * Returns the URL with its protocol replaced by the one configured for its
   * host, or the URL unchanged if the host has no rule or already uses the
   * configured protocol. The crawlDatum and scope arguments are not used.
   *
   * A rewritten URL is rebuilt from protocol, host, path and query only.
   * NOTE(review): port, user info and fragment of the original URL are not
   * carried over - confirm that this is intended.
   *
   * @throws MalformedURLException if url cannot be parsed
   */
  public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException {
    // Get URL repr.
    URL u = new URL(url);

    // Get the host
    String host = u.getHost();

    // Do we have a rule for this host? (null means no rule)
    String requiredProtocol = protocolsMap.get(host);
    if (requiredProtocol == null) {
      return url;
    }

    // Correct protocol already?
    if (u.getProtocol().equals(requiredProtocol)) {
      return url;
    }

    // Rebuild URL with new protocol
    StringBuilder buffer = new StringBuilder(requiredProtocol);
    buffer.append(PROTOCOL_DELIMITER);
    buffer.append(host);
    buffer.append(u.getPath());

    String queryString = u.getQuery();
    if (queryString != null) {
      buffer.append(QUESTION_MARK);
      buffer.append(queryString);
    }

    return buffer.toString();
  }
}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java new file mode 100644 index 0000000..8880628 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.nutch.net.urlnormalizer.protocol; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +
/**
 * Unit test for ProtocolURLNormalizer. Uses the protocols.txt sample file,
 * which maps example.org and example.net to http and example.io and
 * example.nl to https.
 */
public class TestProtocolURLNormalizer extends TestCase {

  private final static String SEPARATOR = System.getProperty("file.separator");
  // Directory holding the sample data; defaults to the working directory
  private final static String SAMPLES = System.getProperty("test.data", ".");

  public void testProtocolURLNormalizer() throws Exception {
    Configuration conf = NutchConfiguration.create();

    String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
    ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile);
    normalizer.setConf(conf);

    // No change: hosts that require http and already use it
    assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
    assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));

    // https to http
    assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
    assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));

    // No change: hosts that require https and already use it
    assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
    assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));

    // http to https
    assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
    assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/build.xml ---------------------------------------------------------------------- diff --git
a/nutch-plugins/urlnormalizer-querystring/build.xml b/nutch-plugins/urlnormalizer-querystring/build.xml new file mode 100644 index 0000000..2d692c4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-querystring" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/ivy.xml b/nutch-plugins/urlnormalizer-querystring/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/plugin.xml b/nutch-plugins/urlnormalizer-querystring/plugin.xml new file mode 100644 index 0000000..2a677fc --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-querystring" + name="Querystrings URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-querystring.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.querystring" + name="Nutch Querystring URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="QuerystringURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.querystring.QuerystringURLNormalizer"> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/pom.xml b/nutch-plugins/urlnormalizer-querystring/pom.xml new file mode 100644 index 0000000..514f2f0 --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-querystring</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-querystring</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project>
