Author: mattmann
Date: Mon Apr 27 01:40:34 2015
New Revision: 1676186
URL: http://svn.apache.org/r1676186
Log:
Fix for NUTCH-1969 URL Normalizer properly handling slashes contributed by
Markus Jelsma.
Added:
nutch/trunk/src/plugin/urlnormalizer-slash/
nutch/trunk/src/plugin/urlnormalizer-slash/build.xml
nutch/trunk/src/plugin/urlnormalizer-slash/data/
nutch/trunk/src/plugin/urlnormalizer-slash/data/slashes.txt
nutch/trunk/src/plugin/urlnormalizer-slash/ivy.xml
nutch/trunk/src/plugin/urlnormalizer-slash/plugin.xml
nutch/trunk/src/plugin/urlnormalizer-slash/src/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1676186&r1=1676185&r2=1676186&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Apr 27 01:40:34 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1969 URL Normalizer properly handling slashes (markus via mattmann)
+
* NUTCH-2001 Sub Collection Field Name incorrect in nutch-default.xml
(Jeff Cocking via mattmann)
Modified: nutch/trunk/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1676186&r1=1676185&r2=1676186&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Apr 27 01:40:34 2015
@@ -217,7 +217,8 @@
<packageset dir="${plugins.dir}/urlnormalizer-pass/src/java"/>
<packageset dir="${plugins.dir}/urlnormalizer-querystring/src/java"/>
<packageset dir="${plugins.dir}/urlnormalizer-regex/src/java"/>
-
+ <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
+
<link href="${javadoc.link.java}"/>
<link href="${javadoc.link.lucene}"/>
<link href="${javadoc.link.hadoop}"/>
@@ -626,7 +627,8 @@
<packageset dir="${plugins.dir}/urlnormalizer-pass/src/java"/>
<packageset dir="${plugins.dir}/urlnormalizer-querystring/src/java"/>
<packageset dir="${plugins.dir}/urlnormalizer-regex/src/java"/>
-
+ <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
+
<link href="${javadoc.link.java}"/>
<link href="${javadoc.link.lucene}"/>
<link href="${javadoc.link.hadoop}"/>
@@ -1042,10 +1044,12 @@
<source path="${plugins.dir}/urlnormalizer-host/src/test/" />
<source path="${plugins.dir}/urlnormalizer-pass/src/java/" />
<source path="${plugins.dir}/urlnormalizer-pass/src/test/" />
- <source path="${plugins.dir}/urlnormalizer-regex/src/java/" />
- <source path="${plugins.dir}/urlnormalizer-regex/src/test/" />
<source path="${plugins.dir}/urlnormalizer-querystring/src/java/" />
<source path="${plugins.dir}/urlnormalizer-querystring/src/test/" />
+ <source path="${plugins.dir}/urlnormalizer-regex/src/java/" />
+ <source path="${plugins.dir}/urlnormalizer-regex/src/test/" />
+ <source path="${plugins.dir}/urlnormalizer-slash/src/java/" />
+ <source path="${plugins.dir}/urlnormalizer-slash/src/test/" />
<output path="${build.classes}" />
</classpath>
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1676186&r1=1676185&r2=1676186&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Mon Apr 27 01:40:34 2015
@@ -78,6 +78,7 @@
<ant dir="urlnormalizer-pass" target="deploy"/>
<ant dir="urlnormalizer-querystring" target="deploy"/>
<ant dir="urlnormalizer-regex" target="deploy"/>
+ <ant dir="urlnormalizer-slash" target="deploy"/>
</target>
<!-- ====================================================== -->
@@ -118,6 +119,7 @@
<ant dir="urlnormalizer-pass" target="test"/>
<ant dir="urlnormalizer-querystring" target="test"/>
<ant dir="urlnormalizer-regex" target="test"/>
+ <ant dir="urlnormalizer-slash" target="test"/>
</parallel>
</target>
@@ -179,5 +181,6 @@
<ant dir="urlnormalizer-pass" target="clean"/>
<ant dir="urlnormalizer-querystring" target="clean"/>
<ant dir="urlnormalizer-regex" target="clean"/>
+ <ant dir="urlnormalizer-slash" target="clean"/>
</target>
</project>
Added: nutch/trunk/src/plugin/urlnormalizer-slash/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/build.xml?rev=1676186&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-slash/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-slash/build.xml Mon Apr 27 01:40:34
2015
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-slash" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+</project>
Added: nutch/trunk/src/plugin/urlnormalizer-slash/data/slashes.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/data/slashes.txt?rev=1676186&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-slash/data/slashes.txt (added)
+++ nutch/trunk/src/plugin/urlnormalizer-slash/data/slashes.txt Mon Apr 27
01:40:34 2015
@@ -0,0 +1,7 @@
+# Both domains have duplicate URLs, some with a trailing slash and some without
+
+# We prefer this domain with slashes
+www.example.org +
+
+# ..but this domain without
+www.example.net -
\ No newline at end of file
Added: nutch/trunk/src/plugin/urlnormalizer-slash/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/ivy.xml?rev=1676186&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-slash/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-slash/ivy.xml Mon Apr 27 01:40:34 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/urlnormalizer-slash/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/plugin.xml?rev=1676186&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-slash/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-slash/plugin.xml Mon Apr 27 01:40:34
2015
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlnormalizer-slash"
+ name="Slash URL Normalizer"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlnormalizer-slash.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlnormalizer.slash"
+ name="Nutch Slash URL Normalizer"
+ point="org.apache.nutch.net.URLNormalizer">
+ <implementation id="SlashURLNormalizer"
+ class="org.apache.nutch.net.urlnormalizer.slash.SlashURLNormalizer">
+ <parameter name="file" value="slashes.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
Added:
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java?rev=1676186&view=auto
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
(added)
+++
nutch/trunk/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
Mon Apr 27 01:40:34 2015
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.slash;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer that adds or removes the trailing slash of a URL path on a
+ * per-host basis.
+ *
+ * <p>Rules are read from a whitespace separated text source with one rule per
+ * line: <code>&lt;host&gt; &lt;+|-&gt;</code>. A <code>+</code> means paths of
+ * that host get a trailing slash, any other rule means the trailing slash is
+ * removed. Blank lines and lines starting with <code>#</code> are ignored.</p>
+ *
+ * <p>The rule table is static: it is filled by the first instance that loads a
+ * non-empty configuration and is then shared by every instance in the JVM.
+ * Loading is serialized on the table; {@link #normalize} reads it without
+ * locking, so {@link #setConf} has to complete before URLs are normalized.</p>
+ *
+ * @author [email protected]
+ */
+public class SlashURLNormalizer implements URLNormalizer {
+
+  private static final Logger LOG = LoggerFactory.getLogger(SlashURLNormalizer.class);
+
+  private static final char QUESTION_MARK = '?';
+  private static final char HASH = '#';
+  private static final char SLASH = '/';
+  private static final char DOT = '.';
+  private static final String PROTOCOL_DELIMITER = "://";
+
+  private static final String PLUGIN_ID = "urlnormalizer-slash";
+
+  // Longest suffix, dot included, still regarded as a file extension (".shtml")
+  private static final int MAX_EXTENSION_LENGTH = 6;
+
+  // We record a map of hosts and boolean, the boolean denotes whether the host
+  // should have slashes after URL paths. True means slash, false means remove
+  // the slash
+  private static final Map<String,Boolean> slashesMap = new HashMap<String,Boolean>();
+
+  private Configuration conf;
+
+  // Rules file handed to the constructor, takes precedence over other files
+  private String slashesFile = null;
+
+  public SlashURLNormalizer() {}
+
+  /**
+   * @param slashesFile rules file to load; overrides the file configured in
+   *          plugin.xml and in the <code>urlnormalizer.slashes.file</code> property
+   */
+  public SlashURLNormalizer(String slashesFile) {
+    this.slashesFile = slashesFile;
+  }
+
+  /**
+   * Fills the shared rule table from the given reader, unless the table has
+   * already been populated. The reader is not closed here.
+   *
+   * @param configReader source of the rule lines
+   * @throws IOException if reading a line fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    // The table is static, so lock on the table itself and not on this instance
+    synchronized (slashesMap) {
+      if (!slashesMap.isEmpty()) {
+        return;
+      }
+
+      BufferedReader reader = new BufferedReader(configReader);
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (StringUtils.isBlank(line)) {
+          continue;
+        }
+        // Strings are immutable, the trimmed value has to be assigned
+        line = line.trim();
+        if (line.startsWith("#")) {
+          continue;
+        }
+
+        // Host and rule are separated by any run of spaces and/or tabs
+        String[] parts = line.split("\\s+", 2);
+        if (parts.length < 2) {
+          LOG.warn("Ignoring slash rule without host/rule delimiter: " + line);
+          continue;
+        }
+
+        slashesMap.put(parts[0], Boolean.valueOf("+".equals(parts[1].trim())));
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and loads the slash rules. Inline rules from
+   * <code>urlnormalizer.slashes.rules</code> win over any file. Among files the
+   * order is: constructor argument, plugin.xml attribute "file", then
+   * <code>urlnormalizer.slashes.file</code>.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // look up the "file" attribute of this plugin's extension in plugin.xml
+    String attributeFile = null;
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(PLUGIN_ID)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + PLUGIN_ID
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + PLUGIN_ID);
+      }
+    }
+
+    // constructor file and attribute "file" take precedence if defined
+    String file = conf.get("urlnormalizer.slashes.file");
+    String stringRules = conf.get("urlnormalizer.slashes.rules");
+    if (slashesFile != null) {
+      file = slashesFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else if (file != null) {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          // not a configuration resource, try it as a plain file path
+          reader = new FileReader(file);
+        }
+      } else {
+        LOG.warn("No slash rules and no rules file configured for plugin " + PLUGIN_ID);
+        return;
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    } finally {
+      // always release the underlying file/resource handle
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.warn("Could not close slash rules reader: " + e.getMessage());
+        }
+      }
+    }
+  }
+
+  public String normalize(String url, String scope) throws MalformedURLException {
+    return normalize(url, null, scope);
+  }
+
+  /**
+   * Applies the slash rule of the URL's host, if there is one.
+   *
+   * @param url URL to normalize
+   * @param crawlDatum not used
+   * @param scope not used
+   * @return the URL with a trailing slash added to or removed from its path,
+   *         or the unchanged input when no rule applies
+   * @throws MalformedURLException if <code>url</code> cannot be parsed
+   */
+  public String normalize(String url, CrawlDatum crawlDatum, String scope)
+      throws MalformedURLException {
+    URL u = new URL(url);
+
+    // Do we have a rule for this host?
+    Boolean rule = slashesMap.get(u.getHost());
+    if (rule == null) {
+      return url;
+    }
+
+    // Don't do anything to root URL's
+    // / is always set by basic normalizer
+    String path = u.getPath();
+    if (path.length() <= 1) {
+      return url;
+    }
+
+    boolean wantSlash = rule.booleanValue();
+    boolean trailingSlash = path.charAt(path.length() - 1) == SLASH;
+
+    if (wantSlash && !trailingSlash) {
+      // Only add a trailing slash if this path doesn't appear to have an
+      // extension/suffix such as .html
+      return hasExtension(path) ? url : rebuild(u, path + SLASH);
+    }
+
+    if (!wantSlash && trailingSlash) {
+      return rebuild(u, path.substring(0, path.length() - 1));
+    }
+
+    return url;
+  }
+
+  /**
+   * Tells whether the last path segment ends in a short dot suffix. A dot in
+   * an earlier segment (<code>/v1.2/page</code>) does not count.
+   */
+  private static boolean hasExtension(String path) {
+    int lastIndexOfDot = path.lastIndexOf(DOT);
+    return lastIndexOfDot > path.lastIndexOf(SLASH)
+        && path.length() - lastIndexOfDot <= MAX_EXTENSION_LENGTH;
+  }
+
+  /**
+   * Reassembles the URL around a new path, keeping user-info, port, query
+   * string and fragment of the original.
+   */
+  private static String rebuild(URL u, String newPath) {
+    // The authority carries user-info and port, the bare host does not
+    String authority = u.getAuthority();
+    if (authority == null) {
+      authority = u.getHost();
+    }
+
+    StringBuilder buffer = new StringBuilder(u.getProtocol());
+    buffer.append(PROTOCOL_DELIMITER);
+    buffer.append(authority);
+    buffer.append(newPath);
+    if (u.getQuery() != null) {
+      buffer.append(QUESTION_MARK);
+      buffer.append(u.getQuery());
+    }
+    if (u.getRef() != null) {
+      buffer.append(HASH);
+      buffer.append(u.getRef());
+    }
+    return buffer.toString();
+  }
+}
\ No newline at end of file
Added:
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java?rev=1676186&view=auto
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
(added)
+++
nutch/trunk/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
Mon Apr 27 01:40:34 2015
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.slash;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestSlashURLNormalizer extends TestCase {
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ public void testSlashURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ String slashesFile = SAMPLES + SEPARATOR + "slashes.txt";
+ SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile);
+ normalizer.setConf(conf);
+
+ // No change
+ assertEquals("http://example.org/",
normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/",
normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+ // Don't touch base URL's
+ assertEquals("http://example.org",
normalizer.normalize("http://example.org", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net",
normalizer.normalize("http://example.net", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.org/",
normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/",
normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+ // Change
+ assertEquals("http://www.example.org/page/",
normalizer.normalize("http://www.example.org/page",
URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://www.example.net/path/to/something",
normalizer.normalize("http://www.example.net/path/to/something/",
URLNormalizers.SCOPE_DEFAULT));
+
+ // No change
+ assertEquals("http://example.org/buh/",
normalizer.normalize("http://example.org/buh/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/blaat",
normalizer.normalize("http://example.net/blaat", URLNormalizers.SCOPE_DEFAULT));
+
+ // No change
+ assertEquals("http://example.nl/buh/",
normalizer.normalize("http://example.nl/buh/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.de/blaat",
normalizer.normalize("http://example.de/blaat", URLNormalizers.SCOPE_DEFAULT));
+
+ // Change
+ assertEquals("http://www.example.org/page/?a=b&c=d",
normalizer.normalize("http://www.example.org/page?a=b&c=d",
URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://www.example.net/path/to/something?a=b&c=d",
normalizer.normalize("http://www.example.net/path/to/something/?a=b&c=d",
URLNormalizers.SCOPE_DEFAULT));
+
+ // No change
+ assertEquals("http://www.example.org/noise.mp3",
normalizer.normalize("http://www.example.org/noise.mp3",
URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://www.example.org/page.html",
normalizer.normalize("http://www.example.org/page.html",
URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://www.example.org/page.shtml",
normalizer.normalize("http://www.example.org/page.shtml",
URLNormalizers.SCOPE_DEFAULT));
+
+ // Change
+ assertEquals("http://www.example.org/this.is.not.an_extension/",
normalizer.normalize("http://www.example.org/this.is.not.an_extension",
URLNormalizers.SCOPE_DEFAULT));
+ }
+}