http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
new file mode 100644
index 0000000..1de03a3
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer plugin that normalizes query strings by sorting the query
+ * string parameters. Unsorted query strings can lead to large amounts of
+ * duplicate URLs such as ?a=x&b=y vs ?b=y&a=x.
+ */
+public class QuerystringURLNormalizer implements URLNormalizer {
+
+  private Configuration conf;
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(QuerystringURLNormalizer.class);
+
+  public QuerystringURLNormalizer() {
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Rebuilds the URL with its query string parameters sorted
+   * lexicographically. URLs without a query string are returned unchanged.
+   *
+   * @param urlString the URL to normalize
+   * @param scope the normalization scope (not used by this normalizer)
+   * @return the URL with sorted query string parameters
+   * @throws MalformedURLException if urlString cannot be parsed as a URL
+   */
+  @Override
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    URL url = new URL(urlString);
+    String queryString = url.getQuery();
+    if (queryString == null) {
+      return urlString;
+    }
+
+    List<String> queryStringParts = Arrays.asList(queryString.split("&"));
+    Collections.sort(queryStringParts);
+
+    StringBuilder sb = new StringBuilder();
+    sb.append(url.getProtocol()).append("://");
+    // Keep credentials: url.getHost() does not include the user info.
+    if (url.getUserInfo() != null) {
+      sb.append(url.getUserInfo()).append('@');
+    }
+    sb.append(url.getHost());
+    if (url.getPort() > -1) {
+      sb.append(':').append(url.getPort());
+    }
+    sb.append(url.getPath()).append('?');
+    sb.append(StringUtils.join(queryStringParts, "&"));
+    if (url.getRef() != null) {
+      sb.append('#').append(url.getRef());
+    }
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
 
b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
new file mode 100644
index 0000000..005fbca
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer which sorts the elements of the query part to avoid
+ * duplicates caused by permutations.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
new file mode 100644
index 0000000..b85c55d
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for {@link QuerystringURLNormalizer}.
+ */
+public class TestQuerystringURLNormalizer extends TestCase {
+
+  /**
+   * Checks that query string parameters come back sorted, and that URLs
+   * without a query string (with or without port and fragment) are unchanged.
+   */
+  public void testQuerystringURLNormalizer() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
+    normalizer.setConf(conf);
+
+    assertEquals("http://example.com/?a=b&c=d", normalizer.normalize(
+        "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/a/b/c", normalizer.normalize(
+        "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c", normalizer.normalize(
+        "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize(
+        "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref",
+        normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize(
+        "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/build.xml 
b/nutch-plugins/urlnormalizer-regex/build.xml
new file mode 100644
index 0000000..76875ec
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/build.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.xml, **/*.test"/>
+  </copy>
+
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/ivy.xml 
b/nutch-plugins/urlnormalizer-regex/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/plugin.xml 
b/nutch-plugins/urlnormalizer-regex/plugin.xml
new file mode 100644
index 0000000..e75096f
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-regex"
+   name="Regex URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.regex"
+              name="Nutch Regex URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="RegexURLNormalizer"
+                      
class="org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/pom.xml 
b/nutch-plugins/urlnormalizer-regex/pom.xml
new file mode 100644
index 0000000..bfb056e
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
new file mode 100644
index 0000000..7867ad8
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test
@@ -0,0 +1,84 @@
+# test simple removal of session id, keeping parameters before and after
+http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://foo.com/foo.php
+http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://foo.com/foo.php?f=2
+http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 
http://foo.com/foo.php?f=2&q=3
+http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 
http://foo.com/foo.php?f=2
+
+# test removal of different session ids including removal of ; in jsessionid
+http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl 
http://www.foo.com/foo.php
+http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y 
http://www.foo.com/foo.php?x=y
+http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED 
http://www.foo.com/foo.html
+http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED
 http://www.foo.com/foo.html?param=1&another=2
+http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2
 http://www.foo.com/foo.html?param=1&another=2
+http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 
http://www.foo.com/foo.php?x=1&something=1
+http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 
http://www.foo.com/foo.html
+http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo
 http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo
+http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43
 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en
+http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47
 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47
+# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
+http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0
+
+# test removal default pages
+http://www.foo.com/home/index.html http://www.foo.com/home/
+http://www.foo.com/index.html http://www.foo.com/
+http://www.foo.com/index.htm http://www.foo.com/
+http://www.foo.com/index.asp http://www.foo.com/
+http://www.foo.com/index.aspx http://www.foo.com/
+http://www.foo.com/index.php http://www.foo.com/
+http://www.foo.com/index.php3 http://www.foo.com/
+http://www.foo.com/default.html http://www.foo.com/
+http://www.foo.com/default.htm http://www.foo.com/
+http://www.foo.com/default.asp http://www.foo.com/
+http://www.foo.com/default.aspx http://www.foo.com/
+http://www.foo.com/default.php http://www.foo.com/
+http://www.foo.com/default.php3 http://www.foo.com/
+http://www.foo.com/something.php3 http://www.foo.com/something.php3
+http://www.foo.com/something.html http://www.foo.com/something.html
+http://www.foo.com/something.asp http://www.foo.com/something.asp
+http://www.foo.com/index.phtml http://www.foo.com/
+http://www.foo.com/index.cfm http://www.foo.com/
+http://www.foo.com/index.cgi http://www.foo.com/
+http://www.foo.com/index.HTML http://www.foo.com/
+http://www.foo.com/index.Htm http://www.foo.com/
+http://www.foo.com/index.ASP http://www.foo.com/
+http://www.foo.com/index.jsp http://www.foo.com/
+http://www.foo.com/index.jsf http://www.foo.com/
+http://www.foo.com/index.jspx http://www.foo.com/
+http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx
+http://www.foo.com/index.jspa http://www.foo.com/
+http://www.foo.com/index.jsps http://www.foo.com/index.jsps
+http://www.foo.com/index.aspX http://www.foo.com/
+http://www.foo.com/index.PhP http://www.foo.com/
+http://www.foo.com/index.PhP4 http://www.foo.com/
+http://www.foo.com/default.HTml http://www.foo.com/
+http://www.foo.com/default.HTm http://www.foo.com/
+http://www.foo.com/default.ASp http://www.foo.com/
+http://www.foo.com/default.AspX http://www.foo.com/
+http://www.foo.com/default.PHP http://www.foo.com/
+http://www.foo.com/default.PHP3 http://www.foo.com/
+http://www.foo.com/index.phtml http://www.foo.com/
+http://www.foo.com/index.cfm http://www.foo.com/
+http://www.foo.com/index.cgi http://www.foo.com/
+
+# ensure keeping non-default pages
+http://www.foo.com/foo.php3 http://www.foo.com/foo.php3
+http://www.foo.com/foo.html http://www.foo.com/foo.html
+http://www.foo.com/foo.asp http://www.foo.com/foo.asp
+
+# test removal of interpage anchors and keeping query string
+http://www.foo.com/foo.html#something http://www.foo.com/foo.html
+http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y
+
+# test general cleaning of bad urls
+http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y
+http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
+http://www.foo.com/foo.html? http://www.foo.com/foo.html
+
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
new file mode 100644
index 0000000..4d6eabc
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     written as &amp; -->
+
+<!-- The following rules show how to strip out session IDs, default pages, 
+     interpage anchors, etc. Order does matter!  -->
+<regex-normalize>
+
+<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
+<regex>
+  
<pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+  <substitution>$4</substitution>
+</regex>
+
+<!-- changes default pages into standard for /index.html, etc. into / -->
+<!-- these are commented in the default file but uncommented here for testing 
-->
+<regex>
+  
<pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern>
+  <substitution>/$3</substitution>
+</regex> 
+
+<!-- removes interpage href anchors such as site.com#location -->
+<regex>
+  <pattern>#.*?(\?|&amp;|$)</pattern>
+  <substitution>$1</substitution>
+</regex>
+
+<!-- cleans ?&var=value into ?var=value -->
+<regex>
+  <pattern>\?&amp;</pattern>
+  <substitution>\?</substitution>
+</regex>
+
+<!-- cleans multiple sequential ampersands into a single ampersand -->
+<regex>
+  <pattern>&amp;{2,}</pattern>
+  <substitution>&amp;</substitution>
+</regex>
+
+<!-- removes trailing ?, ampersands, . -->
+<regex>
+  <pattern>[\?&amp;\.]$</pattern>
+  <substitution></substitution>
+</regex>
+
+<!-- normalize file:/// protocol prefix: -->
+<!--  keep one single slash (NUTCH-1483) -->
+<regex>
+  <pattern>^file://+</pattern>
+  <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+  <pattern>(?&lt;!:)/{2,}</pattern>
+  <substitution>/</substitution>
+</regex>
+
+</regex-normalize>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
new file mode 100644
index 0000000..9d92880
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test
@@ -0,0 +1,8 @@
+# test removal of subdomains
+http://www.foo.bar.com/ http://bar.com/
+
+# test removal of url path
+http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 
http://bar.com/
+
+# test removal of urls in arguments
+https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php 
https://bar.com/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml 
b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
new file mode 100644
index 0000000..3698968
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     written as &amp; -->
+
+<!--
+     The following rules show how to reduce urls so that
+     urls from the same domain are identical. This is useful
+     e.g. when calculating host counts, or splitting fetchlists.
+-->
+<regex-normalize>
+<regex>
+  <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern>
+  <substitution>$1$3/</substitution>
+</regex>
+</regex-normalize>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
new file mode 100644
index 0000000..363da18
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
@@ -0,0 +1,324 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.regex;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+import org.xml.sax.InputSource;
+
+/**
+ * Allows users to do regex substitutions on all/any URLs that are encountered,
+ * which is useful for stripping session IDs from URLs.
+ * 
+ * <p>
+ * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be
+ * set to the file name of an xml file which should contain the patterns and
+ * substitutions to be done on encountered URLs.
+ * </p>
+ * <p>
+ * This class also supports different rules depending on the scope. Please see
+ * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.
+ * </p>
+ * 
+ * @author Luke Baker
+ * @author Andrzej Bialecki
+ */
+public class RegexURLNormalizer extends Configured implements URLNormalizer {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLNormalizer.class);
+
+  /**
+   * Class which holds a compiled pattern and its corresponding substitution
+   * string.
+   */
+  private static class Rule {
+    // Compiled regular expression that is matched against the URL string.
+    public Pattern pattern;
+
+    // Replacement text passed to Matcher.replaceAll for this pattern.
+    public String substitution;
+  }
+
+  // Per-thread map from scope name to the rules loaded for that scope; each
+  // thread starts out with an empty map (see initialValue below).
+  private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = 
new ThreadLocal<HashMap<String, List<Rule>>>() {
+    protected java.util.HashMap<String, java.util.List<Rule>> initialValue() {
+      return new HashMap<String, List<Rule>>();
+    };
+  };
+
+  /**
+   * Returns the scope-to-rules map belonging to the calling thread. The map
+   * itself is returned, not a copy, so changes are visible to later calls made
+   * from the same thread.
+   */
+  public HashMap<String, List<Rule>> getScopedRules() {
+    return scopedRulesThreadLocal.get();
+  }
+
+  // Rules applied when a scope has no rules of its own. Assigned by setConf
+  // and by the (Configuration, String) constructor; null until then.
+  private List<Rule> defaultRules;
+
+  // Shared sentinel meaning "no rules"; the code compares against it by
+  // identity (==), not by equals().
+  private static final List<Rule> EMPTY_RULES = Collections.emptyList();
+
+  /**
+   * The default constructor which is called from UrlNormalizerFactory
+   * (normalizerClass.newInstance()) in method: getNormalizer(). It hands a
+   * null configuration to the superclass; the default rules are only loaded
+   * once {@link #setConf(Configuration)} is called with a non-null
+   * configuration.
+   */
+  public RegexURLNormalizer() {
+    super(null);
+  }
+
+  /**
+   * Creates a normalizer for the given configuration, which is handed to the
+   * {@link Configured} superclass constructor.
+   * NOTE(review): the superclass constructor presumably calls
+   * setConf(conf), which would load the default rules here - confirm.
+   */
+  public RegexURLNormalizer(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Constructor which can be passed the file name, so it doesn't look in the
+   * configuration files for it.
+   *
+   * @param conf
+   *          configuration handed to the superclass
+   * @param filename
+   *          path of the XML rules file; it is read through a FileReader
+   */
+  public RegexURLNormalizer(Configuration conf, String filename)
+      throws IOException, PatternSyntaxException {
+    super(conf);
+    List<Rule> rules = readConfigurationFile(filename);
+    // readConfigurationFile reports failures by returning EMPTY_RULES rather
+    // than null, so this check is purely defensive.
+    if (rules != null) {
+      defaultRules = rules;
+    }
+  }
+
+  /**
+   * Stores the configuration and loads the default rules from it.
+   *
+   * <p>
+   * Rules given inline through <tt>urlnormalizer.regex.rules</tt> take
+   * precedence; otherwise the resource named by
+   * <tt>urlnormalizer.regex.file</tt> is read. If neither can be read the
+   * default rule list is empty. A null configuration (as passed by the default
+   * constructor) leaves the default rules untouched.
+   * </p>
+   *
+   * @param conf
+   *          the configuration to use, may be null
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) {
+      // the default constructor was called; nothing to load yet
+      return;
+    }
+
+    String filename = getConf().get("urlnormalizer.regex.file");
+    String stringRules = getConf().get("urlnormalizer.regex.rules");
+    Reader reader = null;
+    if (stringRules != null) {
+      reader = new StringReader(stringRules);
+    } else {
+      reader = getConf().getConfResourceAsReader(filename);
+    }
+    List<Rule> rules = null;
+    if (reader == null) {
+      LOG.warn("Can't load the default rules! ");
+      rules = EMPTY_RULES;
+    } else {
+      try {
+        rules = readConfiguration(reader);
+      } catch (Exception e) {
+        LOG.warn("Couldn't read default config: " + e);
+        rules = EMPTY_RULES;
+      } finally {
+        // the reader was opened here, so it is released here as well
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.debug("Couldn't close the default rules reader: " + e);
+        }
+      }
+    }
+    defaultRules = rules;
+  }
+
+  // used in JUnit test.
+  void setConfiguration(Reader reader, String scope) {
+    List<Rule> rules = readConfiguration(reader);
+    getScopedRules().put(scope, rules);
+    LOG.debug("Set config for scope '" + scope + "': " + rules.size()
+        + " rules.");
+  }
+
+  /**
+   * This function does the replacements by iterating through all the regex
+   * patterns. It accepts a string url as input and returns the altered string.
+   *
+   * <p>
+   * The rules of a scope are looked up in the calling thread's rule map. On
+   * the first request for a scope they are loaded from the resource named by
+   * <tt>urlnormalizer.regex.file.&lt;scope&gt;</tt>. A scope without rules is
+   * remembered as such and the default rules are applied instead.
+   * </p>
+   *
+   * @param urlString
+   *          the URL to normalize
+   * @param scope
+   *          name of the scope whose rules should be applied
+   * @return the URL after every rule has been applied in order; the input is
+   *         returned unchanged if no rules are available at all
+   */
+  public String regexNormalize(String urlString, String scope) {
+    HashMap<String, List<Rule>> scopedRules = getScopedRules();
+    List<Rule> curRules = scopedRules.get(scope);
+    if (curRules == null) {
+      // try to populate
+      String configFile = getConf().get("urlnormalizer.regex.file." + scope);
+      if (configFile != null) {
+        LOG.debug("resource for scope '" + scope + "': " + configFile);
+        Reader reader = null;
+        try {
+          reader = getConf().getConfResourceAsReader(configFile);
+          // a null reader means the resource could not be opened; skip the
+          // parser instead of letting it fail on a null input
+          if (reader != null) {
+            curRules = readConfiguration(reader);
+            scopedRules.put(scope, curRules);
+          }
+        } catch (Exception e) {
+          LOG.warn("Couldn't load resource '" + configFile + "': " + e);
+        } finally {
+          if (reader != null) {
+            try {
+              reader.close();
+            } catch (IOException e) {
+              LOG.debug("Couldn't close resource '" + configFile + "': " + e);
+            }
+          }
+        }
+      }
+      if (curRules == EMPTY_RULES || curRules == null) {
+        LOG.info("can't find rules for scope '" + scope + "', using default");
+        // remember the miss so the lookup is not repeated for every URL
+        scopedRules.put(scope, EMPTY_RULES);
+      }
+    }
+    if (curRules == EMPTY_RULES || curRules == null) {
+      curRules = defaultRules;
+    }
+    if (curRules == null) {
+      // default rules were never loaded (no configuration set): nothing to do
+      return urlString;
+    }
+    for (Rule r : curRules) {
+      Matcher matcher = r.pattern.matcher(urlString);
+      urlString = matcher.replaceAll(r.substitution);
+    }
+    return urlString;
+  }
+
+  /**
+   * Normalizes the URL by delegating to
+   * {@link #regexNormalize(String, String)}.
+   *
+   * @param urlString
+   *          the URL to normalize
+   * @param scope
+   *          name of the scope whose rules should be applied
+   * @return the result of regexNormalize for the same arguments
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    return regexNormalize(urlString, scope);
+  }
+
+  /**
+   * Reads the configuration file and populates a List of Rules.
+   *
+   * @param filename
+   *          path of the XML rules file
+   * @return the rules read from the file, or EMPTY_RULES if the file can't be
+   *         opened or yields no rules; never null
+   */
+  private List<Rule> readConfigurationFile(String filename) {
+    if (LOG.isInfoEnabled()) {
+      LOG.info("loading " + filename);
+    }
+    FileReader reader = null;
+    try {
+      reader = new FileReader(filename);
+      return readConfiguration(reader);
+    } catch (Exception e) {
+      LOG.error("Error loading rules from '" + filename + "': " + e);
+      return EMPTY_RULES;
+    } finally {
+      // release the file handle whether or not parsing succeeded
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.debug("Couldn't close '" + filename + "': " + e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Parses an XML rules document into a list of rules.
+   *
+   * <p>
+   * Every element child of the root is scanned for a <tt>pattern</tt> and a
+   * <tt>substitution</tt> child. A rule is created when both are present; an
+   * empty <tt>substitution</tt> element means "replace with the empty string".
+   * Rules whose pattern doesn't compile are logged and skipped.
+   * </p>
+   *
+   * @param reader
+   *          source of the XML document; it is not closed by this method
+   * @return the parsed rules in document order, or EMPTY_RULES if parsing
+   *         fails or no rule was found; never null
+   */
+  private List<Rule> readConfiguration(Reader reader) {
+    List<Rule> rules = new ArrayList<Rule>();
+    try {
+
+      // borrowed heavily from code in Configuration.java
+      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+          .parse(new InputSource(reader));
+      Element root = doc.getDocumentElement();
+      if ((!"regex-normalize".equals(root.getTagName()))
+          && (LOG.isErrorEnabled())) {
+        LOG.error("bad conf file: top-level element not <regex-normalize>");
+      }
+      NodeList regexes = root.getChildNodes();
+      for (int i = 0; i < regexes.getLength(); i++) {
+        Node regexNode = regexes.item(i);
+        if (!(regexNode instanceof Element))
+          continue;
+        Element regex = (Element) regexNode;
+        if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) {
+          LOG.warn("bad conf file: element not <regex>");
+        }
+        NodeList fields = regex.getChildNodes();
+        String patternValue = null;
+        String subValue = null;
+        for (int j = 0; j < fields.getLength(); j++) {
+          Node fieldNode = fields.item(j);
+          if (!(fieldNode instanceof Element))
+            continue;
+          Element field = (Element) fieldNode;
+          String tag = field.getTagName();
+          if ("pattern".equals(tag)) {
+            if (field.hasChildNodes())
+              patternValue = ((Text) field.getFirstChild()).getData();
+          } else if ("substitution".equals(tag)) {
+            // Only an empty <substitution> stands for the empty string; other
+            // empty elements must not clobber a substitution already read.
+            if (field.hasChildNodes())
+              subValue = ((Text) field.getFirstChild()).getData();
+            else
+              subValue = "";
+          }
+        }
+        if (patternValue != null && subValue != null) {
+          Rule rule = new Rule();
+          try {
+            rule.pattern = Pattern.compile(patternValue);
+          } catch (PatternSyntaxException e) {
+            if (LOG.isErrorEnabled()) {
+              LOG.error("skipped rule: " + patternValue + " -> " + subValue
+                  + " : invalid regular expression pattern: " + e);
+            }
+            continue;
+          }
+          rule.substitution = subValue;
+          rules.add(rule);
+        }
+      }
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("error parsing conf file: " + e);
+      }
+      return EMPTY_RULES;
+    }
+    // hand out the shared sentinel so callers can test with ==
+    if (rules.size() == 0)
+      return EMPTY_RULES;
+    return rules;
+  }
+
+  /**
+   * Spits out patterns and substitutions that are in the configuration file.
+   *
+   * <p>
+   * Usage: <tt>RegexURLNormalizer [url [scope]]</tt>. The default rules are
+   * always printed. If a scope is given its rules are loaded and printed too,
+   * and if a URL is given it is normalized with the rules of the chosen scope
+   * (the default scope if none is given).
+   * </p>
+   */
+  public static void main(String args[]) throws PatternSyntaxException,
+      IOException {
+    RegexURLNormalizer normalizer = new RegexURLNormalizer();
+    normalizer.setConf(NutchConfiguration.create());
+    HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
+    System.out.println("* Rules for 'DEFAULT' scope:");
+    for (Rule r : normalizer.defaultRules) {
+      System.out.print("  " + r.pattern.pattern() + " -> ");
+      System.out.println(r.substitution);
+    }
+    // load the scope: normalizing any URL makes the normalizer read and cache
+    // the rules of that scope
+    if (args.length > 1) {
+      normalizer.normalize("http://test.com", args[1]);
+    }
+    // The default rules are kept outside of this map, so a single entry is
+    // already a scope worth printing.
+    if (!scopedRules.isEmpty()) {
+      for (String scope : scopedRules.keySet()) {
+        if (URLNormalizers.SCOPE_DEFAULT.equals(scope))
+          continue;
+        System.out.println("* Rules for '" + scope + "' scope:");
+        for (Rule r : scopedRules.get(scope)) {
+          System.out.print("  " + r.pattern.pattern() + " -> ");
+          System.out.println(r.substitution);
+        }
+      }
+    }
+    if (args.length > 0) {
+      System.out.println("\n---------- Normalizer test -----------");
+      String scope = URLNormalizers.SCOPE_DEFAULT;
+      if (args.length > 1)
+        scope = args[1];
+      System.out.println("Scope: " + scope);
+      System.out.println("Input url:  '" + args[0] + "'");
+      System.out.println("Output url: '" + normalizer.normalize(args[0], scope)
+          + "'");
+    }
+    System.exit(0);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
 
b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
new file mode 100644
index 0000000..04562c3
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer with configurable rules based on regular expressions
+ * ({@link java.util.regex.Pattern}).
+ */
+package org.apache.nutch.net.urlnormalizer.regex;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
new file mode 100644
index 0000000..cbf6c64
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.regex;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.*;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+/** Unit tests for RegexUrlNormalizer. */
+public class TestRegexURLNormalizer {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestRegexURLNormalizer.class);
+
+  private RegexURLNormalizer normalizer;
+  private Configuration conf;
+  private Map<String, NormalizedURL[]> testData = new HashMap<String, 
NormalizedURL[]>();
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation.
+
+  /**
+   * Sets up the normalizer and loads the sample data: for every
+   * <tt>regex-normalize-&lt;scope&gt;.xml</tt> file found in the sample
+   * directory the rules are registered under that scope and the matching
+   * <tt>.test</tt> file is read into {@link #testData}. A sample that can't be
+   * loaded is logged and skipped.
+   *
+   * @throws IOException
+   *           if the sample directory can't be listed
+   */
+  public TestRegexURLNormalizer() throws IOException {
+    normalizer = new RegexURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+    File[] configs = new File(sampleDir).listFiles(new FileFilter() {
+      public boolean accept(File f) {
+        return f.getName().endsWith(".xml")
+            && f.getName().startsWith("regex-normalize-");
+      }
+    });
+    if (configs == null) {
+      // listFiles returns null if the path is not a directory or can't be read
+      throw new IOException("Can't list sample directory '" + sampleDir + "'");
+    }
+    for (int i = 0; i < configs.length; i++) {
+      FileReader reader = null;
+      try {
+        reader = new FileReader(configs[i]);
+        String cname = configs[i].getName();
+        // strip the "regex-normalize-" prefix (16 chars) and the extension
+        cname = cname.substring(16, cname.indexOf(".xml"));
+        normalizer.setConfiguration(reader, cname);
+        NormalizedURL[] urls = readTestFile(cname);
+        testData.put(cname, urls);
+      } catch (Exception e) {
+        LOG.warn("Could not load config from '" + configs[i] + "': "
+            + e.toString());
+      } finally {
+        if (reader != null) {
+          try {
+            reader.close();
+          } catch (IOException e) {
+            LOG.debug("Could not close '" + configs[i] + "': " + e);
+          }
+        }
+      }
+    }
+  }
+
+  /** Runs the sample URLs of the default scope through the normalizer. */
+  @Test
+  public void testNormalizerDefault() throws Exception {
+    final String scope = URLNormalizers.SCOPE_DEFAULT;
+    final NormalizedURL[] samples = testData.get(scope);
+    normalizeTest(samples, scope);
+  }
+
+  /** Runs the sample URLs of every loaded scope through the normalizer. */
+  @Test
+  public void testNormalizerScope() throws Exception {
+    for (Map.Entry<String, NormalizedURL[]> sample : testData.entrySet()) {
+      normalizeTest(sample.getValue(), sample.getKey());
+    }
+  }
+
+  private void normalizeTest(NormalizedURL[] urls, String scope)
+      throws Exception {
+    for (int i = 0; i < urls.length; i++) {
+      String url = urls[i].url;
+      String normalized = normalizer.normalize(urls[i].url, scope);
+      String expected = urls[i].expectedURL;
+      LOG.info("scope: " + scope + " url: " + url + " | normalized: "
+          + normalized + " | expected: " + expected);
+      Assert.assertEquals(urls[i].expectedURL, normalized);
+    }
+  }
+
+  private void bench(int loops, String scope) {
+    long start = System.currentTimeMillis();
+    try {
+      NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope);
+      if (expected == null)
+        return;
+      for (int i = 0; i < loops; i++) {
+        normalizeTest(expected, scope);
+      }
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
+  }
+
+  /**
+   * One sample line: the URL to normalize followed by the URL expected after
+   * normalization, separated by whitespace.
+   */
+  private static class NormalizedURL {
+    String url;
+    String expectedURL;
+
+    public NormalizedURL(String line) {
+      final String[] columns = line.split("\\s+");
+      this.url = columns[0];
+      this.expectedURL = columns[1];
+    }
+  }
+
+  /**
+   * Reads the sample URLs of a scope from
+   * <tt>regex-normalize-&lt;scope&gt;.test</tt> in the sample directory. Blank
+   * lines and lines starting with '#' or a space are skipped.
+   *
+   * @param scope
+   *          scope name used to build the file name
+   * @return the samples in file order
+   * @throws IOException
+   *           if the file can't be opened or read
+   */
+  private NormalizedURL[] readTestFile(String scope) throws IOException {
+    File f = new File(sampleDir, "regex-normalize-" + scope + ".test");
+    List<NormalizedURL> list = new ArrayList<NormalizedURL>();
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(f), "UTF-8"));
+    try {
+      String line;
+      while ((line = in.readLine()) != null) {
+        if (line.trim().length() == 0 || line.startsWith("#")
+            || line.startsWith(" "))
+          continue;
+        list.add(new NormalizedURL(line));
+      }
+    } finally {
+      // closing the outer reader also closes the underlying file stream
+      in.close();
+    }
+    return list.toArray(new NormalizedURL[list.size()]);
+  }
+
+  /**
+   * Command line entry point:
+   * <tt>TestRegexURLNormalizer [-bench &lt;iter&gt;] &lt;scope&gt;</tt>. Runs
+   * the normalization test for the scope once, or <tt>iter</tt> times when
+   * benchmarking. Falls back to the default scope if there is no test data for
+   * the requested one.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length == 0) {
+      System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>");
+      System.exit(-1);
+    }
+    boolean bench = false;
+    int iter = -1;
+    String scope = null;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-bench")) {
+        bench = true;
+        if (i + 1 >= args.length) {
+          // "-bench" was the last argument: there is no count to parse
+          System.err.println("Missing number of iterations after -bench.");
+          System.exit(-1);
+        }
+        iter = Integer.parseInt(args[++i]);
+      } else
+        scope = args[i];
+    }
+    if (scope == null) {
+      System.err.println("Missing required scope name.");
+      System.exit(-1);
+    }
+    if (bench && iter < 0) {
+      System.err.println("Invalid number of iterations: " + iter);
+      System.exit(-1);
+    }
+    TestRegexURLNormalizer test = new TestRegexURLNormalizer();
+    NormalizedURL[] urls = test.testData.get(scope);
+    if (urls == null) {
+      LOG.warn("Missing test data for scope '" + scope
+          + "', using default scope.");
+      scope = URLNormalizers.SCOPE_DEFAULT;
+      urls = test.testData.get(scope);
+    }
+    if (bench) {
+      test.bench(iter, scope);
+    } else {
+      if (urls == null) {
+        // not even the default scope has samples; avoid a bare NPE
+        System.err.println("No test data for scope '" + scope + "'.");
+        System.exit(-1);
+      }
+      test.normalizeTest(urls, scope);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/build.xml 
b/nutch-plugins/urlnormalizer-slash/build.xml
new file mode 100644
index 0000000..29b2262
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-slash" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/data/slashes.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/data/slashes.txt 
b/nutch-plugins/urlnormalizer-slash/data/slashes.txt
new file mode 100644
index 0000000..d3bd70a
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/data/slashes.txt
@@ -0,0 +1,7 @@
+# Both domains have duplicate URL's, some with slashes and some without
+
+# We prefer this domain with slashes
+www.example.org +
+
+# ..but this domain without
+www.example.net -
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/ivy.xml 
b/nutch-plugins/urlnormalizer-slash/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/plugin.xml 
b/nutch-plugins/urlnormalizer-slash/plugin.xml
new file mode 100644
index 0000000..db820ed
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-slash"
+   name="Slash URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-slash.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.slash"
+              name="Nutch Slash URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="SlashURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.slash.SlashURLNormalizer">
+        <parameter name="file" value="slashes.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/pom.xml 
b/nutch-plugins/urlnormalizer-slash/pom.xml
new file mode 100644
index 0000000..0ac618f
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-slash</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-slash</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
new file mode 100644
index 0000000..4554cf0
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.slash;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * @author [email protected]
+ */
+public class SlashURLNormalizer implements URLNormalizer {
+
+  // Configuration handed to setConf(Configuration)
+  private Configuration conf;
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(SlashURLNormalizer.class);
+
+  // Characters and delimiter used when a URL is taken apart and re-assembled
+  private static final char QUESTION_MARK = '?';
+  private static final char SLASH = '/';
+  private static final char DOT = '.';
+  private static final String PROTOCOL_DELIMITER = "://";
+
+  // Value of the "file" attribute read from the plugin descriptor in setConf
+  private static String attributeFile = null;
+  // Rules file path supplied through the constructor; null when not given
+  private String slashesFile = null;
+
+  // We record a map of hosts and boolean, the boolean denotes whether the
+  // host should have slashes after URL paths. True means slash, false means
+  // remove the slash
+  private static final Map<String,Boolean> slashesMap =
+      new HashMap<String,Boolean>();
+
+  /** Creates a normalizer without an explicit rules file. */
+  public SlashURLNormalizer() {}
+
+  /**
+   * Creates a normalizer that remembers the given rules file path.
+   *
+   * @param slashesFile path stored in the {@code slashesFile} field
+   */
+  public SlashURLNormalizer(String slashesFile) {
+    this.slashesFile = slashesFile;
+  }
+
+  /**
+   * Parses host rules from the given reader into the static rules map.
+   *
+   * Each non-blank line that does not start with {@code #} is expected to
+   * hold a host, a space or tab, and a rule. A rule of {@code +} stores
+   * {@code true} for the host; any other rule stores {@code false}. Lines
+   * without a delimiter are logged and skipped. Nothing is read when the map
+   * already contains entries.
+   *
+   * @param configReader source of the rule lines; not closed by this method
+   * @throws IOException if reading a line fails
+   */
+  private synchronized void readConfiguration(Reader configReader)
+      throws IOException {
+    if (slashesMap.size() > 0) {
+      return;
+    }
+
+    BufferedReader reader = new BufferedReader(configReader);
+    String line;
+
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isBlank(line)) {
+        continue;
+      }
+
+      // String.trim() returns a new string; keep the result so that leading
+      // whitespace neither hides a comment marker nor yields an empty host.
+      line = line.trim();
+      if (line.startsWith("#")) {
+        continue;
+      }
+
+      // The host ends at the first space or tab, whichever comes first.
+      int delimiterIndex = -1;
+      for (int i = 0; i < line.length(); i++) {
+        char c = line.charAt(i);
+        if (c == ' ' || c == '\t') {
+          delimiterIndex = i;
+          break;
+        }
+      }
+
+      // Without a delimiter substring(0, -1) would throw; skip the line.
+      if (delimiterIndex == -1) {
+        LOG.warn("Skipping malformed slash rule (expected \"<host> <rule>\"): "
+          + line);
+        continue;
+      }
+
+      String host = line.substring(0, delimiterIndex);
+      String rule = line.substring(delimiterIndex + 1).trim();
+
+      slashesMap.put(host, rule.equals("+"));
+    }
+  }
+
+  /**
+   * Returns the configuration last stored in this normalizer.
+   *
+   * @return the {@code conf} field, which is null until it has been assigned
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Stores the configuration and loads the slash rules.
+   *
+   * Rules given inline through {@code urlnormalizer.slashes.rules} take
+   * precedence. Otherwise a file is used, chosen in this order: the path
+   * passed to the constructor, the {@code file} attribute of this plugin's
+   * extension, the {@code urlnormalizer.slashes.file} property. The file is
+   * looked up as a configuration resource first and opened from the file
+   * system if that lookup returns null. I/O errors are logged, not thrown.
+   *
+   * @param conf configuration to read the properties and plugin registry from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // find this plugin's extension to read its optional "file" attribute
+    String pluginName = "urlnormalizer-slash";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      URLNormalizer.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // constructor file and attribute "file" take precedence if defined
+    String file = conf.get("urlnormalizer.slashes.file");
+    String stringRules = conf.get("urlnormalizer.slashes.rules");
+    if (slashesFile != null) {
+      file = slashesFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // With neither inline rules nor a file there is nothing to load; opening
+    // a FileReader on a null path would throw a NullPointerException.
+    if (stringRules == null && file == null) {
+      LOG.warn("No slash rules configured for plugin " + pluginName
+        + "; URLs will be left unchanged");
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+    finally {
+      // The reader was opened here, so it is released here.
+      if (reader != null) {
+        try {
+          reader.close();
+        }
+        catch (IOException e) {
+          LOG.warn("Failed to close slash rules reader: " + e);
+        }
+      }
+    }
+  }
+  
+  public String normalize(String url, String scope) throws 
MalformedURLException {
+    return normalize(url, null, scope);
+  }
+
+  /**
+   * Adds or removes the trailing slash of the URL path according to the rule
+   * stored for the URL's host.
+   *
+   * URLs whose host has no rule, and URLs whose path is at most one character
+   * long, are returned unchanged. With a {@code true} rule a slash is
+   * appended unless the path already ends in one or appears to end in a file
+   * suffix. With a {@code false} rule a trailing slash is removed. A
+   * rewritten URL keeps protocol, authority (user-info, host and port) and
+   * query string; the fragment is not appended.
+   *
+   * @param url the URL to normalize
+   * @param crawlDatum not used by this method
+   * @param scope not used by this method
+   * @return the rewritten URL, or {@code url} itself when nothing applies
+   * @throws MalformedURLException if {@code url} cannot be parsed
+   */
+  public String normalize(String url, CrawlDatum crawlDatum, String scope)
+      throws MalformedURLException {
+    URL u = new URL(url);
+
+    // Rules are keyed by host name only; the map never stores null values,
+    // so a null lookup means there is no rule for this host.
+    Boolean rule = slashesMap.get(u.getHost());
+    if (rule == null) {
+      return url;
+    }
+
+    // Don't do anything to root URL's
+    // / is always set by basic normalizer
+    String path = u.getPath();
+    if (path.length() <= 1) {
+      return url;
+    }
+
+    int lastIndexOfSlash = path.lastIndexOf(SLASH);
+    boolean trailingSlash = (lastIndexOfSlash == path.length() - 1);
+
+    if (rule && !trailingSlash) {
+      // Only add a trailing slash if this path doesn't appear to have an
+      // extension/suffix such as .html
+      if (!looksLikeFile(path, lastIndexOfSlash)) {
+        return rebuild(u, path + SLASH);
+      }
+    }
+    else if (!rule && trailingSlash) {
+      return rebuild(u, path.substring(0, lastIndexOfSlash));
+    }
+
+    return url;
+  }
+
+  /**
+   * Tells whether the last path segment ends in a short dot-suffix.
+   * A dot in an earlier segment (before the last slash) is not a suffix.
+   */
+  private static boolean looksLikeFile(String path, int lastIndexOfSlash) {
+    int lastIndexOfDot = path.lastIndexOf(DOT);
+    // NOTE(review): paths shorter than 6 characters are never treated as
+    // files (e.g. "/a.js"); kept from the original condition - confirm intent.
+    return path.length() >= 6
+        && lastIndexOfDot > lastIndexOfSlash
+        && lastIndexOfDot >= path.length() - 6;
+  }
+
+  /**
+   * Re-assembles a URL from the parsed URL and a replacement path. The
+   * authority is used instead of the bare host so that port and user-info
+   * are not lost.
+   */
+  private static String rebuild(URL u, String path) {
+    StringBuilder buffer = new StringBuilder(u.getProtocol());
+    buffer.append(PROTOCOL_DELIMITER);
+    buffer.append(u.getAuthority());
+    buffer.append(path);
+    String queryString = u.getQuery();
+    if (queryString != null) {
+      buffer.append(QUESTION_MARK);
+      buffer.append(queryString);
+    }
+    return buffer.toString();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
 
b/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
new file mode 100644
index 0000000..c3585e4
--- /dev/null
+++ 
b/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.slash;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Checks SlashURLNormalizer against the rules in the slashes.txt sample file
+ * located in the directory named by the test.data system property.
+ */
+public class TestSlashURLNormalizer extends TestCase {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public void testSlashURLNormalizer() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    String slashesFile = SAMPLES + SEPARATOR + "slashes.txt";
+    SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile);
+    normalizer.setConf(conf);
+
+    // No change
+    assertEquals("http://example.org/",
+        normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/",
+        normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+    // Don't touch base URL's
+    assertEquals("http://example.org",
+        normalizer.normalize("http://example.org", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net",
+        normalizer.normalize("http://example.net", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.org/",
+        normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/",
+        normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+    // Change
+    assertEquals("http://www.example.org/page/",
+        normalizer.normalize("http://www.example.org/page",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://www.example.net/path/to/something",
+        normalizer.normalize("http://www.example.net/path/to/something/",
+            URLNormalizers.SCOPE_DEFAULT));
+
+    // No change
+    assertEquals("http://example.org/buh/",
+        normalizer.normalize("http://example.org/buh/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/blaat",
+        normalizer.normalize("http://example.net/blaat", URLNormalizers.SCOPE_DEFAULT));
+
+    // No change
+    assertEquals("http://example.nl/buh/",
+        normalizer.normalize("http://example.nl/buh/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.de/blaat",
+        normalizer.normalize("http://example.de/blaat", URLNormalizers.SCOPE_DEFAULT));
+
+    // Change
+    assertEquals("http://www.example.org/page/?a=b&c=d",
+        normalizer.normalize("http://www.example.org/page?a=b&c=d",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://www.example.net/path/to/something?a=b&c=d",
+        normalizer.normalize("http://www.example.net/path/to/something/?a=b&c=d",
+            URLNormalizers.SCOPE_DEFAULT));
+
+    // No change
+    assertEquals("http://www.example.org/noise.mp3",
+        normalizer.normalize("http://www.example.org/noise.mp3",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://www.example.org/page.html",
+        normalizer.normalize("http://www.example.org/page.html",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://www.example.org/page.shtml",
+        normalizer.normalize("http://www.example.org/page.shtml",
+            URLNormalizers.SCOPE_DEFAULT));
+
+    // Change
+    assertEquals("http://www.example.org/this.is.not.an_extension/",
+        normalizer.normalize("http://www.example.org/this.is.not.an_extension",
+            URLNormalizers.SCOPE_DEFAULT));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index b92375c..8cffbc2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,7 +10,7 @@
     <packaging>pom</packaging>
 
     <properties>
-
+        <junit.version>4.12</junit.version>
     </properties>
     <modules>
         <module>nutch-core</module>
@@ -26,6 +26,26 @@
                     <target>1.7</target>
                 </configuration>
             </plugin>
+            <plugin>
+                <artifactId>maven-clean-plugin</artifactId>
+                <version>3.0.0</version>
+                <configuration>
+                    <filesets>
+                        <fileset>
+                            <directory>runtime/</directory>
+                            <followSymlinks>false</followSymlinks>
+                        </fileset>
+                    </filesets>
+                </configuration>
+            </plugin>
         </plugins>
     </build>
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/build-plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
deleted file mode 100755
index c759d5f..0000000
--- a/src/plugin/build-plugin.xml
+++ /dev/null
@@ -1,255 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- Imported by plugin build.xml files to define default targets. -->
-<project xmlns:ivy="antlib:org.apache.ivy.ant">
-
-  <property name="name" value="${ant.project.name}"/>
-  <property name="root" value="${basedir}"/>
-
-  <!-- load plugin-specific properties first -->
-  <property file="${user.home}/${name}.build.properties" />
-  <property file="${root}/build.properties" />
-
-  <property name="nutch.root" location="${root}/../../../"/>
-
-  <property name="src.dir" location="${root}/src/java"/>
-  <property name="src.test" location="${root}/src/test"/>
-
-  <available file="${src.test}" type="dir" property="test.available"/>
-
-  <property name="conf.dir" location="${nutch.root}/conf"/>
-
-  <property name="build.dir" location="${nutch.root}/build/${name}"/>
-  <property name="build.classes" location="${build.dir}/classes"/>
-  <property name="build.test" location="${build.dir}/test"/>
-  <property name="build.test.lib" location="${build.test}/lib"/>
-
-  <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/>
-
-  <!-- load nutch defaults last so that they can be overridden above -->
-  <property file="${nutch.root}/default.properties" />
-
-  <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" />
-
-  <path id="plugin.deps"/>
-
-  <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/>
-
-  <!-- the normal classpath -->
-  <path id="classpath">
-    <pathelement location="${build.classes}"/>
-    <fileset refid="lib.jars"/>
-    <pathelement location="${nutch.root}/build/classes"/>
-    <fileset dir="${nutch.root}/build/lib">
-      <include name="*.jar" />
-    </fileset>
-    <path refid="plugin.deps"/>
-    <fileset dir="${deploy.dir}">
-      <include name="*.jar" />
-    </fileset>
-  </path>
-
-  <!-- the unit test classpath -->
-  <path id="test.classpath">
-    <pathelement location="${build.test}" />
-    <pathelement location="${nutch.root}/build/test/classes"/>
-    <pathelement location="${nutch.root}/src/test"/>
-    <pathelement location="${conf.dir}"/>
-    <pathelement location="${nutch.root}/build"/>
-    <!-- test dependencies specific to current plugin -->
-    <fileset dir="${build.test.lib}">
-      <include name="*.jar" />
-    </fileset>
-    <!-- global test dependencies -->
-    <fileset dir="${nutch.root}/build/test/lib">
-      <include name="*.jar" />
-    </fileset>
-    <path refid="classpath"/>
-  </path>
-
-  <!-- ====================================================== -->
-  <!-- Stuff needed by all targets                            -->
-  <!-- ====================================================== -->
-  <target name="init">
-    <mkdir dir="${build.dir}"/>
-    <mkdir dir="${build.classes}"/>
-    <mkdir dir="${build.test}"/>
-    <mkdir dir="${build.test.lib}"/>
-    <mkdir dir="${deploy.dir}"/>
-
-    <antcall target="init-plugin"/>
-  </target>
-
-  <!-- to be overridden by sub-projects --> 
-  <target name="init-plugin"/>
-
-  <!--
-   ! Used to build plugin compilation dependencies
-   ! (to be overridden by plugins)
-   !-->
-  <target name="deps-jar"/>
-
-  <!--
-   ! Used to deploy plugin runtime dependencies
-   ! (to be overridden by plugins)
-   !-->
-  <target name="deps-test"/>
-
-  <!--
-   ! Used to compile test for plugin runtime dependencies
-   ! (to be overridden by plugins)
-   !-->
-  <target name="deps-test-compile"/>
-
-  <!-- ====================================================== -->
-  <!-- Compile the Java files                                 -->
-  <!-- ====================================================== -->
-  <target name="compile" depends="init,deps-jar, resolve-default">
-    <echo message="Compiling plugin: ${name}"/>
-    <javac 
-     encoding="${build.encoding}" 
-     srcdir="${src.dir}"
-     includes="**/*.java"
-     destdir="${build.classes}"
-     debug="${javac.debug}"
-     optimize="${javac.optimize}"
-     target="${javac.version}"
-     source="${javac.version}"
-     deprecation="${javac.deprecation}">
-      <classpath refid="classpath"/>
-    </javac>
-  </target>
-
-  <target name="compile-core">
-    <ant target="compile-core" inheritall="false" dir="${nutch.root}"/>
-    <ant target="compile"/>
-  </target>
-  
-  <!-- ================================================================== -->
-  <!-- Make plugin .jar                                                   -->
-  <!-- ================================================================== -->
-  <!--                                                                    -->
-  <!-- ================================================================== -->
-  <target name="jar" depends="compile">
-    <jar
-      jarfile="${build.dir}/${name}.jar"
-      basedir="${build.classes}"
-    />
-  </target>
-
-  <target name="jar-core" depends="compile-core">
-    <jar
-        jarfile="${build.dir}/${name}.jar"
-        basedir="${build.classes}"
-        />
-  </target>
-
-  <!-- ================================================================== -->
-  <!-- Deploy plugin to ${deploy.dir}                                     -->
-  <!-- ================================================================== -->
-  <!--                                                                    -->
-  <!-- ================================================================== -->
-  <target name="deploy" depends="jar, deps-test">
-    <mkdir dir="${deploy.dir}"/>
-    <copy file="plugin.xml" todir="${deploy.dir}" 
-          preservelastmodified="true"/>
-    <available property="lib-available"
-                 file="${build.dir}/${name}.jar"/>
-    <antcall target="copy-generated-lib"/>
-    <copy todir="${deploy.dir}" flatten="true">
-      <fileset refid="lib.jars"/>
-    </copy>
-  </target>
-       
-  <target name="copy-generated-lib" if="lib-available">
-    <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" 
failonerror="false"/>
-  </target>
-
-  <!-- ================================================================== -->
-  <!-- Compile test code                                                  --> 
-  <!-- ================================================================== -->
-  <target name="compile-test" depends="compile, deps-test-compile" 
if="test.available">
-    <javac 
-     encoding="${build.encoding}" 
-     srcdir="${src.test}"
-     includes="**/*.java"
-     destdir="${build.test}"
-     debug="${javac.debug}"
-     optimize="${javac.optimize}"
-     target="${javac.version}"
-     source="${javac.version}"
-     deprecation="${javac.deprecation}">
-      <classpath refid="test.classpath"/>
-    </javac>    
-  </target>
-
-  <!-- ================================================================== -->
-  <!-- Run unit tests                                                     --> 
-  <!-- ================================================================== -->
-  <target name="test" depends="compile-test, deploy" if="test.available">
-    <echo message="Testing plugin: ${name}"/>
-
-    <junit printsummary="yes" haltonfailure="no" fork="yes"
-      errorProperty="tests.failed" failureProperty="tests.failed">
-      <sysproperty key="test.data" value="${build.test}/data"/>
-      <sysproperty key="test.input" value="${root}/data"/>
-      <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" 
value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> 
-      <classpath refid="test.classpath"/>
-      <formatter type="${test.junit.output.format}" />
-      <batchtest todir="${build.test}" unless="testcase">
-        <fileset dir="${src.test}"
-                 includes="**/Test*.java" excludes="**/${test.exclude}.java" />
-      </batchtest>
-      <batchtest todir="${build.test}" if="testcase">
-        <fileset dir="${src.test}" includes="**/${testcase}.java"/>
-      </batchtest>
-    </junit>
-
-    <fail if="tests.failed">Tests failed!</fail>
-
-  </target>   
-
-  <!-- target: resolve  ================================================= -->
-  <target name="resolve-default" depends="clean-lib" description="resolve and 
retrieve dependencies with ivy">
-    <ivy:resolve file="ivy.xml" conf="default" log="download-only"/>
-    <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" 
symlink="false" log="quiet"/>
-  </target>
-
-  <target name="resolve-test" depends="clean-lib" description="resolve and 
retrieve dependencies with ivy">
-    <ivy:resolve file="ivy.xml" conf="test" log="download-only"/>
-    <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" 
symlink="false" log="quiet"/>
-  </target>
-
-  <!-- ================================================================== -->
-  <!-- Clean.  Delete the build files, and their directories              -->
-  <!-- ================================================================== -->
-  <!-- target: clean  =================================================== -->
-  <target name="clean" depends="clean-build, clean-lib" description="--> clean 
the project" />
-
-  <!-- target: clean-lib  =============================================== -->
-  <target name="clean-lib" description="--> clean the project libraries 
directory (dependencies)">
-    <delete includeemptydirs="true" dir="${build.lib.dir}"/>
-  </target>
-
-  <!-- target: clean-build  ============================================= -->
-  <target name="clean-build" description="--> clean the project built files">
-    <delete includeemptydirs="true" dir="${build.dir}"/>
-    <delete includeemptydirs="true" dir="${deploy.dir}"/>
-  </target>
-
-</project>

Reply via email to