This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 1fc98bf NUTCH-2690 Configurable and fast URL filter - performs fast
exact matches on host/domain names - before applying regexes to the path
component of a URL
new 8cc41d8 Merge pull request #433 from commoncrawl/cc-fast-url-filter
1fc98bf is described below
commit 1fc98bf061aedb98be4453865201ce6d9f1dede6
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Nov 28 12:56:46 2016 +0100
NUTCH-2690 Configurable and fast URL filter
- performs fast exact matches on host/domain names
- before applying regexes to the path component of a URL
---
build.xml | 4 +
conf/fast-urlfilter.txt.template | 49 ++++
conf/nutch-default.xml | 7 +
default.properties | 1 +
src/plugin/build.xml | 3 +
.../urlfilter/api/RegexURLFilterBaseTest.java | 22 +-
.../urlfilter-automaton/sample/Benchmarks.urls | 22 +-
src/plugin/urlfilter-fast/README.md | 59 ++++
src/plugin/urlfilter-fast/build.xml | 51 ++++
src/plugin/urlfilter-fast/ivy.xml | 41 +++
src/plugin/urlfilter-fast/plugin.xml | 41 +++
.../sample/Benchmarks.urls | 22 +-
.../sample/fast-urlfilter-benchmark.txt | 25 ++
.../urlfilter-fast/sample/fast-urlfilter-test.txt | 19 ++
src/plugin/urlfilter-fast/sample/test.urls | 21 ++
.../apache/nutch/urlfilter/fast/FastURLFilter.java | 315 +++++++++++++++++++++
.../apache/nutch/urlfilter/fast/package-info.java | 24 ++
.../nutch/urlfilter/fast/TestFastURLFilter.java | 55 ++++
src/plugin/urlfilter-regex/sample/Benchmarks.urls | 22 +-
19 files changed, 798 insertions(+), 5 deletions(-)
diff --git a/build.xml b/build.xml
index 04a36a6..5d883e3 100644
--- a/build.xml
+++ b/build.xml
@@ -231,6 +231,7 @@
<packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+ <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -728,6 +729,7 @@
<packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+ <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -1150,6 +1152,8 @@
<source path="${plugins.dir}/urlfilter-domain/src/test/" />
<source path="${plugins.dir}/urlfilter-domainblacklist/src/java/" />
<source path="${plugins.dir}/urlfilter-domainblacklist/src/test/" />
+ <source path="${plugins.dir}/urlfilter-fast/src/java/"/>
+ <source path="${plugins.dir}/urlfilter-fast/src/test/"/>
<source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
<source path="${plugins.dir}/urlfilter-prefix/src/java/" />
<source path="${plugins.dir}/urlfilter-prefix/src/test/" />
diff --git a/conf/fast-urlfilter.txt.template b/conf/fast-urlfilter.txt.template
new file mode 100644
index 0000000..99bb5c9
--- /dev/null
+++ b/conf/fast-urlfilter.txt.template
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Rule file for the plugin urlfilter-fast
+
+# Used to filter a large number of domain and host-specific regular
+# expressions
+
+#
+# `Domain` rules are applied to all hosts and subdomains of a domain, e.g.
+#
+# Domain example.org
+# DenyPath (?i)%7c # matches against just the path part of the URL
+# DenyPathQuery ^/resource\?x=1 # matches against path + query
+#
+#
+# To match against a single hostname:
+#
+# Host www.example.com
+# DenyPath (?i)%7c
+#
+#
+# Global rules are defined using the domain name `.`:
+#
+# Domain .
+# DenyPath (/[^/]+)/[^/]+\1/[^/]+\1/
+# # skips URLs with slash-delimited segment that repeats 3+ times, to break loops
+#
+#
+# Comments start with the `#` character and reach until the end of the line.
+#
+#
+# For more details, see
+# - src/plugin/urlfilter-fast/README.md
+# - src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+#
+
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 951494e..b919e43 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1647,6 +1647,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value
than 30, when using this
</property>
<property>
+ <name>urlfilter.fast.file</name>
+ <value>fast-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-fast (FastURLFilter) plugin.</description>
+</property>
+
+<property>
<name>urlfilter.order</name>
<value></value>
<description>The order by which url filters are applied.
diff --git a/default.properties b/default.properties
index a3bc0cf..899f33d 100644
--- a/default.properties
+++ b/default.properties
@@ -104,6 +104,7 @@ plugins.urlfilter=\
org.apache.nutch.urlfilter.automaton*:\
org.apache.nutch.urlfilter.domain*:\
org.apache.nutch.urlfilter.domainblacklist*:\
+ org.apache.nutch.urlfilter.fast*:\
org.apache.nutch.urlfilter.ignoreexempt*:\
org.apache.nutch.urlfilter.prefix*:\
org.apache.nutch.urlfilter.regex*:\
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 2592357..51c3fe7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -89,6 +89,7 @@
<ant dir="urlfilter-automaton" target="deploy"/>
<ant dir="urlfilter-domain" target="deploy" />
<ant dir="urlfilter-domainblacklist" target="deploy" />
+ <ant dir="urlfilter-fast" target="deploy"/>
<ant dir="urlfilter-prefix" target="deploy"/>
<ant dir="urlfilter-regex" target="deploy"/>
<ant dir="urlfilter-suffix" target="deploy"/>
@@ -146,6 +147,7 @@
<ant dir="urlfilter-automaton" target="test"/>
<ant dir="urlfilter-domain" target="test"/>
<ant dir="urlfilter-domainblacklist" target="test"/>
+ <ant dir="urlfilter-fast" target="test"/>
<!--ant dir="urlfilter-ignoreexempt" target="test"/-->
<ant dir="urlfilter-prefix" target="test"/>
<ant dir="urlfilter-regex" target="test"/>
@@ -234,6 +236,7 @@
<ant dir="urlfilter-automaton" target="clean"/>
<ant dir="urlfilter-domain" target="clean" />
<ant dir="urlfilter-domainblacklist" target="clean" />
+ <ant dir="urlfilter-fast" target="clean"/>
<ant dir="urlfilter-ignoreexempt" target="clean"/>
<ant dir="urlfilter-prefix" target="clean"/>
<ant dir="urlfilter-regex" target="clean"/>
diff --git
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index 730d3cb..c77c67e 100644
---
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -43,8 +43,8 @@ public abstract class RegexURLFilterBaseTest {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private final static String SEPARATOR = System.getProperty("file.separator");
- private final static String SAMPLES = System.getProperty("test.data", ".");
+ protected final static String SEPARATOR =
System.getProperty("file.separator");
+ protected final static String SAMPLES = System.getProperty("test.data", ".");
protected abstract URLFilter getURLFilter(Reader rules);
@@ -72,6 +72,24 @@ public abstract class RegexURLFilterBaseTest {
+ (System.currentTimeMillis() - start) + "ms");
}
+  /**
+   * Runs the benchmark with rules and URLs read from files in the sample
+   * directory ({@link #SAMPLES}). Any exception fails the test.
+   *
+   * @param loops
+   *          number of benchmark iterations, passed on unchanged
+   * @param rulesFile
+   *          name of the rules file, relative to the sample directory
+   * @param urlsFile
+   *          name of the URLs file, relative to the sample directory
+   */
+  protected void bench(int loops, String rulesFile, String urlsFile) {
+    // try-with-resources: both files are closed even if the benchmark fails
+    try (FileReader rules = new FileReader(SAMPLES + SEPARATOR + rulesFile);
+        FileReader urls = new FileReader(SAMPLES + SEPARATOR + urlsFile)) {
+      bench(loops, rules, urls);
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  /**
+   * Runs the filter test with rules and URLs read from files in the sample
+   * directory ({@link #SAMPLES}). Any exception fails the test.
+   *
+   * @param rulesFile
+   *          name of the rules file, relative to the sample directory
+   * @param urlsFile
+   *          name of the URLs file, relative to the sample directory
+   */
+  protected void test(String rulesFile, String urlsFile) {
+    // try-with-resources: both files are closed even if the test fails
+    try (FileReader rules = new FileReader(SAMPLES + SEPARATOR + rulesFile);
+        FileReader urls = new FileReader(SAMPLES + SEPARATOR + urlsFile)) {
+      test(rules, urls);
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
protected void test(String file) {
try {
test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr
-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73
+http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-fast/README.md
b/src/plugin/urlfilter-fast/README.md
new file mode 100644
index 0000000..46b293f
--- /dev/null
+++ b/src/plugin/urlfilter-fast/README.md
@@ -0,0 +1,59 @@
+
+Filters URLs based on a file of regular expressions using host/domains
+matching first. The default policy is to accept a URL if no matches
+are found.
+
+Rule Format:
+
+```
+Host www.example.org
+ DenyPath /path/to/be/excluded
+ DenyPath /some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+ DenyPath .*
+
+Domain example.org
+ DenyPathQuery /resource/.*?action=exclude
+```
+
+`Host` rules are evaluated before `Domain` rules. For `Host` rules the
+entire host name of a URL must match while the domain names in
+`Domain` rules are considered as matches if the domain is a suffix of
+the host name (consisting of complete host name parts). Longer
+domain suffixes are checked first; a single dot "`.`" as "domain name"
+can be used to specify global rules applied to every URL.
+
+E.g., for "www.example.com" the rules given above are looked up in the
+following order:
+
+1. check "www.example.com" whether host-based rules exist and whether one of them matches
+1. check "www.example.com" for domain-based rules
+1. check "example.com" for domain-based rules
+1. check "com" for domain-based rules
+1. check for global rules (domain name is ".")
+
+The first matching rule will reject the URL and no further rules are
+checked. If no rule matches, the URL is accepted. URLs without a host
+name (e.g., <code>file:/path/file.txt</code>) are checked for global
+rules only. URLs which fail to be parsed as
+[java.net.URL](https://docs.oracle.com/javase/8/docs/api/java/net/URL.html)
+are always rejected.
+
+For rules either the URL path (`DenyPath`) or path and query
+(`DenyPathQuery`) are checked whether the given [Java Regular
+expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
+is found (see
+[Matcher.find()](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#find--))
+in the URL path (and query).
+
+Rules are applied in the order of their definition. For better
+performance, regular expressions which are simpler/faster or match
+more URLs should be defined earlier.
+
+Comments in the rule file start with the `#` character and reach until
+the end of the line.
+
+The rules file is defined via the property `urlfilter.fast.file`,
+the default name is `fast-urlfilter.txt`.
diff --git a/src/plugin/urlfilter-fast/build.xml
b/src/plugin/urlfilter-fast/build.xml
new file mode 100644
index 0000000..c22ca6e
--- /dev/null
+++ b/src/plugin/urlfilter-fast/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-fast" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-regex-filter/*.jar" />
+ </fileset>
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+ </path>
+
+ <!-- Compile test classes for dependencies -->
+ <target name="deps-test-compile">
+ <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="**/*.txt, **/*.urls"/>
+ </copy>
+
+</project>
diff --git a/src/plugin/urlfilter-fast/ivy.xml
b/src/plugin/urlfilter-fast/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/urlfilter-fast/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/urlfilter-fast/plugin.xml
b/src/plugin/urlfilter-fast/plugin.xml
new file mode 100644
index 0000000..4e28cb3
--- /dev/null
+++ b/src/plugin/urlfilter-fast/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-fast"
+ name="Fast URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-fast.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-regex-filter"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.fast"
+ name="Nutch Fast URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="FastURLFilter"
+ class="org.apache.nutch.urlfilter.fast.FastURLFilter"/>
+ </extension>
+</plugin>
diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
similarity index 91%
copy from src/plugin/urlfilter-automaton/sample/Benchmarks.urls
copy to src/plugin/urlfilter-fast/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr
-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73
+http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
new file mode 100644
index 0000000..27a918b
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
@@ -0,0 +1,25 @@
+# port of urlfilter-regex benchmarks to urlfilter-fast
+# cf.
+# src/plugin/urlfilter-regex/sample/Benchmarks.rules
+# src/plugin/urlfilter-regex/sample/Benchmarks.urls
+
+# skip file:, ftp:, & mailto: urls
+# -^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+Domain .
+ DenyPath
(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+Domain .
+ DenyPathQuery [?*!@=]
+
+# skip .fr .org and .net domains
+Domain fr
+ DenyPath .*
+Domain org
+ DenyPath .*
+Domain net
+ DenyPath .*
+
+# accept every URL not matched by any rule
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
new file mode 100644
index 0000000..9f26529
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
@@ -0,0 +1,19 @@
+Host www.example.org
+ DenyPath ^/path/to/be/excluded
+ DenyPath ^/some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+ DenyPath .*
+
+Domain example.org
+ DenyPathQuery /resource/.*?action=exclude
+
+# exclude images from image server
+Host i.example.org
+ DenyPath (?i)\.jpe?g$
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+Domain .
+ DenyPath (/.+?)/.*?\1/.*?\1/
+
diff --git a/src/plugin/urlfilter-fast/sample/test.urls
b/src/plugin/urlfilter-fast/sample/test.urls
new file mode 100644
index 0000000..3aa4354
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/test.urls
@@ -0,0 +1,21 @@
+-https://www.example.org/path/to/be/excluded
+-https://www.example.org/path/to/be/excluded/continued
+-https://www.example.org/some/other/path/excluded
++https://www.example.org/
++https://www.example.org/%20white%20space%20in%20path%20escaped/
+-https://www1.example.com/
+-https://www2.example.com/
+-https://www.subnet.example.com/
++https://www.examplex.com/
++https://www.example.co.uk/
++https://www.example.com.za/
+-https://www.example.org/resource/put?action=exclude
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/
++http://www.nutch.org/abcd/foo1/bar1/zzz1/
+-https://i.example.org/394d46ef76ee5c1bbad1cb98b40dc463d322c94d/c=0-129-2047-1285/635969287686419433-WORLD-40943944.JPG?width=3200&height=1680&fit=crop
+-ftp://ftp.example.com/file1.txt
++ftp://ftp.example.org/file1.txt
++file:/path/file1.txt
++file:///path/file1.txt
+-file:/abcd/foo/bar/xyz/foo/bar/foo/
\ No newline at end of file
diff --git
a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
new file mode 100644
index 0000000..d53a2fd
--- /dev/null
+++
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimap;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Filters URLs based on a file of regular expressions using host/domains
+ * matching first. The default policy is to accept a URL if no matches are
+ * found.
+ *
+ * Rule Format:
+ *
+ * <pre>
+ * Host www.example.org
+ * DenyPath /path/to/be/excluded
+ * DenyPath /some/other/path/excluded
+ *
+ * # Deny everything from *.example.com and example.com
+ * Domain example.com
+ * DenyPath .*
+ *
+ * Domain example.org
+ * DenyPathQuery /resource/.*?action=exclude
+ * </pre>
+ *
+ * <code>Host</code> rules are evaluated before <code>Domain</code> rules. For
+ * <code>Host</code> rules the entire host name of a URL must match while the
+ * domain names in <code>Domain</code> rules are considered as matches if the
+ * domain is a suffix of the host name (consisting of complete host name
+ * parts). Longer domain suffixes are checked first, a single dot
+ * "<code>.</code>" as "domain name" can be used to specify
+ * global rules applied to every URL.
+ *
+ * E.g., for "www.example.com" the rules given above are looked up in the
+ * following order:
+ * <ol>
+ * <li>check "www.example.com" whether host-based rules exist and whether one
+ * of them matches</li>
+ * <li>check "www.example.com" for domain-based rules</li>
+ * <li>check "example.com" for domain-based rules</li>
+ * <li>check "com" for domain-based rules</li>
+ * <li>check for global rules ("<code>Domain .</code>")</li>
+ * </ol>
+ * The first matching rule will reject the URL and no further rules are
+ * checked. If no rule matches, the URL is accepted. URLs without a host name
+ * (e.g., <code>file:/path/file.txt</code>) are checked for global rules only. URLs
+ * which fail to be parsed as {@link java.net.URL} are always rejected.
+ *
+ * For rules either the URL path (<code>DenyPath</code>) or path and query
+ * (<code>DenyPathQuery</code>) are checked whether the given
+ * {@link java.util.regex Java Regular expression} is found (see
+ * {@link java.util.regex.Matcher#find()}) in the URL path (and query).
+ *
+ * Rules are applied in the order of their definition. For better performance,
+ * regular expressions which are simpler/faster or match more URLs should be
+ * defined earlier.
+ *
+ * Comments in the rule file start with the <code>#</code> character and reach
+ * until the end of the line.
+ *
+ * The rules file is defined via the property <code>urlfilter.fast.file</code>,
+ * the default name is <code>fast-urlfilter.txt</code>.
+ */
+public class FastURLFilter implements URLFilter {
+
+ protected static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ private Configuration conf;
+ public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+ private Multimap<String, Rule> hostRules = LinkedHashMultimap.create();
+ private Multimap<String, Rule> domainRules = LinkedHashMultimap.create();
+
+ private static final Pattern CATCH_ALL_RULE = Pattern
+ .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
+
+ /**
+ * Creates a filter without any rules. Rules are loaded when
+ * {@link #setConf(Configuration)} is called.
+ */
+ public FastURLFilter() {}
+
+ /**
+ * Creates a filter and immediately loads its rules from the given reader.
+ * Package-private; the reader is not closed by this constructor.
+ */
+ FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
+ reloadRules(rules);
+ }
+
+  /**
+   * Stores the configuration and (re)loads the filter rules from the file
+   * named by the property <code>urlfilter.fast.file</code>.
+   *
+   * @param conf
+   *          configuration used to look up and open the rules file
+   * @throws RuntimeException
+   *           if the rules cannot be loaded; the original exception is
+   *           attached as cause
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    try {
+      reloadRules();
+    } catch (Exception e) {
+      // e.getMessage() may be null (e.g., for a NullPointerException): log a
+      // fixed message together with the exception to keep type and stack trace
+      LOG.error("Failed to load rules for urlfilter-fast", e);
+      throw new RuntimeException(e.getMessage(), e);
+    }
+  }
+
+ /**
+ * Returns the configuration stored by the last call of
+ * {@link #setConf(Configuration)}, or <code>null</code> if none was set.
+ */
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+  /**
+   * Checks a URL against the host, domain and global rules.
+   *
+   * Rules are looked up in this order: host rules for the full host name,
+   * domain rules for the full host name, domain rules for every suffix of
+   * the host name following a dot (longer suffixes first), and finally the
+   * global rules registered for the domain name "<code>.</code>". The first
+   * matching rule rejects the URL.
+   *
+   * @param url
+   *          URL string to check
+   * @return the unchanged <code>url</code> if no rule matches, or
+   *         <code>null</code> if a rule matches or the string cannot be
+   *         parsed as {@link java.net.URL}
+   */
+  @Override
+  public String filter(String url) {
+
+    URL parsed;
+    try {
+      parsed = new URL(url);
+    } catch (Exception e) {
+      LOG.debug("Rejected {} because failed to parse as URL: {}", url,
+          e.getMessage());
+      return null;
+    }
+
+    String host = parsed.getHost();
+
+    // host-specific rules first, then domain rules for the full host name
+    if (deniedByAny(hostRules.get(host), parsed)
+        || deniedByAny(domainRules.get(host), parsed)) {
+      return null;
+    }
+
+    // domain rules for each suffix behind a dot, from longer to shorter:
+    // subdomains, domain, top-level domain
+    for (int dot = host.indexOf('.'); dot != -1; dot = host.indexOf('.',
+        dot + 1)) {
+      if (deniedByAny(domainRules.get(host.substring(dot + 1)), parsed)) {
+        return null;
+      }
+    }
+
+    // finally the "global" rules defined for `Domain .`
+    if (deniedByAny(domainRules.get("."), parsed)) {
+      return null;
+    }
+
+    // no reject rules found
+    return url;
+  }
+
+  /** Returns true as soon as one of the given rules matches the URL. */
+  private static boolean deniedByAny(Iterable<Rule> rules, URL url) {
+    for (Rule candidate : rules) {
+      if (candidate.match(url)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Reloads the rules from the configuration resource named by the property
+   * <code>urlfilter.fast.file</code>.
+   *
+   * @throws IOException
+   *           if the property is not set, the resource cannot be opened, or
+   *           reading the rules fails
+   */
+  public void reloadRules() throws IOException {
+    String fileRules = conf.get(URLFILTER_FAST_FILE);
+    if (StringUtils.isBlank(fileRules)) {
+      throw new IOException("Property " + URLFILTER_FAST_FILE
+          + " is not set: no rules file to load");
+    }
+    try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
+      if (reader == null) {
+        // no reader is returned if the resource is not found: fail with a
+        // clear message instead of a NullPointerException further down
+        throw new IOException("Rules file " + fileRules + " (property "
+            + URLFILTER_FAST_FILE + ") not found");
+      }
+      reloadRules(reader);
+    }
+  }
+
+  /**
+   * Replaces all host and domain rules by the rules read from the given
+   * reader.
+   *
+   * Every line is trimmed and everything from the first <code>#</code> on is
+   * dropped as comment (consequently, a regular expression cannot contain a
+   * <code>#</code>). A <code>Host</code> or <code>Domain</code> line opens a
+   * section; the following <code>DenyPath</code> and
+   * <code>DenyPathQuery</code> lines add rules to it. Only the first
+   * whitespace-delimited token after the keyword is used as name or regular
+   * expression. Malformed lines are logged and skipped, rule lines before the
+   * first section are ignored.
+   *
+   * @param rules
+   *          reader over the rules, not closed by this method
+   * @throws IOException
+   *           if reading from <code>rules</code> fails
+   */
+  private void reloadRules(Reader rules) throws IOException {
+    domainRules.clear();
+    hostRules.clear();
+
+    BufferedReader reader = new BufferedReader(rules);
+
+    String current = null;
+    boolean host = false;
+    int lineno = 0;
+
+    String line;
+    try {
+      while ((line = reader.readLine()) != null) {
+        lineno++;
+        line = line.trim();
+
+        int comment = line.indexOf('#');
+        if (comment != -1) {
+          // strip comments
+          line = line.substring(0, comment).trim();
+        }
+
+        if (StringUtils.isBlank(line)) {
+          continue;
+        }
+
+        boolean hostSection = line.startsWith("Host");
+        if (hostSection || line.startsWith("Domain")) {
+          host = hostSection;
+          String[] parts = line.split("\\s+");
+          if (parts.length < 2) {
+            // section without a name: skip the rules below it instead of
+            // failing with an ArrayIndexOutOfBoundsException or adding them
+            // to the previous section
+            LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line,
+                "missing host or domain name");
+            current = null;
+            continue;
+          }
+          current = parts[1];
+        } else {
+          if (current == null) {
+            continue;
+          }
+
+          Rule rule = null;
+          try {
+            if (CATCH_ALL_RULE.matcher(line).matches()) {
+              rule = DenyAllRule.getInstance();
+            } else if (line.startsWith("DenyPathQuery")) {
+              rule = new DenyPathQueryRule(line.split("\\s+")[1]);
+            } else if (line.startsWith("DenyPath")) {
+              rule = new DenyPathRule(line.split("\\s+")[1]);
+            } else {
+              LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+              continue;
+            }
+          } catch (Exception e) {
+            // missing or invalid regular expression
+            LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line,
+                e.getMessage());
+            continue;
+          }
+
+          if (host) {
+            LOG.trace("Adding host rule [{}] [{}]", current, rule);
+            hostRules.put(current, rule);
+          } else {
+            LOG.trace("Adding domain rule [{}] [{}]", current, rule);
+            domainRules.put(current, rule);
+          }
+        }
+      }
+    } catch (IOException e) {
+      LOG.warn("Caught exception while reading rules file at line {}: {}",
+          lineno, e.getMessage());
+      throw e;
+    }
+  }
+
+  /**
+   * Base class of all rules: holds a compiled regular expression and, by
+   * default, searches it in the string form of the entire URL.
+   */
+  public static class Rule {
+    protected Pattern pattern;
+
+    Rule() {}
+
+    public Rule(String regex) {
+      this.pattern = Pattern.compile(regex);
+    }
+
+    /** Returns true if the pattern is found anywhere in the URL string. */
+    public boolean match(URL url) {
+      String haystack = url.toString();
+      return pattern.matcher(haystack).find();
+    }
+
+    /** Returns the regular expression of this rule. */
+    @Override
+    public String toString() {
+      return pattern.toString();
+    }
+  }
+
+  /** Rule searching its regular expression in the path of a URL. */
+  public static class DenyPathRule extends Rule {
+    public DenyPathRule(String regex) {
+      super(regex);
+    }
+
+    /** Returns true if the pattern is found in {@link URL#getPath()}. */
+    @Override
+    public boolean match(URL url) {
+      return pattern.matcher(url.getPath()).find();
+    }
+  }
+
+  /**
+   * Rule for <code>DenyPath .*</code> or <code>DenyPath .?</code>: matches
+   * every URL without evaluating a regular expression. A single shared
+   * instance is used.
+   */
+  public static class DenyAllRule extends Rule {
+
+    private static final Rule INSTANCE = new DenyAllRule(".");
+
+    private DenyAllRule(String regex) {
+      super(regex);
+    }
+
+    /** Returns the shared instance. */
+    public static Rule getInstance() {
+      return INSTANCE;
+    }
+
+    /** Always returns true. */
+    @Override
+    public boolean match(URL url) {
+      return true;
+    }
+  }
+
+  /** Rule searching its regular expression in path and query of a URL. */
+  public static class DenyPathQueryRule extends Rule {
+    public DenyPathQueryRule(String regex) {
+      super(regex);
+    }
+
+    /** Returns true if the pattern is found in {@link URL#getFile()}. */
+    @Override
+    public boolean match(URL url) {
+      return pattern.matcher(url.getFile()).find();
+    }
+  }
+}
diff --git
a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
new file mode 100644
index 0000000..d56f948
--- /dev/null
+++
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin that first does fast exact suffix matches on host/domain
+ * names before applying regular expressions to the path component of a URL. See
+ * {@link org.apache.nutch.urlfilter.fast.FastURLFilter} for a description of
+ * the rule format.
+ */
+package org.apache.nutch.urlfilter.fast;
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
new file mode 100644
index 0000000..9609228
--- /dev/null
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestFastURLFilter extends RegexURLFilterBaseTest {
+
+ protected URLFilter getURLFilter(Reader rules) {
+ try {
+ return new FastURLFilter(rules);
+ } catch (IOException e) {
+ Assert.fail(e.toString());
+ return null;
+ }
+ }
+
+ @Test
+ public void test() {
+ test("fast-urlfilter-test.txt", "test.urls");
+ test("fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ }
+
+ @Test
+ public void benchmark() {
+ bench(50, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ bench(100, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ bench(200, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ bench(400, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+ }
+
+}
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.urls b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-regex/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr
-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73
+http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg