This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 1fc98bf  NUTCH-2690 Configurable and fast URL filter - performs fast 
exact matches on host/domain names - before applying regexes to the path 
component of a URL
     new 8cc41d8  Merge pull request #433 from commoncrawl/cc-fast-url-filter
1fc98bf is described below

commit 1fc98bf061aedb98be4453865201ce6d9f1dede6
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Mon Nov 28 12:56:46 2016 +0100

    NUTCH-2690 Configurable and fast URL filter
    - performs fast exact matches on host/domain names
    - before applying regexes to the path component of a URL
---
 build.xml                                          |   4 +
 conf/fast-urlfilter.txt.template                   |  49 ++++
 conf/nutch-default.xml                             |   7 +
 default.properties                                 |   1 +
 src/plugin/build.xml                               |   3 +
 .../urlfilter/api/RegexURLFilterBaseTest.java      |  22 +-
 .../urlfilter-automaton/sample/Benchmarks.urls     |  22 +-
 src/plugin/urlfilter-fast/README.md                |  59 ++++
 src/plugin/urlfilter-fast/build.xml                |  51 ++++
 src/plugin/urlfilter-fast/ivy.xml                  |  41 +++
 src/plugin/urlfilter-fast/plugin.xml               |  41 +++
 .../sample/Benchmarks.urls                         |  22 +-
 .../sample/fast-urlfilter-benchmark.txt            |  25 ++
 .../urlfilter-fast/sample/fast-urlfilter-test.txt  |  19 ++
 src/plugin/urlfilter-fast/sample/test.urls         |  21 ++
 .../apache/nutch/urlfilter/fast/FastURLFilter.java | 315 +++++++++++++++++++++
 .../apache/nutch/urlfilter/fast/package-info.java  |  24 ++
 .../nutch/urlfilter/fast/TestFastURLFilter.java    |  55 ++++
 src/plugin/urlfilter-regex/sample/Benchmarks.urls  |  22 +-
 19 files changed, 798 insertions(+), 5 deletions(-)

diff --git a/build.xml b/build.xml
index 04a36a6..5d883e3 100644
--- a/build.xml
+++ b/build.xml
@@ -231,6 +231,7 @@
       <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -728,6 +729,7 @@
       <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -1150,6 +1152,8 @@
         <source path="${plugins.dir}/urlfilter-domain/src/test/" />
         <source path="${plugins.dir}/urlfilter-domainblacklist/src/java/" />
         <source path="${plugins.dir}/urlfilter-domainblacklist/src/test/" />
+        <source path="${plugins.dir}/urlfilter-fast/src/java/"/>
+        <source path="${plugins.dir}/urlfilter-fast/src/test/"/>
         <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
         <source path="${plugins.dir}/urlfilter-prefix/src/java/" />
         <source path="${plugins.dir}/urlfilter-prefix/src/test/" />
diff --git a/conf/fast-urlfilter.txt.template b/conf/fast-urlfilter.txt.template
new file mode 100644
index 0000000..99bb5c9
--- /dev/null
+++ b/conf/fast-urlfilter.txt.template
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Rule file for the plugin urlfilter-fast
+
+# Used to filter a large number of domain and host-specific regular
+# expressions
+
+#
+# `Domain` rules are applied to all hosts and subdomains of a domain, e.g.
+#
+#   Domain example.org
+#     DenyPath (?i)%7c                # matches against just the path part of URL
+#     DenyPathQuery ^/resource\?x=1   # matches against path + query
+#
+#
+# To match against a single hostname:
+#
+#   Host www.example.com
+#     DenyPath (?i)%7c
+#
+#
+# Global rules are defined using the domain name `.`:
+#
+#   Domain .
+#     DenyPath (/[^/]+)/[^/]+\1/[^/]+\1/
+#     # skips URLs with slash-delimited segment that repeats 3+ times, to break loops
+#
+#
+# Comments start with the `#` character and reach until the end of the line.
+#
+#
+# For more details, see
+#  - src/plugin/urlfilter-fast/README.md
+#  - src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+#
+
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 951494e..b919e43 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1647,6 +1647,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value 
than 30, when using this
 </property>
 
 <property>
+  <name>urlfilter.fast.file</name>
+  <value>fast-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin.</description>
+</property>
+
+<property>
   <name>urlfilter.order</name>
   <value></value>
   <description>The order by which url filters are applied.
diff --git a/default.properties b/default.properties
index a3bc0cf..899f33d 100644
--- a/default.properties
+++ b/default.properties
@@ -104,6 +104,7 @@ plugins.urlfilter=\
    org.apache.nutch.urlfilter.automaton*:\
    org.apache.nutch.urlfilter.domain*:\
    org.apache.nutch.urlfilter.domainblacklist*:\
+   org.apache.nutch.urlfilter.fast*:\
    org.apache.nutch.urlfilter.ignoreexempt*:\
    org.apache.nutch.urlfilter.prefix*:\
    org.apache.nutch.urlfilter.regex*:\
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 2592357..51c3fe7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -89,6 +89,7 @@
     <ant dir="urlfilter-automaton" target="deploy"/>
     <ant dir="urlfilter-domain" target="deploy" />
     <ant dir="urlfilter-domainblacklist" target="deploy" />
+    <ant dir="urlfilter-fast" target="deploy"/>
     <ant dir="urlfilter-prefix" target="deploy"/>
     <ant dir="urlfilter-regex" target="deploy"/>
     <ant dir="urlfilter-suffix" target="deploy"/>
@@ -146,6 +147,7 @@
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test"/>
      <ant dir="urlfilter-domainblacklist" target="test"/>
+     <ant dir="urlfilter-fast" target="test"/>
      <!--ant dir="urlfilter-ignoreexempt" target="test"/-->
      <ant dir="urlfilter-prefix" target="test"/>
      <ant dir="urlfilter-regex" target="test"/>
@@ -234,6 +236,7 @@
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-domain" target="clean" />
     <ant dir="urlfilter-domainblacklist" target="clean" />
+    <ant dir="urlfilter-fast" target="clean"/>
     <ant dir="urlfilter-ignoreexempt" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
diff --git 
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
 
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index 730d3cb..c77c67e 100644
--- 
a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++ 
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -43,8 +43,8 @@ public abstract class RegexURLFilterBaseTest {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private final static String SEPARATOR = System.getProperty("file.separator");
-  private final static String SAMPLES = System.getProperty("test.data", ".");
+  protected final static String SEPARATOR = 
System.getProperty("file.separator");
+  protected final static String SAMPLES = System.getProperty("test.data", ".");
 
   protected abstract URLFilter getURLFilter(Reader rules);
 
@@ -72,6 +72,24 @@ public abstract class RegexURLFilterBaseTest {
         + (System.currentTimeMillis() - start) + "ms");
   }
 
+  protected void bench(int loops, String rulesFile, String urlsFile) {
+    try {
+      bench(loops, new FileReader(SAMPLES + SEPARATOR + rulesFile),
+          new FileReader(SAMPLES + SEPARATOR + urlsFile));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(String rulesFile, String urlsFile) {
+    try {
+      test(new FileReader(SAMPLES + SEPARATOR + rulesFile),
+          new FileReader(SAMPLES + SEPARATOR + urlsFile));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
   protected void test(String file) {
     try {
       test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls 
b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
 
-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
 -http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
 +http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-fast/README.md 
b/src/plugin/urlfilter-fast/README.md
new file mode 100644
index 0000000..46b293f
--- /dev/null
+++ b/src/plugin/urlfilter-fast/README.md
@@ -0,0 +1,59 @@
+
+Filters URLs based on a file of regular expressions using host/domains
+matching first. The default policy is to accept a URL if no matches
+are found.
+
+Rule Format:
+
+```
+Host www.example.org
+  DenyPath /path/to/be/excluded
+  DenyPath /some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+  DenyPath .*
+
+Domain example.org
+  DenyPathQuery /resource/.*?action=exclude
+```
+
+`Host` rules are evaluated before `Domain` rules. For `Host` rules the
+entire host name of a URL must match while the domain names in
+`Domain` rules are considered as matches if the domain is a suffix of
+the host name (consisting of complete host name parts).  Longer
+domain suffixes are checked first; a single dot "`.`" as "domain name"
+can be used to specify global rules applied to every URL.
+
+E.g., for "www.example.com" the rules given above are looked up in the
+following order:
+
+1. check "www.example.com" whether host-based rules exist and whether one of them matches
+1. check "www.example.com" for domain-based rules
+1. check "example.com" for domain-based rules
+1. check "com" for domain-based rules
+1. check for global rules (domain name is ".")
+
+The first matching rule will reject the URL and no further rules are
+checked.  If no rule matches the URL is accepted.  URLs without a host
+name (e.g., <code>file:/path/file.txt</code>) are checked for global
+rules only.  URLs which fail to be parsed as
+[java.net.URL](https://docs.oracle.com/javase/8/docs/api/java/net/URL.html)
+are always rejected.
+
+For rules either the URL path (`DenyPath`) or path and query
+(`DenyPathQuery`) are checked whether the given [Java Regular
+expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
+is found (see
+[Matcher.find()](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#find--))
+in the URL path (and query).
+
+Rules are applied in the order of their definition. For better
+performance, regular expressions which are simpler/faster or match
+more URLs should be defined earlier.
+
+Comments in the rule file start with the `#` character and reach until
+the end of the line.
+
+The rules file is defined via the property `urlfilter.fast.file`,
+the default name is `fast-urlfilter.txt`.
diff --git a/src/plugin/urlfilter-fast/build.xml 
b/src/plugin/urlfilter-fast/build.xml
new file mode 100644
index 0000000..c22ca6e
--- /dev/null
+++ b/src/plugin/urlfilter-fast/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-fast" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.txt, **/*.urls"/>
+  </copy>
+
+</project>
diff --git a/src/plugin/urlfilter-fast/ivy.xml 
b/src/plugin/urlfilter-fast/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/urlfilter-fast/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/urlfilter-fast/plugin.xml 
b/src/plugin/urlfilter-fast/plugin.xml
new file mode 100644
index 0000000..4e28cb3
--- /dev/null
+++ b/src/plugin/urlfilter-fast/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-fast"
+   name="Fast URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-fast.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.fast"
+              name="Nutch Fast URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="FastURLFilter"
+                      class="org.apache.nutch.urlfilter.fast.FastURLFilter"/>
+   </extension>
+</plugin>
diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls 
b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
similarity index 91%
copy from src/plugin/urlfilter-automaton/sample/Benchmarks.urls
copy to src/plugin/urlfilter-fast/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
 
-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
 -http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
 +http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt 
b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
new file mode 100644
index 0000000..27a918b
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
@@ -0,0 +1,25 @@
+# port of urlfilter-regex benchmarks to urlfilter-fast
+# cf.
+#    src/plugin/urlfilter-regex/sample/Benchmarks.rules
+#    src/plugin/urlfilter-regex/sample/Benchmarks.urls
+
+# skip file:, ftp:, & mailto: urls
+# -^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+Domain .
+  DenyPath 
(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+Domain .
+  DenyPathQuery [?*!@=]
+
+# skip .fr .org and .net domains
+Domain fr
+  DenyPath .*
+Domain org
+  DenyPath .*
+Domain net
+  DenyPath .*
+
+# accept every URL not matched by any rule
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt 
b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
new file mode 100644
index 0000000..9f26529
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
@@ -0,0 +1,19 @@
+Host www.example.org
+  DenyPath ^/path/to/be/excluded
+  DenyPath ^/some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+  DenyPath .*
+
+Domain example.org
+  DenyPathQuery /resource/.*?action=exclude
+
+# exclude images from image server
+Host i.example.org
+  DenyPath (?i)\.jpe?g$
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+Domain .
+  DenyPath (/.+?)/.*?\1/.*?\1/
+
diff --git a/src/plugin/urlfilter-fast/sample/test.urls 
b/src/plugin/urlfilter-fast/sample/test.urls
new file mode 100644
index 0000000..3aa4354
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/test.urls
@@ -0,0 +1,21 @@
+-https://www.example.org/path/to/be/excluded
+-https://www.example.org/path/to/be/excluded/continued
+-https://www.example.org/some/other/path/excluded
++https://www.example.org/
++https://www.example.org/%20white%20space%20in%20path%20escaped/
+-https://www1.example.com/
+-https://www2.example.com/
+-https://www.subnet.example.com/
++https://www.examplex.com/
++https://www.example.co.uk/
++https://www.example.com.za/
+-https://www.example.org/resource/put?action=exclude
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/
++http://www.nutch.org/abcd/foo1/bar1/zzz1/
+-https://i.example.org/394d46ef76ee5c1bbad1cb98b40dc463d322c94d/c=0-129-2047-1285/635969287686419433-WORLD-40943944.JPG?width=3200&height=1680&fit=crop
+-ftp://ftp.example.com/file1.txt
++ftp://ftp.example.org/file1.txt
++file:/path/file1.txt
++file:///path/file1.txt
+-file:/abcd/foo/bar/xyz/foo/bar/foo/
\ No newline at end of file
diff --git 
a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
 
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
new file mode 100644
index 0000000..d53a2fd
--- /dev/null
+++ 
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimap;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Filters URLs based on a file of regular expressions using host/domains
+ * matching first. The default policy is to accept a URL if no matches are
+ * found.
+ *
+ * Rule Format:
+ * 
+ * <pre>
+ * Host www.example.org
+ *   DenyPath /path/to/be/excluded
+ *   DenyPath /some/other/path/excluded
+ *
+ * # Deny everything from *.example.com and example.com
+ * Domain example.com
+ *   DenyPath .*
+ *
+ * Domain example.org
+ *   DenyPathQuery /resource/.*?action=exclude
+ * </pre>
+ * 
+ * <code>Host</code> rules are evaluated before <code>Domain</code> rules. For
+ * <code>Host</code> rules the entire host name of a URL must match while the
+ * domain names in <code>Domain</code> rules are considered as matches if the
+ * domain is a suffix of the host name (consisting of complete host name parts).
+ * Longer domain suffixes are checked first; a single dot
+ * &quot;<code>.</code>&quot; as &quot;domain name&quot; can be used to specify
+ * global rules applied to every URL.
+ * 
+ * E.g., for "www.example.com" the rules given above are looked up in the
+ * following order:
+ * <ol>
+ * <li>check "www.example.com" whether host-based rules exist and whether one of
+ * them matches</li>
+ * <li>check "www.example.com" for domain-based rules</li>
+ * <li>check "example.com" for domain-based rules</li>
+ * <li>check "com" for domain-based rules</li>
+ * <li>check for global rules (&quot;<code>Domain .</code>&quot;)</li>
+ * </ol>
+ * The first matching rule will reject the URL and no further rules are checked.
+ * If no rule matches the URL is accepted. URLs without a host name (e.g.,
+ * <code>file:/path/file.txt</code>) are checked for global rules only. URLs
+ * which fail to be parsed as {@link java.net.URL} are always rejected.
+ * 
+ * For rules either the URL path (<code>DenyPath</code>) or path and query
+ * (<code>DenyPathQuery</code>) are checked whether the given
+ * {@link java.util.regex Java Regular expression} is found (see
+ * {@link java.util.regex.Matcher#find()}) in the URL path (and query).
+ * 
+ * Rules are applied in the order of their definition. For better performance,
+ * regular expressions which are simpler/faster or match more URLs should be
+ * defined earlier.
+ * 
+ * Comments in the rule file start with the <code>#</code> character and reach
+ * until the end of the line.
+ * 
+ * The rules file is defined via the property <code>urlfilter.fast.file</code>,
+ * the default name is <code>fast-urlfilter.txt</code>.
+ */
+public class FastURLFilter implements URLFilter {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** Name of the property which holds the name of the rules file. */
+  public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+
+  /**
+   * Recognizes rule lines whose regular expression is <code>.*</code> or
+   * <code>.?</code>: such rules match every URL and are replaced by the
+   * shared {@link DenyAllRule} instance.
+   */
+  private static final Pattern CATCH_ALL_RULE = Pattern
+      .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
+
+  private Configuration conf;
+
+  /** Rules defined in <code>Host</code> sections, keyed by host name. */
+  private final Multimap<String, Rule> hostRules = LinkedHashMultimap.create();
+
+  /** Rules defined in <code>Domain</code> sections, keyed by domain name. */
+  private final Multimap<String, Rule> domainRules = LinkedHashMultimap
+      .create();
+
+  public FastURLFilter() {}
+
+  /**
+   * Creates a filter from rules supplied by a reader instead of a
+   * configuration resource.
+   *
+   * @param rules
+   *          reader providing the rules, see class description for the format
+   * @throws IOException
+   *           if reading the rules fails
+   */
+  FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
+    reloadRules(rules);
+  }
+
+  /**
+   * Stores the configuration and (re)loads the rules file named by the
+   * property {@link #URLFILTER_FAST_FILE}.
+   *
+   * @throws RuntimeException
+   *           wrapping any exception raised while loading the rules
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    try {
+      reloadRules();
+    } catch (Exception e) {
+      // pass the exception itself to the logger to keep the stack trace
+      LOG.error("Failed to load rules of urlfilter-fast", e);
+      throw new RuntimeException(e.getMessage(), e);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Applies the rules to a URL.
+   *
+   * @param url
+   *          URL to be checked
+   * @return the unchanged URL if no rule matches, or <code>null</code> if the
+   *         URL is rejected by a rule or cannot be parsed as
+   *         {@link java.net.URL}
+   */
+  @Override
+  public String filter(String url) {
+
+    URL u;
+
+    try {
+      u = new URL(url);
+    } catch (Exception e) {
+      LOG.debug("Rejected {} because failed to parse as URL: {}", url,
+          e.getMessage());
+      return null;
+    }
+
+    String hostname = u.getHost();
+
+    // first check for host-specific rules,
+    // then look up domain rules for the complete host name
+    if (matchesAny(hostRules.get(hostname), u)
+        || matchesAny(domainRules.get(hostname), u)) {
+      return null;
+    }
+
+    // check suffixes of host name from longer to shorter:
+    // subdomains, domain, top-level domain
+    int start = 0;
+    int pos;
+    while ((pos = hostname.indexOf('.', start)) != -1) {
+      start = pos + 1;
+      if (matchesAny(domainRules.get(hostname.substring(start)), u)) {
+        return null;
+      }
+    }
+
+    // finally check "global" rules defined for `Domain .`
+    if (matchesAny(domainRules.get("."), u)) {
+      return null;
+    }
+
+    // no reject rules found
+    return url;
+  }
+
+  /** Checks the rules in order and stops at the first one which matches. */
+  private static boolean matchesAny(Iterable<Rule> rules, URL url) {
+    for (Rule rule : rules) {
+      if (rule.match(url)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Reloads the rules from the configuration resource named by the property
+   * {@link #URLFILTER_FAST_FILE}.
+   *
+   * @throws IOException
+   *           if the property is not set, the resource is not found or
+   *           reading the rules fails
+   */
+  public void reloadRules() throws IOException {
+    String fileRules = conf.get(URLFILTER_FAST_FILE);
+    if (fileRules == null) {
+      throw new IOException(
+          "No rules file defined: property " + URLFILTER_FAST_FILE
+              + " is not set");
+    }
+    try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
+      if (reader == null) {
+        // without this check the failure surfaces as a bare
+        // NullPointerException when the reader is wrapped
+        throw new IOException("Rules file not found: " + fileRules);
+      }
+      reloadRules(reader);
+    }
+  }
+
+  /**
+   * Drops all rules and reads new ones from the reader. Lines which cannot be
+   * interpreted are logged and skipped.
+   *
+   * @param rules
+   *          reader providing the rules; it is not closed by this method
+   * @throws IOException
+   *           if reading from the reader fails, rules read so far are kept
+   */
+  private void reloadRules(Reader rules) throws IOException {
+    domainRules.clear();
+    hostRules.clear();
+
+    BufferedReader reader = new BufferedReader(rules);
+
+    String current = null;
+    boolean host = false;
+    int lineno = 0;
+
+    String line;
+    try {
+      while ((line = reader.readLine()) != null) {
+        lineno++;
+
+        // strip comments
+        int comment = line.indexOf('#');
+        if (comment != -1) {
+          line = line.substring(0, comment);
+        }
+        line = line.trim();
+
+        if (StringUtils.isBlank(line)) {
+          continue;
+        }
+
+        // the line is trimmed and not blank: tokens[0] is the keyword,
+        // tokens[1] (if present) the host/domain name or the regex
+        String[] tokens = line.split("\\s+");
+
+        boolean hostSection = line.startsWith("Host");
+        if (hostSection || line.startsWith("Domain")) {
+          if (tokens.length < 2) {
+            LOG.warn(
+                "Missing host or domain name on line {}: {} - skipping rules of this section",
+                lineno, line);
+            // do not attach the following rules to the previous section
+            current = null;
+            continue;
+          }
+          host = hostSection;
+          current = tokens[1];
+          continue;
+        }
+
+        if (current == null) {
+          LOG.warn(
+              "Ignoring rule on line {} not preceded by a valid Host or Domain line: {}",
+              lineno, line);
+          continue;
+        }
+
+        Rule rule;
+        try {
+          if (CATCH_ALL_RULE.matcher(line).matches()) {
+            rule = DenyAllRule.getInstance();
+          } else if (tokens.length < 2) {
+            LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+            continue;
+          } else if (line.startsWith("DenyPathQuery")) {
+            rule = new DenyPathQueryRule(tokens[1]);
+          } else if (line.startsWith("DenyPath")) {
+            rule = new DenyPathRule(tokens[1]);
+          } else {
+            LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+            continue;
+          }
+        } catch (PatternSyntaxException e) {
+          LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line,
+              e.getMessage());
+          continue;
+        }
+
+        if (host) {
+          LOG.trace("Adding host rule [{}] [{}]", current, rule);
+          hostRules.put(current, rule);
+        } else {
+          LOG.trace("Adding domain rule [{}] [{}]", current, rule);
+          domainRules.put(current, rule);
+        }
+      }
+    } catch (IOException e) {
+      LOG.warn("Caught exception while reading rules file at line {}: {}",
+          lineno, e.getMessage());
+      throw e;
+    }
+  }
+
+  /** Base rule: searches the regular expression in the entire URL. */
+  public static class Rule {
+    protected Pattern pattern;
+
+    Rule() {}
+
+    public Rule(String regex) {
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * @return true if the regular expression is found in the string
+     *         representation of the URL
+     */
+    public boolean match(URL url) {
+      return pattern.matcher(url.toString()).find();
+    }
+
+    @Override
+    public String toString() {
+      // the package-private constructor leaves the pattern unset
+      return String.valueOf(pattern);
+    }
+  }
+
+  /** Rule for <code>DenyPath</code>: searches the regex in the URL path. */
+  public static class DenyPathRule extends Rule {
+    public DenyPathRule(String regex) {
+      super(regex);
+    }
+
+    @Override
+    public boolean match(URL url) {
+      String haystack = url.getPath();
+      return pattern.matcher(haystack).find();
+    }
+  }
+
+  /** Rule for <code>DenyPath .*</code> or <code>DenyPath .?</code> */
+  public static class DenyAllRule extends Rule {
+
+    private static final Rule INSTANCE = new DenyAllRule(".");
+
+    private DenyAllRule(String regex) {
+      super(regex);
+    }
+
+    public static Rule getInstance() {
+      return INSTANCE;
+    }
+
+    /** Matches every URL without evaluating a regular expression. */
+    @Override
+    public boolean match(URL url) {
+      return true;
+    }
+  }
+
+  /**
+   * Rule for <code>DenyPathQuery</code>: searches the regex in the URL path
+   * including the query (see {@link java.net.URL#getFile()}).
+   */
+  public static class DenyPathQueryRule extends Rule {
+    public DenyPathQueryRule(String regex) {
+      super(regex);
+    }
+
+    @Override
+    public boolean match(URL url) {
+      String haystack = url.getFile();
+      return pattern.matcher(haystack).find();
+    }
+  }
+}
diff --git 
a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
 
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
new file mode 100644
index 0000000..d56f948
--- /dev/null
+++ 
b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin that first does fast exact suffix matches on host/domain
+ * names before applying regular expressions to the path component of a URL. See
+ * {@link org.apache.nutch.urlfilter.fast.FastURLFilter} for a description of
+ * the rule format.
+ */
+package org.apache.nutch.urlfilter.fast;
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
new file mode 100644
index 0000000..9609228
--- /dev/null
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Runs the shared checks and benchmarks of {@link RegexURLFilterBaseTest}
+ * against {@link FastURLFilter}.
+ */
+public class TestFastURLFilter extends RegexURLFilterBaseTest {
+
+  /** Rule file used by the second rule check and by every benchmark round. */
+  private static final String BENCHMARK_RULES = "fast-urlfilter-benchmark.txt";
+
+  /** URL list that is paired with {@link #BENCHMARK_RULES}. */
+  private static final String BENCHMARK_URLS = "Benchmarks.urls";
+
+  /**
+   * Builds the filter under test from the given rules. An {@link IOException}
+   * thrown by the constructor is reported through {@link Assert#fail(String)}.
+   */
+  protected URLFilter getURLFilter(Reader rules) {
+    URLFilter filter = null;
+    try {
+      filter = new FastURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+    }
+    return filter;
+  }
+
+  /** Checks both sample rule files against their URL lists. */
+  @Test
+  public void test() {
+    test("fast-urlfilter-test.txt", "test.urls");
+    test(BENCHMARK_RULES, BENCHMARK_URLS);
+  }
+
+  /** Calls the benchmark with 50, 100, 200, 400 and 800 as first argument. */
+  @Test
+  public void benchmark() {
+    for (int loops = 50; loops <= 800; loops *= 2) {
+      bench(loops, BENCHMARK_RULES, BENCHMARK_URLS);
+    }
+  }
+
+}
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.urls b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-regex/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
 -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
 -http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
 +http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg

Reply via email to