This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 1fc98bf NUTCH-2690 Configurable and fast URL filter - performs fast exact matches on host/domain names - before applying regexes to the path component of a URL new 8cc41d8 Merge pull request #433 from commoncrawl/cc-fast-url-filter 1fc98bf is described below commit 1fc98bf061aedb98be4453865201ce6d9f1dede6 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Mon Nov 28 12:56:46 2016 +0100 NUTCH-2690 Configurable and fast URL filter - performs fast exact matches on host/domain names - before applying regexes to the path component of a URL --- build.xml | 4 + conf/fast-urlfilter.txt.template | 49 ++++ conf/nutch-default.xml | 7 + default.properties | 1 + src/plugin/build.xml | 3 + .../urlfilter/api/RegexURLFilterBaseTest.java | 22 +- .../urlfilter-automaton/sample/Benchmarks.urls | 22 +- src/plugin/urlfilter-fast/README.md | 59 ++++ src/plugin/urlfilter-fast/build.xml | 51 ++++ src/plugin/urlfilter-fast/ivy.xml | 41 +++ src/plugin/urlfilter-fast/plugin.xml | 41 +++ .../sample/Benchmarks.urls | 22 +- .../sample/fast-urlfilter-benchmark.txt | 25 ++ .../urlfilter-fast/sample/fast-urlfilter-test.txt | 19 ++ src/plugin/urlfilter-fast/sample/test.urls | 21 ++ .../apache/nutch/urlfilter/fast/FastURLFilter.java | 315 +++++++++++++++++++++ .../apache/nutch/urlfilter/fast/package-info.java | 24 ++ .../nutch/urlfilter/fast/TestFastURLFilter.java | 55 ++++ src/plugin/urlfilter-regex/sample/Benchmarks.urls | 22 +- 19 files changed, 798 insertions(+), 5 deletions(-) diff --git a/build.xml b/build.xml index 04a36a6..5d883e3 100644 --- a/build.xml +++ b/build.xml @@ -231,6 +231,7 @@ <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/> <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/> <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/> + <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/> <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/> 
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/> <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/> @@ -728,6 +729,7 @@ <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/> <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/> <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/> + <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/> <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/> <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/> <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/> @@ -1150,6 +1152,8 @@ <source path="${plugins.dir}/urlfilter-domain/src/test/" /> <source path="${plugins.dir}/urlfilter-domainblacklist/src/java/" /> <source path="${plugins.dir}/urlfilter-domainblacklist/src/test/" /> + <source path="${plugins.dir}/urlfilter-fast/src/java/"/> + <source path="${plugins.dir}/urlfilter-fast/src/test/"/> <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" /> <source path="${plugins.dir}/urlfilter-prefix/src/java/" /> <source path="${plugins.dir}/urlfilter-prefix/src/test/" /> diff --git a/conf/fast-urlfilter.txt.template b/conf/fast-urlfilter.txt.template new file mode 100644 index 0000000..99bb5c9 --- /dev/null +++ b/conf/fast-urlfilter.txt.template @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Rule file for the plugin urlfilter-fast + +# Used to filter a large number of domain and host-specific regular +# expressions + +# +# `Domain` rules are applied to all hosts and subdomains of a domain, e.g. +# +# Domain example.org +# DenyPath (?i)%7c # matches against just the path part of URL +# DenyPathQuery ^/resource\?x=1 # matches against path + query +# +# +# To match against a single hostname: +# +# Host www.example.com +# DenyPath (?i)%7c +# +# +# Global rules are defined using the domain name `.`: +# +# Domain . +# DenyPath (/[^/]+)/[^/]+\1/[^/]+\1/ +# # skips URLs with slash-delimited segment that repeats 3+ times, to break loops +# +# +# Comments start with the `#` character and reach until the end of the line. +# +# +# For more details, see +# - src/plugin/urlfilter-fast/README.md +# - src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java +# + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 951494e..b919e43 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1647,6 +1647,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this </property> <property> + <name>urlfilter.fast.file</name> + <value>fast-urlfilter.txt</value> + <description>Name of file on CLASSPATH containing regular expressions + used by urlfilter-fast (FastURLFilter) plugin.</description> +</property> + +<property> <name>urlfilter.order</name> <value></value> <description>The order by which url filters are applied. 
diff --git a/default.properties b/default.properties index a3bc0cf..899f33d 100644 --- a/default.properties +++ b/default.properties @@ -104,6 +104,7 @@ plugins.urlfilter=\ org.apache.nutch.urlfilter.automaton*:\ org.apache.nutch.urlfilter.domain*:\ org.apache.nutch.urlfilter.domainblacklist*:\ + org.apache.nutch.urlfilter.fast*:\ org.apache.nutch.urlfilter.ignoreexempt*:\ org.apache.nutch.urlfilter.prefix*:\ org.apache.nutch.urlfilter.regex*:\ diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 2592357..51c3fe7 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -89,6 +89,7 @@ <ant dir="urlfilter-automaton" target="deploy"/> <ant dir="urlfilter-domain" target="deploy" /> <ant dir="urlfilter-domainblacklist" target="deploy" /> + <ant dir="urlfilter-fast" target="deploy"/> <ant dir="urlfilter-prefix" target="deploy"/> <ant dir="urlfilter-regex" target="deploy"/> <ant dir="urlfilter-suffix" target="deploy"/> @@ -146,6 +147,7 @@ <ant dir="urlfilter-automaton" target="test"/> <ant dir="urlfilter-domain" target="test"/> <ant dir="urlfilter-domainblacklist" target="test"/> + <ant dir="urlfilter-fast" target="test"/> <!--ant dir="urlfilter-ignoreexempt" target="test"/--> <ant dir="urlfilter-prefix" target="test"/> <ant dir="urlfilter-regex" target="test"/> @@ -234,6 +236,7 @@ <ant dir="urlfilter-automaton" target="clean"/> <ant dir="urlfilter-domain" target="clean" /> <ant dir="urlfilter-domainblacklist" target="clean" /> + <ant dir="urlfilter-fast" target="clean"/> <ant dir="urlfilter-ignoreexempt" target="clean"/> <ant dir="urlfilter-prefix" target="clean"/> <ant dir="urlfilter-regex" target="clean"/> diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index 730d3cb..c77c67e 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ 
b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -43,8 +43,8 @@ public abstract class RegexURLFilterBaseTest { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); + protected final static String SEPARATOR = System.getProperty("file.separator"); + protected final static String SAMPLES = System.getProperty("test.data", "."); protected abstract URLFilter getURLFilter(Reader rules); @@ -72,6 +72,24 @@ public abstract class RegexURLFilterBaseTest { + (System.currentTimeMillis() - start) + "ms"); } + protected void bench(int loops, String rulesFile, String urlsFile) { + try { + bench(loops, new FileReader(SAMPLES + SEPARATOR + rulesFile), + new FileReader(SAMPLES + SEPARATOR + urlsFile)); + } catch (Exception e) { + Assert.fail(e.toString()); + } + } + + protected void test(String rulesFile, String urlsFile) { + try { + test(new FileReader(SAMPLES + SEPARATOR + rulesFile), + new FileReader(SAMPLES + SEPARATOR + urlsFile)); + } catch (Exception e) { + Assert.fail(e.toString()); + } + } + protected void test(String file) { try { test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls index 40bf4ee..6a0e822 100644 --- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls @@ -294,4 +294,24 @@ -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr -http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 +http://www.tversity.com/ --http://www.aspseek.org/index.php \ No newline at end of file +-http://www.aspseek.org/index.php +-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif 
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip +-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg +-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg +-http://history.vineyard.net/mv1887.gif +-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG +-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG +-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg +-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg +-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg +-http://www.houseofmabel.com/programs/html3/LinkSure.zip +-http://www.lib.utexas.edu/maps/states/california.gif +-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md new file mode 100644 index 0000000..46b293f --- /dev/null +++ b/src/plugin/urlfilter-fast/README.md @@ -0,0 +1,59 @@ + +Filters URLs based on a file of regular expressions using host/domains +matching first. The default policy is to accept a URL if no matches +are found. + +Rule Format: + +``` +Host www.example.org + DenyPath /path/to/be/excluded + DenyPath /some/other/path/excluded + +# Deny everything from *.example.com and example.com +Domain example.com + DenyPath .* + +Domain example.org + DenyPathQuery /resource/.*?action=exclude +``` + +`Host` rules are evaluated before `Domain` rules. 
For `Host` rules the +entire host name of a URL must match while the domain names in +`Domain` rules are considered as matches if the domain is a suffix of +the host name (consisting of complete host name parts). Longer +domain suffixes are checked first, a single dot "`.`" as "domain name" +can be used to specify global rules applied to every URL. + +E.g., for "www.example.com" the rules given above are looked up in the +following order: + +1. check "www.example.com" whether host-based rules exist and whether one of them matches +1. check "www.example.com" for domain-based rules +1. check "example.com" for domain-based rules +1. check "com" for domain-based rules +1. check for global rules (domain name is ".") + +The first matching rule will reject the URL and no further rules are +checked. If no rule matches, the URL is accepted. URLs without a host +name (e.g., <code>file:/path/file.txt</code>) are checked for global +rules only. URLs which fail to be parsed as +[java.net.URL](https://docs.oracle.com/javase/8/docs/api/java/net/URL.html) +are always rejected. + +For rules either the URL path (`DenyPath`) or path and query +(`DenyPathQuery`) are checked whether the given [Java Regular +expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) +is found (see +[Matcher.find()](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#find--)) +in the URL path (and query). + +Rules are applied in the order of their definition. For better +performance, regular expressions which are simpler/faster or match +more URLs should be defined earlier. + +Comments in the rule file start with the `#` character and reach until +the end of the line. + +The rules file is defined via the property `urlfilter.fast.file`, +the default name is `fast-urlfilter.txt`. 
diff --git a/src/plugin/urlfilter-fast/build.xml b/src/plugin/urlfilter-fast/build.xml new file mode 100644 index 0000000..c22ca6e --- /dev/null +++ b/src/plugin/urlfilter-fast/build.xml @@ -0,0 +1,51 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlfilter-fast" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + </path> + + <!-- Compile test classes for dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.txt, **/*.urls"/> + </copy> + +</project> diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/src/plugin/urlfilter-fast/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> diff --git a/src/plugin/urlfilter-fast/plugin.xml b/src/plugin/urlfilter-fast/plugin.xml new file mode 100644 index 0000000..4e28cb3 --- /dev/null +++ b/src/plugin/urlfilter-fast/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlfilter-fast" + name="Fast URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-fast.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-regex-filter"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.fast" + name="Nutch Fast URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="FastURLFilter" + class="org.apache.nutch.urlfilter.fast.FastURLFilter"/> + </extension> +</plugin> diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls b/src/plugin/urlfilter-fast/sample/Benchmarks.urls similarity index 91% copy from src/plugin/urlfilter-automaton/sample/Benchmarks.urls copy to src/plugin/urlfilter-fast/sample/Benchmarks.urls index 40bf4ee..6a0e822 100644 --- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-fast/sample/Benchmarks.urls @@ -294,4 +294,24 @@ -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr -http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 +http://www.tversity.com/ --http://www.aspseek.org/index.php \ No newline at end of file +-http://www.aspseek.org/index.php +-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif +-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip +-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg +-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg +-http://history.vineyard.net/mv1887.gif +-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG +-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG +-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg 
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg +-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg +-http://www.houseofmabel.com/programs/html3/LinkSure.zip +-http://www.lib.utexas.edu/maps/states/california.gif +-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt new file mode 100644 index 0000000..27a918b --- /dev/null +++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt @@ -0,0 +1,25 @@ +# port of urlfilter-regex benchmarks to urlfilter-fast +# cf. +# src/plugin/urlfilter-regex/sample/Benchmarks.rules +# src/plugin/urlfilter-regex/sample/Benchmarks.urls + +# skip file:, ftp:, & mailto: urls +# -^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +Domain . + DenyPath (?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +Domain . 
+ DenyPathQuery [?*!@=] + +# skip .fr .org and .net domains +Domain fr + DenyPath .* +Domain org + DenyPath .* +Domain net + DenyPath .* + +# accept every URL not matched by any rule diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt new file mode 100644 index 0000000..9f26529 --- /dev/null +++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt @@ -0,0 +1,19 @@ +Host www.example.org + DenyPath ^/path/to/be/excluded + DenyPath ^/some/other/path/excluded + +# Deny everything from *.example.com and example.com +Domain example.com + DenyPath .* + +Domain example.org + DenyPathQuery /resource/.*?action=exclude + +# exclude images from image server +Host i.example.org + DenyPath (?i)\.jpe?g$ + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +Domain . + DenyPath (/.+?)/.*?\1/.*?\1/ + diff --git a/src/plugin/urlfilter-fast/sample/test.urls b/src/plugin/urlfilter-fast/sample/test.urls new file mode 100644 index 0000000..3aa4354 --- /dev/null +++ b/src/plugin/urlfilter-fast/sample/test.urls @@ -0,0 +1,21 @@ +-https://www.example.org/path/to/be/excluded +-https://www.example.org/path/to/be/excluded/continued +-https://www.example.org/some/other/path/excluded ++https://www.example.org/ ++https://www.example.org/%20white%20space%20in%20path%20escaped/ +-https://www1.example.com/ +-https://www2.example.com/ +-https://www.subnet.example.com/ ++https://www.examplex.com/ ++https://www.example.co.uk/ ++https://www.example.com.za/ +-https://www.example.org/resource/put?action=exclude +-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ +-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ ++http://www.nutch.org/abcd/foo1/bar1/zzz1/ +-https://i.example.org/394d46ef76ee5c1bbad1cb98b40dc463d322c94d/c=0-129-2047-1285/635969287686419433-WORLD-40943944.JPG?width=3200&height=1680&fit=crop +-ftp://ftp.example.com/file1.txt ++ftp://ftp.example.org/file1.txt ++file:/path/file1.txt 
++file:///path/file1.txt +-file:/abcd/foo/bar/xyz/foo/bar/foo/ \ No newline at end of file diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java new file mode 100644 index 0000000..d53a2fd --- /dev/null +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java @@ -0,0 +1,315 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.fast; + +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.net.URL; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * Filters URLs based on a file of regular expressions using host/domains + * matching first. The default policy is to accept a URL if no matches are + * found. 
+ * + * Rule Format: + * + * <pre> + * Host www.example.org + * DenyPath /path/to/be/excluded + * DenyPath /some/other/path/excluded + * + * # Deny everything from *.example.com and example.com + * Domain example.com + * DenyPath .* + * + * Domain example.org + * DenyPathQuery /resource/.*?action=exclude + * </pre> + * + * <code>Host</code> rules are evaluated before <code>Domain</code> rules. For + * <code>Host</code> rules the entire host name of a URL must match while the + * domain names in <code>Domain</code> rules are considered as matches if the + * domain is a suffix of the host name (consisting of complete host name parts). + * Longer domain suffixes are checked first, a single dot + * "<code>.</code>" as "domain name" can be used to specify + * global rules applied to every URL. + * + * E.g., for "www.example.com" the rules given above are looked up in the + * following order: + * <ol> + * <li>check "www.example.com" whether host-based rules exist and whether one of + * them matches</li> + * <li>check "www.example.com" for domain-based rules</li> + * <li>check "example.com" for domain-based rules</li> + * <li>check "com" for domain-based rules</li> + * <li>check for global rules ("<code>Domain .</code>")</li> + * </ol> + * The first matching rule will reject the URL and no further rules are checked. + * If no rule matches, the URL is accepted. URLs without a host name (e.g., + * <code>file:/path/file.txt</code>) are checked for global rules only. URLs + * which fail to be parsed as {@link java.net.URL} are always rejected. + * + * For rules either the URL path (<code>DenyPath</code>) or path and query + * (<code>DenyPathQuery</code>) are checked whether the given + * {@link java.util.regex Java Regular expression} is found (see + * {@link java.util.regex.Matcher#find()}) in the URL path (and query). + * + * Rules are applied in the order of their definition. 
For better performance, + * regular expressions which are simpler/faster or match more URLs should be + * defined earlier. + * + * Comments in the rule file start with the <code>#</code> character and reach + * until the end of the line. + * + * The rules file is defined via the property <code>urlfilter.fast.file</code>, + * the default name is <code>fast-urlfilter.txt</code>. + */ +public class FastURLFilter implements URLFilter { + + protected static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private Configuration conf; + public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file"; + private Multimap<String, Rule> hostRules = LinkedHashMultimap.create(); + private Multimap<String, Rule> domainRules = LinkedHashMultimap.create(); + + private static final Pattern CATCH_ALL_RULE = Pattern + .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$"); + + public FastURLFilter() {} + + FastURLFilter(Reader rules) throws IOException, PatternSyntaxException { + reloadRules(rules); + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + try { + reloadRules(); + } catch (Exception e) { + LOG.error(e.getMessage()); + throw new RuntimeException(e.getMessage(), e); + } + } + + @Override + public Configuration getConf() { + return this.conf; + } + + @Override + public String filter(String url) { + + URL u; + + try { + u = new URL(url); + } catch (Exception e) { + LOG.debug("Rejected {} because failed to parse as URL: {}", url, + e.getMessage()); + return null; + } + + String hostname = u.getHost(); + + // first check for host-specific rules + for (Rule rule : hostRules.get(hostname)) { + if (rule.match(u)) { + return null; + } + } + + // also look up domain rules for host name + for (Rule rule : domainRules.get(hostname)) { + if (rule.match(u)) { + return null; + } + } + + // check suffixes of host name from longer to shorter: + // subdomains, domain, top-level domain + int start = 0; + int pos; + 
while ((pos = hostname.indexOf('.', start)) != -1) { + start = pos + 1; + String domain = hostname.substring(start); + for (Rule rule : domainRules.get(domain)) { + if (rule.match(u)) { + return null; + } + } + } + + // finally check "global" rules defined for `Domain .` + for (Rule rule : domainRules.get(".")) { + if (rule.match(u)) { + return null; + } + } + + // no reject rules found + return url; + } + + public void reloadRules() throws IOException { + String fileRules = conf.get(URLFILTER_FAST_FILE); + try (Reader reader = conf.getConfResourceAsReader(fileRules)) { + reloadRules(reader); + } + } + + private void reloadRules(Reader rules) throws IOException { + domainRules.clear(); + hostRules.clear(); + + BufferedReader reader = new BufferedReader(rules); + + String current = null; + boolean host = false; + int lineno = 0; + + String line; + try { + while((line = reader.readLine()) != null) { + lineno++; + line = line.trim(); + + if (line.indexOf("#") != -1) { + // strip comments + line = line.substring(0, line.indexOf("#")).trim(); + } + + if (StringUtils.isBlank(line)) { + continue; + } + + if (line.startsWith("Host")) { + host = true; + current = line.split("\\s+")[1]; + } else if (line.startsWith("Domain")) { + host = false; + current = line.split("\\s+")[1]; + } else { + if (current == null) { + continue; + } + + Rule rule = null; + try { + if (CATCH_ALL_RULE.matcher(line).matches()) { + rule = DenyAllRule.getInstance(); + } else if (line.startsWith("DenyPathQuery")) { + rule = new DenyPathQueryRule(line.split("\\s+")[1]); + } else if (line.startsWith("DenyPath")) { + rule = new DenyPathRule(line.split("\\s+")[1]); + } else { + LOG.warn("Problem reading rule on line {}: {}", lineno, line); + continue; + } + } catch (Exception e) { + LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage()); + continue; + } + + if (host) { + LOG.trace("Adding host rule [{}] [{}]", current, rule); + hostRules.put(current, rule); + } else { + 
LOG.trace("Adding domain rule [{}] [{}]", current, rule); + domainRules.put(current, rule); + } + } + } + } catch (IOException e) { + LOG.warn("Caught exception while reading rules file at line {}: {}", + lineno, e.getMessage()); + throw e; + } + } + + public static class Rule { + protected Pattern pattern; + + Rule() {} + + public Rule(String regex) { + pattern = Pattern.compile(regex); + } + + public boolean match(URL url) { + return pattern.matcher(url.toString()).find(); + } + + public String toString() { + return pattern.toString(); + } + } + + public static class DenyPathRule extends Rule { + public DenyPathRule(String regex) { + super(regex); + } + + public boolean match(URL url) { + String haystack = url.getPath(); + return pattern.matcher(haystack).find(); + } + } + + /** Rule for <code>DenyPath .*</code> or <code>DenyPath .?</code> */ + public static class DenyAllRule extends Rule { + + private static Rule instance = new DenyAllRule("."); + + private DenyAllRule(String regex) { + super(regex); + } + + public static Rule getInstance() { + return instance; + } + + public boolean match(URL url) { + return true; + } + } + + public static class DenyPathQueryRule extends Rule { + public DenyPathQueryRule(String regex) { + super(regex); + } + + public boolean match(URL url) { + String haystack = url.getFile(); + return pattern.matcher(haystack).find(); + } + } +} diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java new file mode 100644 index 0000000..d56f948 --- /dev/null +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin that first does fast exact suffix matches on host/domain + * names before applying regular expressions to the path component of a URL. See + * {@link org.apache.nutch.urlfilter.fast.FastURLFilter} for a description of + * the rule format. + */ +package org.apache.nutch.urlfilter.fast; diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java new file mode 100644 index 0000000..9609228 --- /dev/null +++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.fast; + +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; + +import org.apache.nutch.net.URLFilter; +import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; +import org.junit.Assert; +import org.junit.Test; + + +public class TestFastURLFilter extends RegexURLFilterBaseTest { + + protected URLFilter getURLFilter(Reader rules) { + try { + return new FastURLFilter(rules); + } catch (IOException e) { + Assert.fail(e.toString()); + return null; + } + } + + @Test + public void test() { + test("fast-urlfilter-test.txt", "test.urls"); + test("fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + } + + @Test + public void benchmark() { + bench(50, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + bench(100, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + bench(200, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + bench(400, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); + } + +} diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.urls b/src/plugin/urlfilter-regex/sample/Benchmarks.urls index 40bf4ee..6a0e822 100644 --- a/src/plugin/urlfilter-regex/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-regex/sample/Benchmarks.urls @@ -294,4 +294,24 @@ -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr -http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 +http://www.tversity.com/ --http://www.aspseek.org/index.php \ No newline at end of file +-http://www.aspseek.org/index.php +-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif +-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip +-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg 
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg +-http://history.vineyard.net/mv1887.gif +-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG +-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG +-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg +-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg +-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg +-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg +-http://www.houseofmabel.com/programs/html3/LinkSure.zip +-http://www.lib.utexas.edu/maps/states/california.gif +-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg +-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg