Repository: nutch Updated Branches: refs/heads/master f5e430e55 -> 15c583e22
NUTCH-2144 Added an extension point and a plugin that overrides db.ignore.external to accept external links Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/2015703c Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/2015703c Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/2015703c Branch: refs/heads/master Commit: 2015703cfd32cae98b14d2fd6af5ac4396237c48 Parents: 25e879a Author: Thamme Gowda <[email protected]> Authored: Sun Feb 28 19:23:26 2016 -0800 Committer: Thamme Gowda <[email protected]> Committed: Sun Feb 28 19:23:26 2016 -0800 ---------------------------------------------------------------------- build.xml | 4 + conf/nutch-default.xml | 8 ++ .../org/apache/nutch/fetcher/FetcherThread.java | 6 +- .../apache/nutch/net/URLExemptionFilter.java | 43 ++++++++ .../apache/nutch/net/URLExemptionFilters.java | 64 ++++++++++++ .../apache/nutch/parse/ParseOutputFormat.java | 28 +++-- .../org/apache/nutch/plugin/ExtensionPoint.java | 2 +- src/plugin/build.xml | 3 + src/plugin/nutch-extensionpoints/plugin.xml | 4 + src/plugin/urlfilter-ignoreexempt/README.md | 43 ++++++++ src/plugin/urlfilter-ignoreexempt/build.xml | 55 ++++++++++ .../urlfilter-ignoreexempt/data/.donotdelete | 0 src/plugin/urlfilter-ignoreexempt/ivy.xml | 41 ++++++++ src/plugin/urlfilter-ignoreexempt/plugin.xml | 45 +++++++++ .../ignoreexempt/ExemptionUrlFilter.java | 101 +++++++++++++++++++ .../urlfilter/ignoreexempt/package-info.java | 24 +++++ 16 files changed, 460 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/build.xml ---------------------------------------------------------------------- diff --git a/build.xml b/build.xml index 5d467a6..f8aa196 100644 --- a/build.xml +++ b/build.xml @@ -220,6 +220,7 @@ <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/> <packageset 
dir="${plugins.dir}/urlfilter-suffix/src/java"/> <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/> + <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/> <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/> <packageset dir="${plugins.dir}/urlmeta/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/> @@ -658,6 +659,7 @@ <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/> <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/> <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/> + <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/> <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/> <packageset dir="${plugins.dir}/urlmeta/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/> @@ -1080,6 +1082,8 @@ <source path="${plugins.dir}/urlfilter-suffix/src/test/" /> <source path="${plugins.dir}/urlfilter-validator/src/java/" /> <source path="${plugins.dir}/urlfilter-validator/src/test/" /> + <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" /> + <source path="${plugins.dir}/urlfilter-ignoreexempt/src/test/" /> <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" /> <source path="${plugins.dir}/urlmeta/src/java/" /> <source path="${plugins.dir}/urlnormalizer-basic/src/java/" /> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 218b2f0..71f7efb 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -564,6 +564,14 @@ </property> <property> + <name>db.ignore.external.exemptions.file</name> + <value>db-ignore-external-exemptions.txt</value> + <description> + This file contains exemption rules used by the 'urlfilter-ignoreexempt' plugin + </description> +</property> + +<property> <name>db.injector.overwrite</name> <value>false</value> 
<description>Whether existing records in the CrawlDB will be overwritten http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/fetcher/FetcherThread.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 958960d..09315a7 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -37,6 +37,7 @@ import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLExemptionFilters; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -74,6 +75,7 @@ public class FetcherThread extends Thread { private Configuration conf; private URLFilters urlFilters; + private URLExemptionFilters urlExemptionFilters; private ScoringFilters scfilters; private ParseUtil parseUtil; private URLNormalizers normalizers; @@ -139,6 +141,7 @@ public class FetcherThread extends Thread { this.setName("FetcherThread"); // use an informative name this.conf = conf; this.urlFilters = new URLFilters(conf); + this.urlExemptionFilters = new URLExemptionFilters(conf); this.scfilters = new ScoringFilters(conf); this.parseUtil = new ParseUtil(conf); this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true); @@ -660,7 +663,8 @@ public class FetcherThread extends Thread { String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, - origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers); + origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, + urlFilters, urlExemptionFilters, normalizers); if (toUrl == null) { continue; } 
http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/net/URLExemptionFilter.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLExemptionFilter.java b/src/java/org/apache/nutch/net/URLExemptionFilter.java new file mode 100644 index 0000000..8de5800 --- /dev/null +++ b/src/java/org/apache/nutch/net/URLExemptionFilter.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +//Hadoop +import org.apache.hadoop.conf.Configurable; +// Nutch +import org.apache.nutch.plugin.Pluggable; + +/** + * Interface used to allow exemptions to external domain resources by overriding <code>db.ignore.external.links</code>. + * This is useful when the crawl is focused to a domain but resources like images are hosted on CDN. + */ + +public interface URLExemptionFilter extends Pluggable, Configurable{ + + /** The name of the extension point. 
*/ + public final static String X_POINT_ID = URLExemptionFilter.class.getName(); + + /** + * Checks if toUrl is exempted when the ignore external is enabled + * @param fromUrl : the source url which generated the outlink + * @param toUrl : the destination url which needs to be checked for exemption + * @return true when toUrl is exempted from dbIgnore + */ + public boolean filter(String fromUrl, String toUrl); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/net/URLExemptionFilters.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java b/src/java/org/apache/nutch/net/URLExemptionFilters.java new file mode 100644 index 0000000..d362f2e --- /dev/null +++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Creates and caches {@link URLExemptionFilter} implementing plugins. */ +public class URLExemptionFilters { + + private static final Logger LOG = LoggerFactory.getLogger(URLExemptionFilters.class); + + private URLExemptionFilter[] filters; + + public URLExemptionFilters(Configuration conf) { + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions(); + filters = new URLExemptionFilter[extensions.length]; + for (int i = 0; i < extensions.length; i++) { + try { + filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance(); + } catch (PluginRuntimeException e) { + throw new IllegalStateException(e); + } + } + LOG.info("Found {} extensions at point:'{}'", filters.length, + URLExemptionFilter.X_POINT_ID); + } + + + /** Run all defined filters. Assume logical AND. 
*/ + public boolean isExempted(String fromUrl, String toUrl) { + if (filters.length < 1) { + //at least one filter should be on + return false; + } + //validate from, to and filters + boolean exempted = fromUrl != null && toUrl != null; + //An URL is exempted when all the filters accept it to pass through + for (int i = 0; i < this.filters.length && exempted; i++) { + exempted = this.filters[i].filter(fromUrl, toUrl); + } + return exempted; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/parse/ParseOutputFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java index e8a7277..51b32fc 100644 --- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java +++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java @@ -49,8 +49,8 @@ import org.apache.hadoop.util.Progressable; public class ParseOutputFormat implements OutputFormat<Text, Parse> { private static final Logger LOG = LoggerFactory .getLogger(ParseOutputFormat.class); - private URLFilters filters; + private URLExemptionFilters exemptionFilters; private URLNormalizers normalizers; private ScoringFilters scfilters; @@ -94,6 +94,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { if (job.getBoolean("parse.filter.urls", true)) { filters = new URLFilters(job); + exemptionFilters = new URLExemptionFilters(job); } if (job.getBoolean("parse.normalize.urls", true)) { @@ -209,7 +210,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { String newUrl = pstatus.getMessage(); int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); newUrl = filterNormalize(fromUrl, newUrl, origin, - ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers, + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, 
exemptionFilters, normalizers, URLNormalizers.SCOPE_FETCHER); if (newUrl != null) { @@ -240,7 +241,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { // Only normalize and filter if fetcher.parse = false if (!isParsing) { toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, - ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers); + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers); if (toUrl == null) { continue; } @@ -319,16 +320,18 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { public static String filterNormalize(String fromUrl, String toUrl, String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks, - String ignoreExternalLinksMode, URLFilters filters, + String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters exemptionFilters, URLNormalizers normalizers) { return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks, - ignoreExternalLinksMode, filters, normalizers, + ignoreExternalLinksMode, filters, exemptionFilters, normalizers, URLNormalizers.SCOPE_OUTLINK); } public static String filterNormalize(String fromUrl, String toUrl, - String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, - URLNormalizers normalizers, String urlNormalizerScope) { + String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, + String ignoreExternalLinksMode, URLFilters filters, + URLExemptionFilters exemptionFilters, URLNormalizers normalizers, + String urlNormalizerScope) { // ignore links to self (or anchors within the page) if (fromUrl.equals(toUrl)) { return null; @@ -343,30 +346,37 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> { if (ignoreExternalLinks) { if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { String toDomain = 
URLUtil.getDomainName(targetURL).toLowerCase(); + //FIXME: toDomain will never be null, correct? if (toDomain == null || !toDomain.equals(origin)) { return null; // skip it } } else { String toHost = targetURL.getHost().toLowerCase(); - if (toHost == null || !toHost.equals(origin)) { - return null; // skip it + if (!toHost.equals(origin)) { // external host link + if (exemptionFilters == null // check if it is exempted? + || !exemptionFilters.isExempted(fromUrl, toUrl)) { + return null; ///skip it, This external url is not exempted. + } } } } if (ignoreInternalLinks) { if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); + //FIXME: toDomain will never be null, correct? if (toDomain == null || toDomain.equals(origin)) { return null; // skip it } } else { String toHost = targetURL.getHost().toLowerCase(); + //FIXME: toDomain will never be null, correct? if (toHost == null || toHost.equals(origin)) { return null; // skip it } } } } + try { if (normalizers != null) { toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/plugin/ExtensionPoint.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/plugin/ExtensionPoint.java b/src/java/org/apache/nutch/plugin/ExtensionPoint.java index 1200e4f..178c5a2 100644 --- a/src/java/org/apache/nutch/plugin/ExtensionPoint.java +++ b/src/java/org/apache/nutch/plugin/ExtensionPoint.java @@ -36,7 +36,7 @@ public class ExtensionPoint { * @param pId * unique extension point Id * @param pName - * name of the extension poin + * name of the extension point * @param pSchema * xml schema of the extension point */ http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/build.xml 
b/src/plugin/build.xml index 83c3db2..10731b3 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -76,6 +76,7 @@ <ant dir="urlfilter-regex" target="deploy"/> <ant dir="urlfilter-suffix" target="deploy"/> <ant dir="urlfilter-validator" target="deploy"/> + <ant dir="urlfilter-ignoreexempt" target="deploy"/> <ant dir="parsefilter-naivebayes" target="deploy"/> <ant dir="parsefilter-regex" target="deploy"/> <ant dir="urlmeta" target="deploy"/> @@ -124,6 +125,7 @@ <ant dir="urlfilter-regex" target="test"/> <ant dir="urlfilter-suffix" target="test"/> <ant dir="urlfilter-validator" target="test"/> + <ant dir="urlfilter-ignoreexempt" target="test"/> <ant dir="urlnormalizer-ajax" target="test"/> <ant dir="urlnormalizer-basic" target="test"/> <ant dir="urlnormalizer-host" target="test"/> @@ -192,6 +194,7 @@ <ant dir="urlfilter-regex" target="clean"/> <ant dir="urlfilter-suffix" target="clean"/> <ant dir="urlfilter-validator" target="clean"/> + <ant dir="urlfilter-ignoreexempt" target="clean"/> <ant dir="parsefilter-naivebayes" target="clean" /> <ant dir="urlmeta" target="clean"/> <ant dir="urlnormalizer-ajax" target="clean"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/nutch-extensionpoints/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/nutch-extensionpoints/plugin.xml b/src/plugin/nutch-extensionpoints/plugin.xml index e095c1c..8cf7a23 100644 --- a/src/plugin/nutch-extensionpoints/plugin.xml +++ b/src/plugin/nutch-extensionpoints/plugin.xml @@ -49,6 +49,10 @@ name="Nutch URL Filter"/> <extension-point + id="org.apache.nutch.net.URLExemptionFilter" + name="Nutch URL Ignore Exemption Filter"/> + +<extension-point id="org.apache.nutch.net.URLNormalizer" name="Nutch URL Normalizer"/> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/README.md ---------------------------------------------------------------------- diff --git 
a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md new file mode 100644 index 0000000..d48b672 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/README.md @@ -0,0 +1,43 @@ +urlfilter-ignoreexempt +====================== + This plugin allows certain urls to be exempted when the external links are configured to be ignored. + This is useful when a focused crawl is set up but some resources like static files are linked from CDNs (external domains). + +# How to enable? +Add the `urlfilter-ignoreexempt` value to the `plugin.includes` property +```xml +<property> + <name>plugin.includes</name> + <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value> +</property> +``` + +# How to configure rules? + +Open `conf/db-ignore-external-exemptions.txt` and add the regex rules. + +## Format : + +The format is the same as `regex-urlfilter.txt`. + Each non-comment, non-blank line contains a regular expression + prefixed by '+' or '-'. The first matching pattern in the file + determines whether a URL is exempted or ignored. If no pattern + matches, the URL is ignored. + + +## Example : + + To exempt urls ending with image extensions, use this rule + +`+(?i)\.(jpg|png|gif)$` + + + +## Testing the Rules : + +After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run: + +`bin/nutch plugin urlfilter-ignoreexempt org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here` + + +This should print `true` for urls which are accepted by the configured rules. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/build.xml b/src/plugin/urlfilter-ignoreexempt/build.xml new file mode 100644 index 0000000..105f551 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/build.xml @@ -0,0 +1,55 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlfilter-ignoreexempt" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + <include name="**/urlfilter-regex/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + <pathelement location="${nutch.root}/build/urlfilter-regex/test"/> + </path> + + <!-- Compile test classes for dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/data/.donotdelete ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/data/.donotdelete b/src/plugin/urlfilter-ignoreexempt/data/.donotdelete new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml @@ -0,0 +1,41 @@ +<?xml 
version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/plugin.xml b/src/plugin/urlfilter-ignoreexempt/plugin.xml new file mode 100644 index 0000000..4139ca4 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/plugin.xml @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-ignoreexempt" + name="External Domain Ignore Exemption" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-ignoreexempt.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-regex-filter"/> + <import plugin="urlfilter-regex"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.ignoreexempt" + name="Ignore Exemption Url Filter" + point="org.apache.nutch.net.URLExemptionFilter"> + <implementation id="ExemptionUrlFilter" + class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter"> + <parameter name="file" value="db-ignore-external-exemptions.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java new file mode 100644 index 0000000..bbac300 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java @@ -0,0 +1,101 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.ignoreexempt; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLExemptionFilter; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.urlfilter.regex.RegexURLFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.List; +import java.util.ArrayList; + + +/** + * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration + * to check if URL is eligible for exemption from 'db.ignore.external'. + * When this filter is enabled, the external urls will be checked against configured sequence of regex rules. + *<p> + * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be + * overridden using the property <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code> + *</p> + * + * The exemption rules are specified in plain text file where each line is a rule. 
+ * The format is the same as `regex-urlfilter.txt`. + * Each non-comment, non-blank line contains a regular expression + * prefixed by '+' or '-'. The first matching pattern in the file + * determines whether a URL is exempted or ignored. If no pattern + * matches, the URL is ignored. + * + * @since Feb 10, 2016 + * @version 1 + * @see org.apache.nutch.net.URLExemptionFilter + * @see org.apache.nutch.urlfilter.regex.RegexURLFilter + */ +public class ExemptionUrlFilter extends RegexURLFilter + implements URLExemptionFilter { + + public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE + = "db.ignore.external.exemptions.file"; + private static final Logger LOG = + LoggerFactory.getLogger(ExemptionUrlFilter.class); + + private List<Pattern> exemptions; + private Configuration conf; + + public List<Pattern> getExemptions() { + return exemptions; + } + + @Override + public boolean filter(String fromUrl, String toUrl) { + //this implementation does not consider fromUrl param. + //the regex rules are applied to toUrl. 
+ return this.filter(toUrl) != null; + } + + /** + * Gets reader for regex rules + */ + protected Reader getRulesReader(Configuration conf) + throws IOException { + String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE); + return conf.getConfResourceAsReader(fileRules); + } + + public static void main(String[] args) { + + if (args.length != 1) { + System.out.println("Error: Invalid Args"); + System.out.println("Usage: " + + ExemptionUrlFilter.class.getName() + " <url>"); + return; + } + String url = args[0]; + ExemptionUrlFilter instance = new ExemptionUrlFilter(); + instance.setConf(NutchConfiguration.create()); + System.out.println(instance.filter(null, url)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java new file mode 100644 index 0000000..ee949c5 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin which identifies exemptions to external urls when + * external urls are set to be ignored. + * + */ +package org.apache.nutch.urlfilter.ignoreexempt; +
