Repository: nutch
Updated Branches:
  refs/heads/master f5e430e55 -> 15c583e22


NUTCH-2144 Added an extension point and a plugin that overrides 
db.ignore.external to accept external links


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/2015703c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/2015703c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/2015703c

Branch: refs/heads/master
Commit: 2015703cfd32cae98b14d2fd6af5ac4396237c48
Parents: 25e879a
Author: Thamme Gowda <[email protected]>
Authored: Sun Feb 28 19:23:26 2016 -0800
Committer: Thamme Gowda <[email protected]>
Committed: Sun Feb 28 19:23:26 2016 -0800

----------------------------------------------------------------------
 build.xml                                       |   4 +
 conf/nutch-default.xml                          |   8 ++
 .../org/apache/nutch/fetcher/FetcherThread.java |   6 +-
 .../apache/nutch/net/URLExemptionFilter.java    |  43 ++++++++
 .../apache/nutch/net/URLExemptionFilters.java   |  64 ++++++++++++
 .../apache/nutch/parse/ParseOutputFormat.java   |  28 +++--
 .../org/apache/nutch/plugin/ExtensionPoint.java |   2 +-
 src/plugin/build.xml                            |   3 +
 src/plugin/nutch-extensionpoints/plugin.xml     |   4 +
 src/plugin/urlfilter-ignoreexempt/README.md     |  43 ++++++++
 src/plugin/urlfilter-ignoreexempt/build.xml     |  55 ++++++++++
 .../urlfilter-ignoreexempt/data/.donotdelete    |   0
 src/plugin/urlfilter-ignoreexempt/ivy.xml       |  41 ++++++++
 src/plugin/urlfilter-ignoreexempt/plugin.xml    |  45 +++++++++
 .../ignoreexempt/ExemptionUrlFilter.java        | 101 +++++++++++++++++++
 .../urlfilter/ignoreexempt/package-info.java    |  24 +++++
 16 files changed, 460 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 5d467a6..f8aa196 100644
--- a/build.xml
+++ b/build.xml
@@ -220,6 +220,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
@@ -658,6 +659,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
@@ -1080,6 +1082,8 @@
         <source path="${plugins.dir}/urlfilter-suffix/src/test/" />
         <source path="${plugins.dir}/urlfilter-validator/src/java/" />
         <source path="${plugins.dir}/urlfilter-validator/src/test/" />
+        <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
+        <source path="${plugins.dir}/urlfilter-ignoreexempt/src/test/" />
         <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
         <source path="${plugins.dir}/urlmeta/src/java/" />
         <source path="${plugins.dir}/urlnormalizer-basic/src/java/" />

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 218b2f0..71f7efb 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -564,6 +564,14 @@
 </property>
 
  <property>
+  <name>db.ignore.external.exemptions.file</name>
+  <value>db-ignore-external-exemptions.txt</value>
+  <description>
+    This file contains exemption rules used by the 'urlfilter-ignoreexempt' plugin.
+  </description>
+</property>
+
+<property>
   <name>db.injector.overwrite</name>
   <value>false</value>
   <description>Whether existing records in the CrawlDB will be overwritten

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java 
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 958960d..09315a7 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -37,6 +37,7 @@ import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLExemptionFilters;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
@@ -74,6 +75,7 @@ public class FetcherThread extends Thread {
 
   private Configuration conf;
   private URLFilters urlFilters;
+  private URLExemptionFilters urlExemptionFilters;
   private ScoringFilters scfilters;
   private ParseUtil parseUtil;
   private URLNormalizers normalizers;
@@ -139,6 +141,7 @@ public class FetcherThread extends Thread {
     this.setName("FetcherThread"); // use an informative name
     this.conf = conf;
     this.urlFilters = new URLFilters(conf);
+    this.urlExemptionFilters = new URLExemptionFilters(conf);
     this.scfilters = new ScoringFilters(conf);
     this.parseUtil = new ParseUtil(conf);
     this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
@@ -660,7 +663,8 @@ public class FetcherThread extends Thread {
             String toUrl = links[i].getToUrl();
 
             toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
-                origin, ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, urlFilters, normalizers);
+                origin, ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode,
+                    urlFilters, urlExemptionFilters,  normalizers);
             if (toUrl == null) {
               continue;
             }

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/net/URLExemptionFilter.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilter.java 
b/src/java/org/apache/nutch/net/URLExemptionFilter.java
new file mode 100644
index 0000000..8de5800
--- /dev/null
+++ b/src/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+//Hadoop
+import org.apache.hadoop.conf.Configurable;
+// Nutch
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Interface used to allow exemptions to external domain resources by 
overriding <code>db.ignore.external.links</code>.
+ * This is useful when the crawl is focused to a domain but resources like 
images are hosted on CDN.
+ */
+
+public interface URLExemptionFilter extends Pluggable, Configurable{
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = URLExemptionFilter.class.getName();
+
+  /**
+   * Checks if toUrl is exempted when ignoring of external links is enabled
+   * @param fromUrl : the source url which generated the outlink
+   * @param toUrl : the destination url which needs to be checked for exemption
+   * @return true when toUrl is exempted from <code>db.ignore.external.links</code>
+   */
+  public boolean filter(String fromUrl, String toUrl);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/net/URLExemptionFilters.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java 
b/src/java/org/apache/nutch/net/URLExemptionFilters.java
new file mode 100644
index 0000000..d362f2e
--- /dev/null
+++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and holds the {@link URLExemptionFilter} plugin instances. */
+public class URLExemptionFilters {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(URLExemptionFilters.class);
+
+  private URLExemptionFilter[] filters;
+
+  public URLExemptionFilters(Configuration conf) {
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions();
+    filters = new URLExemptionFilter[extensions.length];
+    for (int i = 0; i < extensions.length; i++) {
+      try {
+        filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance();
+      } catch (PluginRuntimeException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+    LOG.info("Found {} extensions at point:'{}'", filters.length,
+        URLExemptionFilter.X_POINT_ID);
+  }
+
+
+  /** Run all defined filters. Assume logical AND. */
+  public boolean isExempted(String fromUrl, String toUrl) {
+    if (filters.length < 1) {
+      //no exemption filters are active, so nothing can be exempted
+      return false;
+    }
+    //validate from, to and filters
+    boolean exempted = fromUrl != null && toUrl != null;
+    //A URL is exempted only when all the filters accept it
+    for (int i = 0; i < this.filters.length && exempted; i++) {
+      exempted = this.filters[i].filter(fromUrl, toUrl);
+    }
+    return exempted;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/parse/ParseOutputFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java 
b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index e8a7277..51b32fc 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -49,8 +49,8 @@ import org.apache.hadoop.util.Progressable;
 public class ParseOutputFormat implements OutputFormat<Text, Parse> {
   private static final Logger LOG = LoggerFactory
       .getLogger(ParseOutputFormat.class);
-
   private URLFilters filters;
+  private URLExemptionFilters exemptionFilters;
   private URLNormalizers normalizers;
   private ScoringFilters scfilters;
 
@@ -94,6 +94,7 @@ public class ParseOutputFormat implements OutputFormat<Text, 
Parse> {
 
     if (job.getBoolean("parse.filter.urls", true)) {
       filters = new URLFilters(job);
+      exemptionFilters = new URLExemptionFilters(job);
     }
 
     if (job.getBoolean("parse.normalize.urls", true)) {
@@ -209,7 +210,7 @@ public class ParseOutputFormat implements 
OutputFormat<Text, Parse> {
           String newUrl = pstatus.getMessage();
           int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
           newUrl = filterNormalize(fromUrl, newUrl, origin,
-              ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, filters, normalizers,
+              ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
               URLNormalizers.SCOPE_FETCHER);
 
           if (newUrl != null) {
@@ -240,7 +241,7 @@ public class ParseOutputFormat implements 
OutputFormat<Text, Parse> {
           // Only normalize and filter if fetcher.parse = false
           if (!isParsing) {
             toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
-                ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, filters, normalizers);
+                ignoreInternalLinks, ignoreExternalLinks, 
ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
             if (toUrl == null) {
               continue;
             }
@@ -319,16 +320,18 @@ public class ParseOutputFormat implements 
OutputFormat<Text, Parse> {
 
   public static String filterNormalize(String fromUrl, String toUrl,
       String fromHost, boolean ignoreInternalLinks, boolean 
ignoreExternalLinks,
-      String ignoreExternalLinksMode, URLFilters filters,
+      String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters 
exemptionFilters,
       URLNormalizers normalizers) {
     return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, 
ignoreExternalLinks,
-        ignoreExternalLinksMode, filters, normalizers,
+        ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
         URLNormalizers.SCOPE_OUTLINK);
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
-      String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, 
String ignoreExternalLinksMode, URLFilters filters,
-      URLNormalizers normalizers, String urlNormalizerScope) {
+      String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
+       String ignoreExternalLinksMode, URLFilters filters,
+       URLExemptionFilters exemptionFilters, URLNormalizers normalizers,
+        String urlNormalizerScope) {
     // ignore links to self (or anchors within the page)
     if (fromUrl.equals(toUrl)) {
       return null;
@@ -343,30 +346,37 @@ public class ParseOutputFormat implements 
OutputFormat<Text, Parse> {
       if (ignoreExternalLinks) {
         if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
           String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+          //FIXME: toDomain will never be null, correct?
           if (toDomain == null || !toDomain.equals(origin)) {
             return null; // skip it
           }
         } else {
           String toHost = targetURL.getHost().toLowerCase();
-          if (toHost == null || !toHost.equals(origin)) {
-            return null; // skip it
+          if (!toHost.equals(origin)) { // external host link
+            if (exemptionFilters == null // check if it is exempted?
+                || !exemptionFilters.isExempted(fromUrl, toUrl)) {
+              return null; ///skip it, This external url is not exempted.
+            }
           }
         }
       }
       if (ignoreInternalLinks) {
         if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
           String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+          //FIXME: toDomain will never be null, correct?
           if (toDomain == null || toDomain.equals(origin)) {
             return null; // skip it
           }
         } else {
           String toHost = targetURL.getHost().toLowerCase();
+          //FIXME: toHost will never be null, correct?
           if (toHost == null || toHost.equals(origin)) {
             return null; // skip it
           }
         }
       }
     }
+
     try {
       if (normalizers != null) {
         toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/java/org/apache/nutch/plugin/ExtensionPoint.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/plugin/ExtensionPoint.java 
b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
index 1200e4f..178c5a2 100644
--- a/src/java/org/apache/nutch/plugin/ExtensionPoint.java
+++ b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
@@ -36,7 +36,7 @@ public class ExtensionPoint {
    * @param pId
    *          unique extension point Id
    * @param pName
-   *          name of the extension poin
+   *          name of the extension point
    * @param pSchema
    *          xml schema of the extension point
    */

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 83c3db2..10731b3 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -76,6 +76,7 @@
      <ant dir="urlfilter-regex" target="deploy"/>
      <ant dir="urlfilter-suffix" target="deploy"/>
      <ant dir="urlfilter-validator" target="deploy"/>
+     <ant dir="urlfilter-ignoreexempt" target="deploy"/>
      <ant dir="parsefilter-naivebayes" target="deploy"/>
      <ant dir="parsefilter-regex" target="deploy"/>
      <ant dir="urlmeta" target="deploy"/>
@@ -124,6 +125,7 @@
      <ant dir="urlfilter-regex" target="test"/>
      <ant dir="urlfilter-suffix" target="test"/>
      <ant dir="urlfilter-validator" target="test"/>
+     <ant dir="urlfilter-ignoreexempt" target="test"/>
      <ant dir="urlnormalizer-ajax" target="test"/>
      <ant dir="urlnormalizer-basic" target="test"/>
      <ant dir="urlnormalizer-host" target="test"/>
@@ -192,6 +194,7 @@
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>
     <ant dir="urlfilter-validator" target="clean"/>
+    <ant dir="urlfilter-ignoreexempt" target="clean"/>
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="urlmeta" target="clean"/>
     <ant dir="urlnormalizer-ajax" target="clean"/>

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/plugin.xml 
b/src/plugin/nutch-extensionpoints/plugin.xml
index e095c1c..8cf7a23 100644
--- a/src/plugin/nutch-extensionpoints/plugin.xml
+++ b/src/plugin/nutch-extensionpoints/plugin.xml
@@ -49,6 +49,10 @@
       name="Nutch URL Filter"/>
 
 <extension-point
+        id="org.apache.nutch.net.URLExemptionFilter"
+        name="Nutch URL Ignore Exemption Filter"/>
+
+<extension-point
       id="org.apache.nutch.net.URLNormalizer"
       name="Nutch URL Normalizer"/>
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/README.md
----------------------------------------------------------------------
diff --git a/src/plugin/urlfilter-ignoreexempt/README.md 
b/src/plugin/urlfilter-ignoreexempt/README.md
new file mode 100644
index 0000000..d48b672
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/README.md
@@ -0,0 +1,43 @@
+urlfilter-ignoreexempt
+======================
+  This plugin allows certain urls to be exempted when the external links are 
configured to be ignored.
+  This is useful when a focused crawl is set up but some resources like static 
files are linked from CDNs (external domains).
+
+# How to enable ?
+Add `urlfilter-ignoreexempt` value to `plugin.includes` property
+```xml
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value>
+</property>
+```
+
+# How to configure rules?
+
+Open `conf/db-ignore-external-exemptions.txt` and add the regex rules.
+
+## Format :
+
+The format is the same as `regex-urlfilter.txt`.
+ Each non-comment, non-blank line contains a regular expression
+ prefixed by '+' or '-'.  The first matching pattern in the file
+ determines whether a URL is exempted or ignored.  If no pattern
+ matches, the URL is ignored.
+
+
+## Example :
+
+ To exempt urls ending with image extensions, use this rule
+
+`+(?i)\.(jpg|png|gif)$`
+
+   
+   
+## Testing the Rules :
+
+After enabling the plugin and adding your rules to 
`conf/db-ignore-external-exemptions.txt`, run:
+   
+`bin/nutch plugin urlfilter-ignoreexempt  
org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
+
+
+This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/urlfilter-ignoreexempt/build.xml 
b/src/plugin/urlfilter-ignoreexempt/build.xml
new file mode 100644
index 0000000..105f551
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-ignoreexempt" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+      <include name="**/urlfilter-regex/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+    <pathelement location="${nutch.root}/build/urlfilter-regex/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/data/.donotdelete
----------------------------------------------------------------------
diff --git a/src/plugin/urlfilter-ignoreexempt/data/.donotdelete 
b/src/plugin/urlfilter-ignoreexempt/data/.donotdelete
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml 
b/src/plugin/urlfilter-ignoreexempt/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/urlfilter-ignoreexempt/plugin.xml 
b/src/plugin/urlfilter-ignoreexempt/plugin.xml
new file mode 100644
index 0000000..4139ca4
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-ignoreexempt"
+   name="External Domain Ignore Exemption"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-ignoreexempt.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+      <import plugin="urlfilter-regex"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.ignoreexempt"
+              name="Ignore Exemption Url Filter"
+              point="org.apache.nutch.net.URLExemptionFilter">
+      <implementation id="ExemptionUrlFilter"
+        class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter">
+        <parameter name="file" value="db-ignore-external-exemptions.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
 
b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..bbac300
--- /dev/null
+++ 
b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLExemptionFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.urlfilter.regex.RegexURLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses 
regex configuration
+ * to check if URL is eligible for exemption from 'db.ignore.external.links'.
+ * When this filter is enabled, the external urls will be checked against 
configured sequence of regex rules.
+ *<p>
+ * The exemption rule file defaults to db-ignore-external-exemptions.txt in 
the classpath but can be
+ * overridden using the property <code>db.ignore.external.exemptions.file</code> 
in ./conf/nutch-*.xml
+ *</p>
+ *
+ * The exemption rules are specified in plain text file where each line is a 
rule.
+ * The format is the same as `regex-urlfilter.txt`.
+ * Each non-comment, non-blank line contains a regular expression
+ * prefixed by '+' or '-'.  The first matching pattern in the file
+ * determines whether a URL is exempted or ignored.  If no pattern
+ * matches, the URL is ignored.
+ *
+ * @since Feb 10, 2016
+ * @version 1
+ * @see org.apache.nutch.net.URLExemptionFilter
+ * @see org.apache.nutch.urlfilter.regex.RegexURLFilter
+ */
+public class ExemptionUrlFilter extends RegexURLFilter
+    implements URLExemptionFilter {
+
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE
+      = "db.ignore.external.exemptions.file";
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ExemptionUrlFilter.class);
+
+  private List<Pattern> exemptions;
+  private Configuration conf;
+
+  public List<Pattern> getExemptions() {
+    return exemptions;
+  }
+
+  @Override
+  public boolean filter(String fromUrl, String toUrl) {
+    //this implementation does not consider fromUrl param.
+    //the regex rules are applied to toUrl.
+    return this.filter(toUrl) != null;
+  }
+
+  /**
+   * Gets reader for regex rules
+   */
+  protected Reader getRulesReader(Configuration conf)
+      throws IOException {
+    String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  public static void main(String[] args) {
+
+    if (args.length != 1) {
+      System.out.println("Error: Invalid Args");
+      System.out.println("Usage: " +
+          ExemptionUrlFilter.class.getName() + " <url>");
+      return;
+    }
+    String url = args[0];
+    ExemptionUrlFilter instance = new ExemptionUrlFilter();
+    instance.setConf(NutchConfiguration.create());
+    System.out.println(instance.filter(null, url));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/2015703c/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
 
b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
new file mode 100644
index 0000000..ee949c5
--- /dev/null
+++ 
b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions to external urls
+ * when external urls are set to be ignored.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+

Reply via email to