Author: markus
Date: Fri Jan  8 12:11:18 2016
New Revision: 1723710

URL: http://svn.apache.org/viewvc?rev=1723710&view=rev
Log:
NUTCH-1838 Host and domain based regex and automaton filtering 


Added:
    nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules
    nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls
Modified:
    nutch/trunk/CHANGES.txt
    
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
    
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
    
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
    
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
    
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan  8 12:11:18 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1838 Host and domain based regex and automaton filtering (markus)
+
 * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus)
 
 * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 Fri Jan  8 12:11:18 2016
@@ -24,6 +24,10 @@ package org.apache.nutch.urlfilter.api;
 public abstract class RegexRule {
 
   private final boolean sign;
+  
+  private final String hostOrDomain;
+  
+  private final String regex;
 
   /**
    * Constructs a new regular expression rule.
@@ -38,7 +42,27 @@ public abstract class RegexRule {
    *          {@link #match(String)} method).
    */
   protected RegexRule(boolean sign, String regex) {
+    this(sign, regex, null);
+  }
+  
+  /**
+   * Constructs a new regular expression rule.
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
+   * @param hostOrDomain
+   *          the host or domain to which this regex belongs
+   */
+  protected RegexRule(boolean sign, String regex, String hostOrDomain) {
     this.sign = sign;
+    this.hostOrDomain = hostOrDomain;
+    this.regex = regex;
   }
 
   /**
@@ -52,6 +76,20 @@ public abstract class RegexRule {
   }
 
   /**
+   * Returns the host or domain this rule is restricted to, or null if unrestricted.
+   *
+   * @return host or domain this regex rule belongs to
+   */
+  protected String hostOrDomain() { return hostOrDomain; }
+  
+  /**
+   * Returns this rule's regular expression.
+   *
+   * @return this regex
+   */
+  protected String regex() { return regex; }
+
+  /**
    * Checks if a url matches this rule.
    * 
    * @param url

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 Fri Jan  8 12:11:18 2016
@@ -24,6 +24,7 @@ import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.io.IOException;
 import java.io.StringReader;
+import java.net.MalformedURLException;
 import java.util.List;
 import java.util.ArrayList;
 
@@ -36,6 +37,7 @@ import org.apache.hadoop.conf.Configurat
 
 // Nutch imports
 import org.apache.nutch.net.*;
+import org.apache.nutch.util.URLUtil;
 
 /**
  * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
@@ -123,6 +125,20 @@ public abstract class RegexURLFilterBase
    *          is the regular expression associated to this rule.
    */
   protected abstract RegexRule createRule(boolean sign, String regex);
+  
+  /**
+   * Creates a new {@link RegexRule}.
+   * @param sign
+   *        the sign of the regular expression.
+   *        A <code>true</code> value means that any URL matching this rule
+   *        must be included, whereas a <code>false</code>
+   *        value means that any URL matching this rule must be excluded.
+   * @param regex
+   *        is the regular expression associated to this rule.
+   * @param hostOrDomain
+   *        the host or domain to which this regex belongs
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
 
   /**
    * Returns the name of the file of rules to use for a particular
@@ -142,7 +158,35 @@ public abstract class RegexURLFilterBase
 
   // Inherited Javadoc
   public String filter(String url) {
+    String host = URLUtil.getHost(url);
+    String domain = null;
+    
+    try {
+      domain = URLUtil.getDomainName(url);
+    } catch (MalformedURLException e) {
+      // Not expected here; if it happens, domain stays null and scoped rules are checked against the host only
+    }
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("URL belongs to host " + host + " and domain " + domain);
+    }
+
     for (RegexRule rule : rules) {
+      // Skip rules that are scoped to a host or domain other than this URL's
+      if (rule.hostOrDomain() != null &&
+            !rule.hostOrDomain().equals(host) &&
+            !rule.hostOrDomain().equals(domain)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
+        }
+
+        continue;
+      }
+    
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
+      }
+
       if (rule.match(url)) {
         return rule.accept() ? url : null;
       }
@@ -204,7 +248,8 @@ public abstract class RegexURLFilterBase
     BufferedReader in = new BufferedReader(reader);
     List<RegexRule> rules = new ArrayList<RegexRule>();
     String line;
-
+    String hostOrDomain = null;
+    
     while ((line = in.readLine()) != null) {
       if (line.length() == 0) {
         continue;
@@ -222,15 +267,21 @@ public abstract class RegexURLFilterBase
       case '\n':
       case '#': // skip blank & comment lines
         continue;
+      case '>':
+        hostOrDomain = line.substring(1).trim();
+        continue;
+      case '<':
+        hostOrDomain = null;
+        continue;
       default:
         throw new IOException("Invalid first character: " + line);
       }
 
       String regex = line.substring(1);
       if (LOG.isTraceEnabled()) {
-        LOG.trace("Adding rule [" + regex + "]");
+        LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
       }
-      RegexRule rule = createRule(sign, regex);
+      RegexRule rule = createRule(sign, regex, hostOrDomain);
       rules.add(rule);
     }
     return rules;

Modified: 
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
 Fri Jan  8 12:11:18 2016
@@ -80,6 +80,10 @@ public class AutomatonURLFilter extends
   protected RegexRule createRule(boolean sign, String regex) {
     return new Rule(sign, regex);
   }
+  
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
 
   /*
    * ------------------------------------ * </implementation:RegexURLFilterBase>
@@ -98,6 +102,11 @@ public class AutomatonURLFilter extends
       super(sign, regex);
       automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
     }
+    
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
 
     protected boolean match(String url) {
       return automaton.run(url);

Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules?rev=1723710&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules (added)
+++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules Fri Jan  8 
12:11:18 2016
@@ -0,0 +1,12 @@
+# Skip all url's containing skip for example.org
+> www.example.org
+-skip
+<
+
+# Allow all url's containing skip for example.com
+> www.example.com
++skip
+<
+
+# Skip everything else
+-.
\ No newline at end of file

Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls?rev=1723710&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls (added)
+++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Fri Jan  8 
12:11:18 2016
@@ -0,0 +1,3 @@
+-http://www.example.org/skip-me-now
++http://www.example.com/noone-can-skip-me
+-http://www.example.nl/i-am-filtered
\ No newline at end of file

Modified: 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
 Fri Jan  8 12:11:18 2016
@@ -72,6 +72,12 @@ public class RegexURLFilter extends Rege
   protected RegexRule createRule(boolean sign, String regex) {
     return new Rule(sign, regex);
   }
+  
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+  
+  
 
   /*
    * ------------------------------------ * </implementation:RegexURLFilterBase>
@@ -89,7 +95,11 @@ public class RegexURLFilter extends Rege
     private Pattern pattern;
 
     Rule(boolean sign, String regex) {
-      super(sign, regex);
+      this(sign, regex, null);
+    }
+    
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
       pattern = Pattern.compile(regex);
     }
 

Modified: 
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
 Fri Jan  8 12:11:18 2016
@@ -52,5 +52,10 @@ public class TestRegexURLFilter extends
     bench(400, "Benchmarks");
     bench(800, "Benchmarks");
   }
+  
+  @Test
+  public void test1838() {
+    test("nutch1838");
+  }
 
 }


Reply via email to