Author: markus Date: Fri Jan 8 12:11:18 2016 New Revision: 1723710 URL: http://svn.apache.org/viewvc?rev=1723710&view=rev Log: NUTCH-1838 Host and domain based regex and automaton filtering
Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 12:11:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1838 Host and domain based regex and automaton filtering (markus) + * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,10 @@ package org.apache.nutch.urlfilter.api; public abstract class RegexRule { private final boolean sign; + + private final String 
hostOrDomain; + + private final String regex; /** * Constructs a new regular expression rule. @@ -38,7 +42,27 @@ public abstract class RegexRule { * {@link #match(String)} method). */ protected RegexRule(boolean sign, String regex) { + this(sign, regex, null); + } + + /** + * Constructs a new regular expression rule. + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * <code>true</code> value means that any url matching this rule must + * be accepted, a <code>false</code> value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). + * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected RegexRule(boolean sign, String regex, String hostOrDomain) { this.sign = sign; + this.hostOrDomain = hostOrDomain; + this.regex = regex; } /** @@ -52,6 +76,20 @@ public abstract class RegexRule { } /** + * Returns the host or domain this rule is restricted to. + * + * @return host or domain this regex rule belongs to, or <code>null</code> if the rule is not bound to one + */ + protected String hostOrDomain() { return hostOrDomain; } + + /** + * Returns this rule's regular expression. + * + * @return the regular expression this rule was constructed with + */ + protected String regex() { return regex; } + + /** * Checks if a url matches this rule. 
* * @param url Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.io.StringReader; +import java.net.MalformedURLException; import java.util.List; import java.util.ArrayList; @@ -36,6 +37,7 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.net.*; +import org.apache.nutch.util.URLUtil; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular @@ -123,6 +125,20 @@ public abstract class RegexURLFilterBase * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); + + /** + * Creates a new {@link RegexRule}. + * @param + * sign of the regular expression. + * A <code>true</code> value means that any URL matching this rule + * must be included, whereas a <code>false</code> + * value means that any URL matching this rule must be excluded. + * @param regex + * is the regular expression associated to this rule. 
+ * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); /** * Returns the name of the file of rules to use for a particular @@ -142,7 +158,35 @@ public abstract class RegexURLFilterBase // Inherited Javadoc public String filter(String url) { + String host = URLUtil.getHost(url); + String domain = null; + + try { + domain = URLUtil.getDomainName(url); + } catch (MalformedURLException e) { + // Deliberately ignored: domain stays null, so no rule can match this URL by domain + } + + if (LOG.isDebugEnabled()) { + LOG.debug("URL belongs to host " + host + " and domain " + domain); + } + for (RegexRule rule : rules) { + // Skip rules bound to a host or domain that matches neither this URL's host nor its domain + if (rule.hostOrDomain() != null && + !rule.hostOrDomain().equals(host) && + !rule.hostOrDomain().equals(domain)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); + } + + continue; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); + } + if (rule.match(url)) { return rule.accept() ? 
url : null; } @@ -204,7 +248,8 @@ public abstract class RegexURLFilterBase BufferedReader in = new BufferedReader(reader); List<RegexRule> rules = new ArrayList<RegexRule>(); String line; - + String hostOrDomain = null; + while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; @@ -222,15 +267,21 @@ public abstract class RegexURLFilterBase case '\n': case '#': // skip blank & comment lines continue; + case '>': + hostOrDomain = line.substring(1).trim(); + continue; + case '<': + hostOrDomain = null; + continue; default: throw new IOException("Invalid first character: " + line); } String regex = line.substring(1); if (LOG.isTraceEnabled()) { - LOG.trace("Adding rule [" + regex + "]"); + LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain); } - RegexRule rule = createRule(sign, regex); + RegexRule rule = createRule(sign, regex, hostOrDomain); rules.add(rule); } return rules; Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Fri Jan 8 12:11:18 2016 @@ -80,6 +80,10 @@ public class AutomatonURLFilter extends protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } + + protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { + return new Rule(sign, regex, hostOrDomain); + } /* * ------------------------------------ * </implementation:RegexURLFilterBase> @@ -98,6 +102,11 @@ public class AutomatonURLFilter 
extends super(sign, regex); automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); } + + Rule(boolean sign, String regex, String hostOrDomain) { + super(sign, regex, hostOrDomain); + automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); + } protected boolean match(String url) { return automaton.run(url); Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules?rev=1723710&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules (added) +++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules Fri Jan 8 12:11:18 2016 @@ -0,0 +1,12 @@ +# Skip all url's containing skip for example.org +> www.example.org +-skip +< + +# Allow all url's containing skip for example.com +> www.example.com ++skip +< + +# Skip everything else +-. \ No newline at end of file Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls?rev=1723710&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls (added) +++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Fri Jan 8 12:11:18 2016 @@ -0,0 +1,3 @@ +-http://www.example.org/skip-me-now ++http://www.example.com/noone-can-skip-me +-http://www.example.nl/i-am-filtered \ No newline at end of file Modified: nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Fri Jan 8 12:11:18 2016 @@ -72,6 +72,12 @@ public class RegexURLFilter extends Rege protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } + + protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { + return new Rule(sign, regex, hostOrDomain); + } + + /* * ------------------------------------ * </implementation:RegexURLFilterBase> @@ -89,7 +95,11 @@ public class RegexURLFilter extends Rege private Pattern pattern; Rule(boolean sign, String regex) { - super(sign, regex); + this(sign, regex, null); + } + + Rule(boolean sign, String regex, String hostOrDomain) { + super(sign, regex, hostOrDomain); pattern = Pattern.compile(regex); } Modified: nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Fri Jan 8 12:11:18 2016 @@ -52,5 +52,10 @@ public class TestRegexURLFilter extends bench(400, "Benchmarks"); bench(800, "Benchmarks"); } + + @Test + public void test1838() { + test("nutch1838"); + } }