Author: markus
Date: Fri Jan 8 12:11:18 2016
New Revision: 1723710
URL: http://svn.apache.org/viewvc?rev=1723710&view=rev
Log:
NUTCH-1838 Host and domain based regex and automaton filtering
Added:
nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules
nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 8 12:11:18 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1838 Host and domain based regex and automaton filtering (markus)
+
* NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus)
* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)
Modified:
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
(original)
+++
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
Fri Jan 8 12:11:18 2016
@@ -24,6 +24,10 @@ package org.apache.nutch.urlfilter.api;
public abstract class RegexRule {
private final boolean sign;
+
+ private final String hostOrDomain;
+
+ private final String regex;
/**
* Constructs a new regular expression rule.
@@ -38,7 +42,27 @@ public abstract class RegexRule {
* {@link #match(String)} method).
*/
protected RegexRule(boolean sign, String regex) {
+ this(sign, regex, null);
+ }
+
+ /**
+ * Constructs a new regular expression rule.
+ *
+ * @param sign
+ * specifies if this rule must filter-in or filter-out. A
+ * <code>true</code> value means that any url matching this rule must
+ * be accepted, a <code>false</code> value means that any url
+ * matching this rule must be rejected.
+ * @param regex
+ * is the regular expression used for matching (see
+ * {@link #match(String)} method).
+ * @param hostOrDomain
+ * the host or domain to which this regex belongs
+ */
+ protected RegexRule(boolean sign, String regex, String hostOrDomain) {
this.sign = sign;
+ this.hostOrDomain = hostOrDomain;
+ this.regex = regex;
}
/**
@@ -52,6 +76,20 @@ public abstract class RegexRule {
}
/**
+ * Return the host or domain this rule is restricted to, or <code>null</code> if it is not restricted.
+ *
+ * @return host or domain this regex rule belongs to
+ */
+ protected String hostOrDomain() { return hostOrDomain; }
+
+ /**
+ * Return this rule's regular expression.
+ *
+ * @return this regex
+ */
+ protected String regex() { return regex; }
+
+ /**
* Checks if a url matches this rule.
*
* @param url
Modified:
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
(original)
+++
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
Fri Jan 8 12:11:18 2016
@@ -24,6 +24,7 @@ import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;
+import java.net.MalformedURLException;
import java.util.List;
import java.util.ArrayList;
@@ -36,6 +37,7 @@ import org.apache.hadoop.conf.Configurat
// Nutch imports
import org.apache.nutch.net.*;
+import org.apache.nutch.util.URLUtil;
/**
* Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
@@ -123,6 +125,20 @@ public abstract class RegexURLFilterBase
* is the regular expression associated to this rule.
*/
protected abstract RegexRule createRule(boolean sign, String regex);
+
+ /**
+ * Creates a new {@link RegexRule}.
+ * @param sign
+ * the sign of the regular expression.
+ * A <code>true</code> value means that any URL matching this rule
+ * must be included, whereas a <code>false</code>
+ * value means that any URL matching this rule must be excluded.
+ * @param regex
+ * is the regular expression associated to this rule.
+ * @param hostOrDomain
+ * the host or domain to which this regex belongs
+ */
+ protected abstract RegexRule createRule(boolean sign, String regex, String
hostOrDomain);
/**
* Returns the name of the file of rules to use for a particular
@@ -142,7 +158,35 @@ public abstract class RegexURLFilterBase
// Inherited Javadoc
public String filter(String url) {
+ String host = URLUtil.getHost(url);
+ String domain = null;
+
+ try {
+ domain = URLUtil.getDomainName(url);
+ } catch (MalformedURLException e) {
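+ // Presumably unreachable for URLs that get this far (TODO confirm); if it does happen, domain stays null and rules can only be selected by host.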
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("URL belongs to host " + host + " and domain " + domain);
+ }
+
for (RegexRule rule : rules) {
+ // Skip rules that are bound to a host or domain matching neither this URL's host nor its domain
+ if (rule.hostOrDomain() != null &&
+ !rule.hostOrDomain().equals(host) &&
+ !rule.hostOrDomain().equals(domain)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Skipping rule [" + rule.regex() + "] for host: " +
rule.hostOrDomain());
+ }
+
+ continue;
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + "
and domain " + domain);
+ }
+
if (rule.match(url)) {
return rule.accept() ? url : null;
}
@@ -204,7 +248,8 @@ public abstract class RegexURLFilterBase
BufferedReader in = new BufferedReader(reader);
List<RegexRule> rules = new ArrayList<RegexRule>();
String line;
-
+ String hostOrDomain = null;
+
while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
@@ -222,15 +267,21 @@ public abstract class RegexURLFilterBase
case '\n':
case '#': // skip blank & comment lines
continue;
+ case '>':
+ hostOrDomain = line.substring(1).trim();
+ continue;
+ case '<':
+ hostOrDomain = null;
+ continue;
default:
throw new IOException("Invalid first character: " + line);
}
String regex = line.substring(1);
if (LOG.isTraceEnabled()) {
- LOG.trace("Adding rule [" + regex + "]");
+ LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
}
- RegexRule rule = createRule(sign, regex);
+ RegexRule rule = createRule(sign, regex, hostOrDomain);
rules.add(rule);
}
return rules;
Modified:
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
(original)
+++
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
Fri Jan 8 12:11:18 2016
@@ -80,6 +80,10 @@ public class AutomatonURLFilter extends
protected RegexRule createRule(boolean sign, String regex) {
return new Rule(sign, regex);
}
+
+ protected RegexRule createRule(boolean sign, String regex, String
hostOrDomain) {
+ return new Rule(sign, regex, hostOrDomain);
+ }
/*
* ------------------------------------ *
</implementation:RegexURLFilterBase>
@@ -98,6 +102,11 @@ public class AutomatonURLFilter extends
super(sign, regex);
automaton = new RunAutomaton(new RegExp(regex,
RegExp.ALL).toAutomaton());
}
+
+ Rule(boolean sign, String regex, String hostOrDomain) {
+ super(sign, regex, hostOrDomain);
+ automaton = new RunAutomaton(new RegExp(regex,
RegExp.ALL).toAutomaton());
+ }
protected boolean match(String url) {
return automaton.run(url);
Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules?rev=1723710&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules (added)
+++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules Fri Jan 8
12:11:18 2016
@@ -0,0 +1,12 @@
+# Skip all URLs containing "skip" for www.example.org
+> www.example.org
+-skip
+<
+
+# Allow all URLs containing "skip" for www.example.com
+> www.example.com
++skip
+<
+
+# Skip everything else
+-.
\ No newline at end of file
Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls?rev=1723710&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls (added)
+++ nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Fri Jan 8
12:11:18 2016
@@ -0,0 +1,3 @@
+-http://www.example.org/skip-me-now
++http://www.example.com/noone-can-skip-me
+-http://www.example.nl/i-am-filtered
\ No newline at end of file
Modified:
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
(original)
+++
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
Fri Jan 8 12:11:18 2016
@@ -72,6 +72,12 @@ public class RegexURLFilter extends Rege
protected RegexRule createRule(boolean sign, String regex) {
return new Rule(sign, regex);
}
+
+ protected RegexRule createRule(boolean sign, String regex, String
hostOrDomain) {
+ return new Rule(sign, regex, hostOrDomain);
+ }
+
+
/*
* ------------------------------------ *
</implementation:RegexURLFilterBase>
@@ -89,7 +95,11 @@ public class RegexURLFilter extends Rege
private Pattern pattern;
Rule(boolean sign, String regex) {
- super(sign, regex);
+ this(sign, regex, null);
+ }
+
+ Rule(boolean sign, String regex, String hostOrDomain) {
+ super(sign, regex, hostOrDomain);
pattern = Pattern.compile(regex);
}
Modified:
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1723710&r1=1723709&r2=1723710&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
(original)
+++
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
Fri Jan 8 12:11:18 2016
@@ -52,5 +52,10 @@ public class TestRegexURLFilter extends
bench(400, "Benchmarks");
bench(800, "Benchmarks");
}
+
+ @Test
+ public void test1838() {
+ test("nutch1838");
+ }
}