Author: markus
Date: Tue Jun 12 10:15:44 2012
New Revision: 1349227
URL: http://svn.apache.org/viewvc?rev=1349227&view=rev
Log:
NUTCH-1352 Improve regex urlfilters/normalizers synchronization
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 10:15:44 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy via markus)
+
* NUTCH-1024 Dynamically set fetchInterval by MIME-type (markus)
* NUTCH-1364 Add a counter in Generator for malformed urls (lewismc)
Modified:
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Tue Jun 12 10:15:44 2012
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.urlfilter.api;
-import org.apache.nutch.net.*;
/**
@@ -26,8 +25,7 @@ import org.apache.nutch.net.*;
*/
public abstract class RegexRule {
- private boolean sign;
- private String regex;
+ private final boolean sign;
/**
* Constructs a new regular expression rule.
@@ -41,7 +39,6 @@ public abstract class RegexRule {
*/
protected RegexRule(boolean sign, String regex) {
this.sign = sign;
- this.regex = regex;
}
/**
Modified:
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Tue Jun 12 10:15:44 2012
@@ -61,7 +61,7 @@ public abstract class RegexURLFilterBase
private final static Logger LOG =
LoggerFactory.getLogger(RegexURLFilterBase.class);
/** An array of applicable rules */
- private RegexRule[] rules;
+ private List<RegexRule> rules;
/** The current configuration */
private Configuration conf;
@@ -125,10 +125,10 @@ public abstract class RegexURLFilterBase
* -------------------------- */
// Inherited Javadoc
- public synchronized String filter(String url) {
- for (int i=0; i<rules.length; i++) {
- if (rules[i].match(url)) {
- return rules[i].accept() ? url : null;
+ public String filter(String url) {
+ for (RegexRule rule : rules) {
+ if (rule.match(url)) {
+ return rule.accept() ? url : null;
}
};
return null;
@@ -174,11 +174,11 @@ public abstract class RegexURLFilterBase
* @param reader is a reader of regular expressions rules.
* @return the corresponding {@RegexRule rules}.
*/
- private RegexRule[] readRules(Reader reader)
+ private List<RegexRule> readRules(Reader reader)
throws IOException, IllegalArgumentException {
BufferedReader in = new BufferedReader(reader);
- List rules = new ArrayList();
+ List<RegexRule> rules = new ArrayList<RegexRule>();
String line;
while((line=in.readLine())!=null) {
@@ -205,7 +205,7 @@ public abstract class RegexURLFilterBase
RegexRule rule = createRule(sign, regex);
rules.add(rule);
}
- return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]);
+ return rules;
}
/**
Modified:
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Tue Jun 12 10:15:44 2012
@@ -17,15 +17,13 @@
package org.apache.nutch.urlfilter.regex;
// JDK imports
-import java.io.Reader;
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.*;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
import org.apache.nutch.util.NutchConfiguration;
Modified:
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Tue Jun 12 10:15:44 2012
@@ -28,22 +28,23 @@ import org.slf4j.LoggerFactory;
import org.apache.nutch.net.URLNormalizer;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.oro.text.regex.*;
/** Converts URLs to a normal form . */
-public class BasicURLNormalizer implements URLNormalizer {
+public class BasicURLNormalizer extends Configured implements URLNormalizer {
public static final Logger LOG =
LoggerFactory.getLogger(BasicURLNormalizer.class);
private Perl5Compiler compiler = new Perl5Compiler();
- private ThreadLocal matchers = new ThreadLocal() {
- protected synchronized Object initialValue() {
+ private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
+ protected Perl5Matcher initialValue() {
return new Perl5Matcher();
}
};
- private Rule relativePathRule = null;
- private Rule leadingRelativePathRule = null;
- private Rule currentPathRule = null;
- private Rule adjacentSlashRule = null;
+ private final Rule relativePathRule;
+ private final Rule leadingRelativePathRule;
+ private final Rule currentPathRule;
+ private final Rule adjacentSlashRule;
private Configuration conf;
@@ -80,7 +81,6 @@ public class BasicURLNormalizer implemen
adjacentSlashRule.substitution = new Perl5Substitution("/");
} catch (MalformedPatternException e) {
- LOG.error("Error: ", e);
throw new RuntimeException(e);
}
}
Modified:
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Tue Jun 12 10:15:44 2012
@@ -17,36 +17,36 @@
package org.apache.nutch.net.urlnormalizer.regex;
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
-import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
-
+import java.net.MalformedURLException;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
-import java.util.List;
-import java.util.ArrayList;
import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
-
-import javax.xml.parsers.*;
-import org.w3c.dom.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
import org.xml.sax.InputSource;
-import java.util.regex.*;
-
/**
* Allows users to do regex substitutions on all/any URLs that are encountered,
* which is useful for stripping session IDs from URLs.
@@ -75,9 +75,20 @@ public class RegexURLNormalizer extends
public String substitution;
}
- private HashMap scopedRules;
+ private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal =
+ new ThreadLocal<HashMap<String,List<Rule>>>() {
+ protected java.util.HashMap<String,java.util.List<Rule>> initialValue() {
+ return new HashMap<String, List<Rule>>();
+ };
+ };
+
+ public HashMap<String, List<Rule>> getScopedRules() {
+ return scopedRulesThreadLocal.get();
+ }
+
+ private List<Rule> defaultRules;
- private static final List EMPTY_RULES = Collections.EMPTY_LIST;
+ private static final List<Rule> EMPTY_RULES = Collections.emptyList();
/**
* The default constructor which is called from UrlNormalizerFactory
@@ -98,45 +109,44 @@ public class RegexURLNormalizer extends
public RegexURLNormalizer(Configuration conf, String filename)
throws IOException, PatternSyntaxException {
super(conf);
- List rules = readConfigurationFile(filename);
- if (rules != null)
- scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
+ List<Rule> rules = readConfigurationFile(filename);
+ if (rules != null) {
+ defaultRules = rules;
+ }
}
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf == null) return;
// the default constructor was called
- if (this.scopedRules == null) {
- String filename = getConf().get("urlnormalizer.regex.file");
- String stringRules = getConf().get("urlnormalizer.regex.rules");
- scopedRules = new HashMap();
- Reader reader = null;
- if (stringRules != null) {
- reader = new StringReader(stringRules);
- } else {
- reader = getConf().getConfResourceAsReader(filename);
- }
- List rules = null;
- if (reader == null) {
- LOG.warn("Can't load the default rules! ");
+
+ String filename = getConf().get("urlnormalizer.regex.file");
+ String stringRules = getConf().get("urlnormalizer.regex.rules");
+ Reader reader = null;
+ if (stringRules != null) {
+ reader = new StringReader(stringRules);
+ } else {
+ reader = getConf().getConfResourceAsReader(filename);
+ }
+ List<Rule> rules = null;
+ if (reader == null) {
+ LOG.warn("Can't load the default rules! ");
+ rules = EMPTY_RULES;
+ } else {
+ try {
+ rules = readConfiguration(reader);
+ } catch (Exception e) {
+ LOG.warn("Couldn't read default config: " + e);
rules = EMPTY_RULES;
- } else {
- try {
- rules = readConfiguration(reader);
- } catch (Exception e) {
- LOG.warn("Couldn't read default config: " + e);
- rules = EMPTY_RULES;
- }
}
- scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
}
+ defaultRules = rules;
}
// used in JUnit test.
void setConfiguration(Reader reader, String scope) {
- List rules = readConfiguration(reader);
- scopedRules.put(scope, rules);
+ List<Rule> rules = readConfiguration(reader);
+ getScopedRules().put(scope, rules);
LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
}
@@ -144,23 +154,20 @@ public class RegexURLNormalizer extends
* This function does the replacements by iterating through all the regex
* patterns. It accepts a string url as input and returns the altered string.
*/
- public synchronized String regexNormalize(String urlString, String scope) {
- List curRules = (List)scopedRules.get(scope);
+ public String regexNormalize(String urlString, String scope) {
+ HashMap<String, List<Rule>> scopedRules = getScopedRules();
+ List<Rule> curRules = scopedRules.get(scope);
if (curRules == null) {
// try to populate
String configFile = getConf().get("urlnormalizer.regex.file." + scope);
if (configFile != null) {
LOG.debug("resource for scope '" + scope + "': " + configFile);
- if (configFile == null) {
- LOG.warn("Can't load resource for config file: " + configFile);
- } else {
- try {
- Reader reader = getConf().getConfResourceAsReader(configFile);
- curRules = readConfiguration(reader);
- scopedRules.put(scope, curRules);
- } catch (Exception e) {
- LOG.warn("Couldn't load resource '" + configFile + "': " + e);
- }
+ try {
+ Reader reader = getConf().getConfResourceAsReader(configFile);
+ curRules = readConfiguration(reader);
+ scopedRules.put(scope, curRules);
+ } catch (Exception e) {
+ LOG.warn("Couldn't load resource '" + configFile + "': " + e);
}
}
if (curRules == EMPTY_RULES || curRules == null) {
@@ -169,10 +176,9 @@ public class RegexURLNormalizer extends
}
}
if (curRules == EMPTY_RULES || curRules == null) {
- // use global rules
- curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+ curRules = defaultRules;
}
- Iterator i = curRules.iterator();
+ Iterator<Rule> i = curRules.iterator();
while (i.hasNext()) {
Rule r = (Rule) i.next();
@@ -183,13 +189,13 @@ public class RegexURLNormalizer extends
return urlString;
}
- public synchronized String normalize(String urlString, String scope)
+ public String normalize(String urlString, String scope)
throws MalformedURLException {
return regexNormalize(urlString, scope);
}
/** Reads the configuration file and populates a List of Rules. */
- private List readConfigurationFile(String filename) {
+ private List<Rule> readConfigurationFile(String filename) {
if (LOG.isInfoEnabled()) {
LOG.info("loading " + filename);
}
@@ -202,8 +208,8 @@ public class RegexURLNormalizer extends
}
}
- private List readConfiguration(Reader reader) {
- List rules = new ArrayList();
+ private List<Rule> readConfiguration(Reader reader) {
+ List<Rule> rules = new ArrayList<Rule>();
try {
// borrowed heavily from code in Configuration.java
@@ -261,10 +267,11 @@ public class RegexURLNormalizer extends
IOException {
RegexURLNormalizer normalizer = new RegexURLNormalizer();
normalizer.setConf(NutchConfiguration.create());
- Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
+ HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
+ Iterator<Rule> i = normalizer.defaultRules.iterator();
System.out.println("* Rules for 'DEFAULT' scope:");
while (i.hasNext()) {
- Rule r = (Rule) i.next();
+ Rule r = i.next();
System.out.print(" " + r.pattern.pattern() + " -> ");
System.out.println(r.substitution);
}
@@ -272,13 +279,13 @@ public class RegexURLNormalizer extends
if (args.length > 1) {
normalizer.normalize("http://test.com", args[1]);
}
- if (normalizer.scopedRules.size() > 1) {
- Iterator it = normalizer.scopedRules.keySet().iterator();
+ if (scopedRules.size() > 1) {
+ Iterator<String> it = scopedRules.keySet().iterator();
while (it.hasNext()) {
- String scope = (String)it.next();
+ String scope = it.next();
if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue;
System.out.println("* Rules for '" + scope + "' scope:");
- i = ((List)normalizer.scopedRules.get(scope)).iterator();
+ i = ((List<Rule>)scopedRules.get(scope)).iterator();
while (i.hasNext()) {
Rule r = (Rule) i.next();
System.out.print(" " + r.pattern.pattern() + " -> ");