Author: markus
Date: Tue Jun 12 10:15:44 2012
New Revision: 1349227

URL: http://svn.apache.org/viewvc?rev=1349227&view=rev
Log:
NUTCH-1352 Improve regex urlfilters/normalizers synchronization

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
    nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
    nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
    nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 10:15:44 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1352 Improve regex urlfilters/normalizers synchronization (ferdy via markus)
+
 * NUTCH-1024 Dynamically set fetchInterval by MIME-type (markus)
 
 * NUTCH-1364 Add a counter in Generator for malformed urls (lewismc)

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 Tue Jun 12 10:15:44 2012
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.urlfilter.api;
 
-import org.apache.nutch.net.*;
 
 
 /**
@@ -26,8 +25,7 @@ import org.apache.nutch.net.*;
  */
 public abstract class RegexRule {
 
-  private boolean sign;
-  private String regex;
+  private final boolean sign;
 
   /**
    * Constructs a new regular expression rule.
@@ -41,7 +39,6 @@ public abstract class RegexRule {
    */
   protected RegexRule(boolean sign, String regex) {
     this.sign = sign;
-    this.regex = regex;
   }
 
   /**

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 Tue Jun 12 10:15:44 2012
@@ -61,7 +61,7 @@ public abstract class RegexURLFilterBase
   private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class);
 
   /** An array of applicable rules */
-  private RegexRule[] rules;
+  private List<RegexRule> rules;
 
   /** The current configuration */
   private Configuration conf;
@@ -125,10 +125,10 @@ public abstract class RegexURLFilterBase
    * -------------------------- */
   
   // Inherited Javadoc
-  public synchronized String filter(String url) {
-    for (int i=0; i<rules.length; i++) {
-      if (rules[i].match(url)) {
-        return rules[i].accept() ? url : null;
+  public String filter(String url) {
+    for (RegexRule rule : rules) {
+      if (rule.match(url)) {
+        return rule.accept() ? url : null;
       }
     };
     return null;
@@ -174,11 +174,11 @@ public abstract class RegexURLFilterBase
    * @param reader is a reader of regular expressions rules.
    * @return the corresponding {@RegexRule rules}.
    */
-  private RegexRule[] readRules(Reader reader)
+  private List<RegexRule> readRules(Reader reader)
     throws IOException, IllegalArgumentException {
 
     BufferedReader in = new BufferedReader(reader);
-    List rules = new ArrayList();
+    List<RegexRule> rules = new ArrayList<RegexRule>();
     String line;
        
     while((line=in.readLine())!=null) {
@@ -205,7 +205,7 @@ public abstract class RegexURLFilterBase
       RegexRule rule = createRule(sign, regex);
       rules.add(rule);
     }
-    return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]);
+    return rules;
   }
 
   /**

Modified: 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
 Tue Jun 12 10:15:44 2012
@@ -17,15 +17,13 @@
 package org.apache.nutch.urlfilter.regex;
 
 // JDK imports
-import java.io.Reader;
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
-// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.*;
 import org.apache.nutch.urlfilter.api.RegexRule;
 import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
 import org.apache.nutch.util.NutchConfiguration;

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 Tue Jun 12 10:15:44 2012
@@ -28,22 +28,23 @@ import org.slf4j.LoggerFactory;
 import org.apache.nutch.net.URLNormalizer;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.oro.text.regex.*;
 
 /** Converts URLs to a normal form . */
-public class BasicURLNormalizer implements URLNormalizer {
+public class BasicURLNormalizer extends Configured implements URLNormalizer {
     public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class);
 
     private Perl5Compiler compiler = new Perl5Compiler();
-    private ThreadLocal matchers = new ThreadLocal() {
-        protected synchronized Object initialValue() {
+    private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
+        protected Perl5Matcher initialValue() {
           return new Perl5Matcher();
         }
       };
-    private Rule relativePathRule = null;
-    private Rule leadingRelativePathRule = null;
-    private Rule currentPathRule = null;
-    private Rule adjacentSlashRule = null;
+    private final Rule relativePathRule;
+    private final Rule leadingRelativePathRule;
+    private final Rule currentPathRule;
+    private final Rule adjacentSlashRule;
 
     private Configuration conf;
 
@@ -80,7 +81,6 @@ public class BasicURLNormalizer implemen
         adjacentSlashRule.substitution = new Perl5Substitution("/");
         
       } catch (MalformedPatternException e) {
-        LOG.error("Error: ", e);
         throw new RuntimeException(e);
       }
     }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1349227&r1=1349226&r2=1349227&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 Tue Jun 12 10:15:44 2012
@@ -17,36 +17,36 @@
 
 package org.apache.nutch.net.urlnormalizer.regex;
 
-import java.net.URL;
-import java.net.MalformedURLException;
-import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
-
+import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
-import java.util.List;
-import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-
 import org.apache.nutch.net.URLNormalizer;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NutchConfiguration;
-
-import javax.xml.parsers.*;
-import org.w3c.dom.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
 import org.xml.sax.InputSource;
 
-import java.util.regex.*;
-
 /**
  * Allows users to do regex substitutions on all/any URLs that are encountered,
  * which is useful for stripping session IDs from URLs.
@@ -75,9 +75,20 @@ public class RegexURLNormalizer extends 
     public String substitution;
   }
 
-  private HashMap scopedRules;
+  private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = 
+      new ThreadLocal<HashMap<String,List<Rule>>>() {
+    protected java.util.HashMap<String,java.util.List<Rule>> initialValue() {
+      return new HashMap<String, List<Rule>>();
+    };
+  };
+  
+  public HashMap<String, List<Rule>> getScopedRules() {
+    return scopedRulesThreadLocal.get();
+  }
+  
+  private List<Rule> defaultRules; 
   
-  private static final List EMPTY_RULES = Collections.EMPTY_LIST;
+  private static final List<Rule> EMPTY_RULES = Collections.emptyList();
 
   /**
    * The default constructor which is called from UrlNormalizerFactory
@@ -98,45 +109,44 @@ public class RegexURLNormalizer extends 
   public RegexURLNormalizer(Configuration conf, String filename)
           throws IOException, PatternSyntaxException {
     super(conf);
-    List rules = readConfigurationFile(filename);
-    if (rules != null)
-      scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
+    List<Rule> rules = readConfigurationFile(filename);
+    if (rules != null) {
+      defaultRules = rules;
+    }
   }
 
   public void setConf(Configuration conf) {
     super.setConf(conf);
     if (conf == null) return;
     // the default constructor was called
-    if (this.scopedRules == null) {
-      String filename = getConf().get("urlnormalizer.regex.file");
-      String stringRules = getConf().get("urlnormalizer.regex.rules");
-      scopedRules = new HashMap();
-      Reader reader = null;
-      if (stringRules != null) {
-        reader = new StringReader(stringRules);
-      } else {
-        reader = getConf().getConfResourceAsReader(filename);
-      }
-      List rules = null;
-      if (reader == null) {
-        LOG.warn("Can't load the default rules! ");
+
+    String filename = getConf().get("urlnormalizer.regex.file");
+    String stringRules = getConf().get("urlnormalizer.regex.rules");
+    Reader reader = null;
+    if (stringRules != null) {
+      reader = new StringReader(stringRules);
+    } else {
+      reader = getConf().getConfResourceAsReader(filename);
+    }
+    List<Rule> rules = null;
+    if (reader == null) {
+      LOG.warn("Can't load the default rules! ");
+      rules = EMPTY_RULES;
+    } else {
+      try {
+        rules = readConfiguration(reader);
+      } catch (Exception e) {
+        LOG.warn("Couldn't read default config: " + e);
         rules = EMPTY_RULES;
-      } else {
-        try {
-          rules = readConfiguration(reader);
-        } catch (Exception e) {
-          LOG.warn("Couldn't read default config: " + e);
-          rules = EMPTY_RULES;
-        }
       }
-      scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
     }
+    defaultRules = rules;
   }
 
   // used in JUnit test.
   void setConfiguration(Reader reader, String scope) {
-    List rules = readConfiguration(reader);
-    scopedRules.put(scope, rules);
+    List<Rule> rules = readConfiguration(reader);
+    getScopedRules().put(scope, rules);
     LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
   }
   
@@ -144,23 +154,20 @@ public class RegexURLNormalizer extends 
    * This function does the replacements by iterating through all the regex
    * patterns. It accepts a string url as input and returns the altered string.
    */
-  public synchronized String regexNormalize(String urlString, String scope) {
-    List curRules = (List)scopedRules.get(scope);
+  public String regexNormalize(String urlString, String scope) {
+    HashMap<String, List<Rule>> scopedRules = getScopedRules();
+    List<Rule> curRules = scopedRules.get(scope);
     if (curRules == null) {
       // try to populate
       String configFile = getConf().get("urlnormalizer.regex.file." + scope);
       if (configFile != null) {
         LOG.debug("resource for scope '" + scope + "': " + configFile);
-        if (configFile == null) {
-          LOG.warn("Can't load resource for config file: " + configFile);
-        } else {
-          try {
-            Reader reader = getConf().getConfResourceAsReader(configFile);
-            curRules = readConfiguration(reader);
-            scopedRules.put(scope, curRules);
-          } catch (Exception e) {
-            LOG.warn("Couldn't load resource '" + configFile + "': " + e);
-          }
+        try {
+          Reader reader = getConf().getConfResourceAsReader(configFile);
+          curRules = readConfiguration(reader);
+          scopedRules.put(scope, curRules);
+        } catch (Exception e) {
+          LOG.warn("Couldn't load resource '" + configFile + "': " + e);
         }
       }
       if (curRules == EMPTY_RULES || curRules == null) {
@@ -169,10 +176,9 @@ public class RegexURLNormalizer extends 
       }
     }
     if (curRules == EMPTY_RULES || curRules == null) {
-      // use global rules
-      curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+      curRules = defaultRules;
     }
-    Iterator i = curRules.iterator();
+    Iterator<Rule> i = curRules.iterator();
     while (i.hasNext()) {
       Rule r = (Rule) i.next();
 
@@ -183,13 +189,13 @@ public class RegexURLNormalizer extends 
     return urlString;
   }
 
-  public synchronized String normalize(String urlString, String scope)
+  public String normalize(String urlString, String scope)
           throws MalformedURLException {
     return regexNormalize(urlString, scope);
   }
 
   /** Reads the configuration file and populates a List of Rules. */
-  private List readConfigurationFile(String filename) {
+  private List<Rule> readConfigurationFile(String filename) {
     if (LOG.isInfoEnabled()) {
       LOG.info("loading " + filename);
     }
@@ -202,8 +208,8 @@ public class RegexURLNormalizer extends 
     }
   }
   
-  private List readConfiguration(Reader reader) {
-    List rules = new ArrayList();
+  private List<Rule> readConfiguration(Reader reader) {
+    List<Rule> rules = new ArrayList<Rule>();
     try {
 
       // borrowed heavily from code in Configuration.java
@@ -261,10 +267,11 @@ public class RegexURLNormalizer extends 
           IOException {
     RegexURLNormalizer normalizer = new RegexURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
-    Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
+    HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
+    Iterator<Rule> i = normalizer.defaultRules.iterator();
     System.out.println("* Rules for 'DEFAULT' scope:");
     while (i.hasNext()) {
-      Rule r = (Rule) i.next();
+      Rule r = i.next();
       System.out.print("  " + r.pattern.pattern() + " -> ");
       System.out.println(r.substitution);
     }
@@ -272,13 +279,13 @@ public class RegexURLNormalizer extends 
     if (args.length > 1) {
       normalizer.normalize("http://test.com", args[1]);
     }
-    if (normalizer.scopedRules.size() > 1) {
-      Iterator it = normalizer.scopedRules.keySet().iterator();
+    if (scopedRules.size() > 1) {
+      Iterator<String> it = scopedRules.keySet().iterator();
       while (it.hasNext()) {
-        String scope = (String)it.next();
+        String scope = it.next();
         if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue;
         System.out.println("* Rules for '" + scope + "' scope:");
-        i = ((List)normalizer.scopedRules.get(scope)).iterator();
+        i = ((List<Rule>)scopedRules.get(scope)).iterator();
         while (i.hasNext()) {
           Rule r = (Rule) i.next();
           System.out.print("  " + r.pattern.pattern() + " -> ");


Reply via email to