Author: markus
Date: Mon Jul  4 13:44:57 2011
New Revision: 1142664

URL: http://svn.apache.org/viewvc?rev=1142664&view=rev
Log:
NUTCH-1013 Migrate RegexURLNormalizer from Apache ORO to java.util.regex

Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1142664&r1=1142663&r2=1142664&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Mon Jul  4 13:44:57 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1013 Migrate RegexURLNormalizer from Apache ORO to java.util.regex (markus)
+
 * NUTCH-1016 Strip UTF-8 non-character codepoints and add logging for SolrWriter (markus)
 
 * NUTCH-1012 Cannot handle illegal charset $charset (markus)

Modified: 
nutch/branches/branch-1.4/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1142664&r1=1142663&r2=1142664&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/branches/branch-1.4/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Mon Jul  4 13:44:57 2011
@@ -44,7 +44,8 @@ import org.apache.nutch.util.NutchConfig
 import javax.xml.parsers.*;
 import org.w3c.dom.*;
 import org.xml.sax.InputSource;
-import org.apache.oro.text.regex.*;
+
+import java.util.regex.*;
 
 /**
  * Allows users to do regex substitutions on all/any URLs that are encountered,
@@ -69,7 +70,7 @@ public class RegexURLNormalizer extends 
    * string.
    */
   private static class Rule {
-    public Perl5Pattern pattern;
+    public Pattern pattern;
 
     public String substitution;
   }
@@ -78,8 +79,6 @@ public class RegexURLNormalizer extends 
   
   private static final List EMPTY_RULES = Collections.EMPTY_LIST;
 
-  private PatternMatcher matcher = new Perl5Matcher();
-
   /**
    * The default constructor which is called from UrlNormalizerFactory
    * (normalizerClass.newInstance()) in method: getNormalizer()*
@@ -97,7 +96,7 @@ public class RegexURLNormalizer extends 
    * configuration files for it.
    */
   public RegexURLNormalizer(Configuration conf, String filename)
-          throws IOException, MalformedPatternException {
+          throws IOException, PatternSyntaxException {
     super(conf);
     List rules = readConfigurationFile(filename);
     if (rules != null)
@@ -176,9 +175,10 @@ public class RegexURLNormalizer extends 
     Iterator i = curRules.iterator();
     while (i.hasNext()) {
       Rule r = (Rule) i.next();
-      urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
-              r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual
-                                                                // substitution
+
+      Matcher matcher = r.pattern.matcher(urlString);
+
+      urlString = matcher.replaceAll(r.substitution);
     }
     return urlString;
   }
@@ -203,7 +203,6 @@ public class RegexURLNormalizer extends 
   }
   
   private List readConfiguration(Reader reader) {
-    Perl5Compiler compiler = new Perl5Compiler();
     List rules = new ArrayList();
     try {
 
@@ -242,7 +241,7 @@ public class RegexURLNormalizer extends 
         }
         if (patternValue != null && subValue != null) {
           Rule rule = new Rule();
-          rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
+          rule.pattern = Pattern.compile(patternValue);
           rule.substitution = subValue;
           rules.add(rule);
         }
@@ -258,7 +257,7 @@ public class RegexURLNormalizer extends 
   }
 
   /** Spits out patterns and substitutions that are in the configuration file. */
-  public static void main(String args[]) throws MalformedPatternException,
+  public static void main(String args[]) throws PatternSyntaxException,
           IOException {
     RegexURLNormalizer normalizer = new RegexURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
@@ -266,7 +265,7 @@ public class RegexURLNormalizer extends 
     System.out.println("* Rules for 'DEFAULT' scope:");
     while (i.hasNext()) {
       Rule r = (Rule) i.next();
-      System.out.print("  " + r.pattern.getPattern() + " -> ");
+      System.out.print("  " + r.pattern.pattern() + " -> ");
       System.out.println(r.substitution);
     }
     // load the scope
@@ -282,7 +281,7 @@ public class RegexURLNormalizer extends 
         i = ((List)normalizer.scopedRules.get(scope)).iterator();
         while (i.hasNext()) {
           Rule r = (Rule) i.next();
-          System.out.print("  " + r.pattern.getPattern() + " -> ");
+          System.out.print("  " + r.pattern.pattern() + " -> ");
           System.out.println(r.substitution);
         }
       }


Reply via email to