Author: snagel
Date: Tue Oct 23 20:52:21 2012
New Revision: 1401460

URL: http://svn.apache.org/viewvc?rev=1401460&view=rev
Log:
NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1401460&r1=1401459&r2=1401460&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Oct 23 20:52:21 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel)
+
 * NUTCH-1433 Upgrade to Tika 1.2 (jnioche)
 
 * NUTCH-1087 Deprecate crawl command and replace with example script (jnioche)

Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1401460&r1=1401459&r2=1401460&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Tue Oct 23 20:52:21 2012
@@ -247,7 +247,14 @@ public class RegexURLNormalizer extends 
         }
         if (patternValue != null && subValue != null) {
           Rule rule = new Rule();
-          rule.pattern = Pattern.compile(patternValue);
+          try {
+            rule.pattern = Pattern.compile(patternValue);
+          } catch (PatternSyntaxException e) {
+            if (LOG.isErrorEnabled()) {
+              LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e);
+            }
+            continue;
+          }
           rule.substitution = subValue;
           rules.add(rule);
         }


Reply via email to