Author: snagel
Date: Tue Oct 23 20:51:35 2012
New Revision: 1401459

URL: http://svn.apache.org/viewvc?rev=1401459&view=rev
Log:
NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1401459&r1=1401458&r2=1401459&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 23 20:51:35 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel)
+
 * NUTCH-1341 NotModified time set to now but page not modified (markus)
 
 * NUTCH-1215 UpdateDB should not require segment as input (markus)

Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1401459&r1=1401458&r2=1401459&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Tue Oct 23 20:51:35 2012
@@ -247,7 +247,14 @@ public class RegexURLNormalizer extends 
         }
         if (patternValue != null && subValue != null) {
           Rule rule = new Rule();
-          rule.pattern = Pattern.compile(patternValue);
+          try {
+            rule.pattern = Pattern.compile(patternValue);
+          } catch (PatternSyntaxException e) {
+            if (LOG.isErrorEnabled()) {
+              LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e);
+            }
+            continue;
+          }
           rule.substitution = subValue;
           rules.add(rule);
         }


Reply via email to