Author: snagel
Date: Tue Oct 23 20:51:35 2012
New Revision: 1401459
URL: http://svn.apache.org/viewvc?rev=1401459&view=rev
Log:
NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1401459&r1=1401458&r2=1401459&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 23 20:51:35 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel)
+
* NUTCH-1341 NotModified time set to now but page not modified (markus)
* NUTCH-1215 UpdateDB should not require segment as input (markus)
Modified:
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1401459&r1=1401458&r2=1401459&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Tue Oct 23 20:51:35 2012
@@ -247,7 +247,14 @@ public class RegexURLNormalizer extends
}
if (patternValue != null && subValue != null) {
Rule rule = new Rule();
- rule.pattern = Pattern.compile(patternValue);
+ try {
+ rule.pattern = Pattern.compile(patternValue);
+ } catch (PatternSyntaxException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e);
+ }
+ continue;
+ }
rule.substitution = subValue;
rules.add(rule);
}