Author: jnioche
Date: Mon Nov 11 10:15:03 2013
New Revision: 1540654
URL: http://svn.apache.org/r1540654
Log:
NUTCH-1666 Optimisation for BasicURLNormalizer (jnioche)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1540654&r1=1540653&r2=1540654&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Nov 11 10:15:03 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1666 Optimisation for BasicURLNormalizer (jnioche)
+
* NUTCH-1656 ParseMeta not passed to CrawlDatum for not_modified (markus)
* NUTCH-1606 Check that Factory classes use the cache in a thread safe way
(jnioche)
Modified:
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1540654&r1=1540653&r2=1540654&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Mon Nov 11 10:15:03 2013
@@ -45,6 +45,8 @@ public class BasicURLNormalizer extends
private final Rule leadingRelativePathRule;
private final Rule currentPathRule;
private final Rule adjacentSlashRule;
+
+ private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern.compile("/\\.?\\.?/");
private Configuration conf;
@@ -145,6 +147,10 @@ public class BasicURLNormalizer extends
}
private String substituteUnnecessaryRelativePaths(String file) {
+
+ if (!hasNormalizablePattern.matcher(file).find())
+ return file;
+
String fileWorkCopy = file;
int oldLen = file.length();
int newLen = oldLen - 1;