Author: snagel
Date: Wed Oct 10 21:15:55 2012
New Revision: 1396800
URL: http://svn.apache.org/viewvc?rev=1396800&view=rev
Log:
NUTCH-1344 BasicURLNormalizer to normalize https same as http
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1396800&r1=1396799&r2=1396800&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Oct 10 21:15:55 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1344 BasicURLNormalizer to normalize https same as http
+
* NUTCH-706 Url regex normalizer: pattern for session id removal not to match
"newsId" (Meghna Kukreja via snagel)
Release 2.1 (19/09/2012) ddmmyyyy
Modified:
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1396800&r1=1396799&r2=1396800&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Oct 10 21:15:55 2012
@@ -104,7 +104,7 @@ public class BasicURLNormalizer extends
if (!urlString.startsWith(protocol)) // protocol was lowercased
changed = true;
- if ("http".equals(protocol) || "ftp".equals(protocol)) {
+ if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
if (host != null) {
String newHost = host.toLowerCase(); // lowercase host