Update of /cvsroot/nutch/nutch/src/java/net/nutch/net
In directory 
sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21785/src/java/net/nutch/net

Modified Files:
        BasicUrlNormalizer.java 
Log Message:
Improved BasicUrlNormalizer to better handle relative urls.  The file
part of a URL is normalized in the following manner:

  1. "/aa/../" will be replaced by "/". This is done step by step until
     the url doesn't change anymore. This ensures that
     "/aa/bb/../../" will be replaced by "/", too.

  2. leading "/../" will be replaced by "/"

(Sven Wende via cutting)


Index: BasicUrlNormalizer.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/net/BasicUrlNormalizer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** BasicUrlNormalizer.java     7 Sep 2004 19:26:07 -0000       1.1
--- BasicUrlNormalizer.java     10 Dec 2004 19:37:31 -0000      1.2
***************
*** 11,70 ****
  import java.util.logging.Logger;
  import net.nutch.util.LogFormatter;
  
  /** Converts URLs to a normal form . */
  public class BasicUrlNormalizer implements UrlNormalizer {
!   public static final Logger LOG =
!     LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");
!   
!   public String normalize(String urlString)
!     throws MalformedURLException {
!     if ("".equals(urlString))                     // permit empty
!       return urlString;
  
!     urlString = urlString.trim();                 // remove extra spaces
  
!     URL url = new URL(urlString);
  
!     String protocol = url.getProtocol();
!     String host = url.getHost();
!     int port = url.getPort();
!     String file = url.getFile();
  
!     boolean changed = false;
  
!     if (!urlString.startsWith(protocol))        // protocol was lowercased
!       changed = true;
  
-     if ("http".equals(protocol) || "ftp".equals(protocol)) {
-       
-       if (host != null) {
-         String newHost = host.toLowerCase();    // lowercase host
-         if (!host.equals(newHost)) {
-           host = newHost;
-           changed = true;
          }
-       }
  
!       if (port == url.getDefaultPort()) {       // uses default port
!         port = -1;                              // so don't specify it
!         changed = true;
!       }
  
!       if (file == null || "".equals(file)) {    // add a slash
!         file = "/";
!         changed = true;
!       }
  
!       if (url.getRef() != null) {                 // remove the ref
!         changed = true;
!       }
  
      }
  
-     if (changed)
-       urlString = new URL(protocol, host, port, file).toString();
  
!     return urlString;
!   }
  
  }
--- 11,156 ----
  import java.util.logging.Logger;
  import net.nutch.util.LogFormatter;
+ import org.apache.oro.text.regex.*;
  
  /** Converts URLs to a normal form . */
  public class BasicUrlNormalizer implements UrlNormalizer {
!     public static final Logger LOG =
!             LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");
  
!     private Perl5Compiler compiler = new Perl5Compiler();
!     private PatternMatcher matcher = new Perl5Matcher();
!     private Rule relativePathRule = null;
!     private Rule leadingRelativePathRule = null;
  
!     public BasicUrlNormalizer() {
!       try {
!         // this pattern tries to find spots like "/xx/../" in the url, which
!         // could be replaced by "/" xx consists of chars, different then "/"
!         // (slash) and needs to have at least one char different from "."
!         relativePathRule = new Rule();
!         relativePathRule.pattern = (Perl5Pattern)
!           compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
!         relativePathRule.substitution = new Perl5Substitution("/");
  
!         // this pattern tries to find spots like leading "/../" in the url,
!         // which could be replaced by "/"
!         leadingRelativePathRule = new Rule();
!         leadingRelativePathRule.pattern = (Perl5Pattern)
!           compiler.compile("^(/\\.\\./)+");
!         leadingRelativePathRule.substitution = new Perl5Substitution("/");
  
!       } catch (MalformedPatternException e) {
!         e.printStackTrace();
!         throw new RuntimeException(e);
!       }
!     }
  
!     public String normalize(String urlString)
!             throws MalformedURLException {
!         if ("".equals(urlString))                     // permit empty
!             return urlString;
! 
!         urlString = urlString.trim();                 // remove extra spaces
! 
!         URL url = new URL(urlString);
! 
!         String protocol = url.getProtocol();
!         String host = url.getHost();
!         int port = url.getPort();
!         String file = url.getFile();
! 
!         boolean changed = false;
! 
!         if (!urlString.startsWith(protocol))        // protocol was lowercased
!             changed = true;
! 
!         if ("http".equals(protocol) || "ftp".equals(protocol)) {
! 
!             if (host != null) {
!                 String newHost = host.toLowerCase();    // lowercase host
!                 if (!host.equals(newHost)) {
!                     host = newHost;
!                     changed = true;
!                 }
!             }
! 
!             if (port == url.getDefaultPort()) {       // uses default port
!                 port = -1;                              // so don't specify it
!                 changed = true;
!             }
! 
!             if (file == null || "".equals(file)) {    // add a slash
!                 file = "/";
!                 changed = true;
!             }
! 
!             if (url.getRef() != null) {                 // remove the ref
!                 changed = true;
!             }
! 
!             // check for unnecessary use of "/../"
!             String file2 = substituteUnnecessaryRelativePaths(file);
! 
!             if (!file.equals(file2)) {
!                 changed = true;
!                 file = file2;
!             }
  
          }
  
!         if (changed)
!             urlString = new URL(protocol, host, port, file).toString();
  
!         return urlString;
!     }
  
!     private String substituteUnnecessaryRelativePaths(String file) {
!         String fileWorkCopy = file;
!         int oldLen = file.length();
!         int newLen = oldLen - 1;
! 
!         // All substitutions will be done step by step, to ensure that certain
!         // constellations will be normalized, too
!         //
!         // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
!         // following manner:
!         //   "/aa/bb/../../cc/../foo.html"
!         //   "/aa/../cc/../foo.html"
!         //   "/cc/../foo.html"
!         //   "/foo.html"
!         //
!         // The normalization also takes care of leading "/../", which will be
!         // replaced by "/", because this is a rather a sign of bad webserver
!         // configuration than of a wanted link.  For example, urls like
!         // "http://www.foo.com/../" should return a http 404 error instead of
!         // redirecting to "http://www.foo.com".
!         //
!         while (oldLen != newLen) {
!             // substitue first occurence of "/xx/../" by "/"
!             oldLen = fileWorkCopy.length();
!             fileWorkCopy = Util.substitute
!               (matcher, relativePathRule.pattern,
!                relativePathRule.substitution, fileWorkCopy, 1);
  
+             // remove leading "/../"
+             fileWorkCopy = Util.substitute
+               (matcher, leadingRelativePathRule.pattern,
+                leadingRelativePathRule.substitution, fileWorkCopy, 1);
+             newLen = fileWorkCopy.length();
+         }
+ 
+         return fileWorkCopy;
      }
  
  
!     /**
!      * Class which holds a compiled pattern and its corresponding substition
!      * string.
!      */
!     private static class Rule {
!         public Perl5Pattern pattern;
!         public Perl5Substitution substitution;
!     }
  
  }
+ 



-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to