Update of /cvsroot/nutch/nutch/src/java/net/nutch/net In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21785/src/java/net/nutch/net
Modified Files: BasicUrlNormalizer.java Log Message: Improved BasicUrlNormalizer to better handle relative URLs. The file part of a URL is normalized in the following manner: 1. "/aa/../" will be replaced by "/". This is done step by step until the URL doesn't change anymore. This ensures that "/aa/bb/../../" will be replaced by "/", too. 2. A leading "/../" will be replaced by "/". (Sven Wende via cutting) Index: BasicUrlNormalizer.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/net/BasicUrlNormalizer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BasicUrlNormalizer.java 7 Sep 2004 19:26:07 -0000 1.1 --- BasicUrlNormalizer.java 10 Dec 2004 19:37:31 -0000 1.2 *************** *** 11,70 **** import java.util.logging.Logger; import net.nutch.util.LogFormatter; /** Converts URLs to a normal form . */ public class BasicUrlNormalizer implements UrlNormalizer { ! public static final Logger LOG = ! LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer"); ! ! public String normalize(String urlString) ! throws MalformedURLException { ! if ("".equals(urlString)) // permit empty ! return urlString; ! urlString = urlString.trim(); // remove extra spaces ! URL url = new URL(urlString); ! String protocol = url.getProtocol(); ! String host = url.getHost(); ! int port = url.getPort(); ! String file = url.getFile(); ! boolean changed = false; ! if (!urlString.startsWith(protocol)) // protocol was lowercased ! changed = true; - if ("http".equals(protocol) || "ftp".equals(protocol)) { - - if (host != null) { - String newHost = host.toLowerCase(); // lowercase host - if (!host.equals(newHost)) { - host = newHost; - changed = true; } - } ! if (port == url.getDefaultPort()) { // uses default port ! port = -1; // so don't specify it ! changed = true; ! } ! if (file == null || "".equals(file)) { // add a slash ! file = "/"; ! changed = true; ! } ! 
if (url.getRef() != null) { // remove the ref ! changed = true; ! } } - if (changed) - urlString = new URL(protocol, host, port, file).toString(); ! return urlString; ! } } --- 11,156 ---- import java.util.logging.Logger; import net.nutch.util.LogFormatter; + import org.apache.oro.text.regex.*; /** Converts URLs to a normal form . */ public class BasicUrlNormalizer implements UrlNormalizer { ! public static final Logger LOG = ! LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer"); ! private Perl5Compiler compiler = new Perl5Compiler(); ! private PatternMatcher matcher = new Perl5Matcher(); ! private Rule relativePathRule = null; ! private Rule leadingRelativePathRule = null; ! public BasicUrlNormalizer() { ! try { ! // this pattern tries to find spots like "/xx/../" in the url, which ! // could be replaced by "/" xx consists of chars, different then "/" ! // (slash) and needs to have at least one char different from "." ! relativePathRule = new Rule(); ! relativePathRule.pattern = (Perl5Pattern) ! compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)"); ! relativePathRule.substitution = new Perl5Substitution("/"); ! // this pattern tries to find spots like leading "/../" in the url, ! // which could be replaced by "/" ! leadingRelativePathRule = new Rule(); ! leadingRelativePathRule.pattern = (Perl5Pattern) ! compiler.compile("^(/\\.\\./)+"); ! leadingRelativePathRule.substitution = new Perl5Substitution("/"); ! } catch (MalformedPatternException e) { ! e.printStackTrace(); ! throw new RuntimeException(e); ! } ! } ! public String normalize(String urlString) ! throws MalformedURLException { ! if ("".equals(urlString)) // permit empty ! return urlString; ! ! urlString = urlString.trim(); // remove extra spaces ! ! URL url = new URL(urlString); ! ! String protocol = url.getProtocol(); ! String host = url.getHost(); ! int port = url.getPort(); ! String file = url.getFile(); ! ! boolean changed = false; ! ! 
if (!urlString.startsWith(protocol)) // protocol was lowercased ! changed = true; ! ! if ("http".equals(protocol) || "ftp".equals(protocol)) { ! ! if (host != null) { ! String newHost = host.toLowerCase(); // lowercase host ! if (!host.equals(newHost)) { ! host = newHost; ! changed = true; ! } ! } ! ! if (port == url.getDefaultPort()) { // uses default port ! port = -1; // so don't specify it ! changed = true; ! } ! ! if (file == null || "".equals(file)) { // add a slash ! file = "/"; ! changed = true; ! } ! ! if (url.getRef() != null) { // remove the ref ! changed = true; ! } ! ! // check for unnecessary use of "/../" ! String file2 = substituteUnnecessaryRelativePaths(file); ! ! if (!file.equals(file2)) { ! changed = true; ! file = file2; ! } } ! if (changed) ! urlString = new URL(protocol, host, port, file).toString(); ! return urlString; ! } ! private String substituteUnnecessaryRelativePaths(String file) { ! String fileWorkCopy = file; ! int oldLen = file.length(); ! int newLen = oldLen - 1; ! ! // All substitutions will be done step by step, to ensure that certain ! // constellations will be normalized, too ! // ! // For example: "/aa/bb/../../cc/../foo.html will be normalized in the ! // following manner: ! // "/aa/bb/../../cc/../foo.html" ! // "/aa/../cc/../foo.html" ! // "/cc/../foo.html" ! // "/foo.html" ! // ! // The normalization also takes care of leading "/../", which will be ! // replaced by "/", because this is a rather a sign of bad webserver ! // configuration than of a wanted link. For example, urls like ! // "http://www.foo.com/../" should return a http 404 error instead of ! // redirecting to "http://www.foo.com". ! // ! while (oldLen != newLen) { ! // substitue first occurence of "/xx/../" by "/" ! oldLen = fileWorkCopy.length(); ! fileWorkCopy = Util.substitute ! (matcher, relativePathRule.pattern, ! 
relativePathRule.substitution, fileWorkCopy, 1); + // remove leading "/../" + fileWorkCopy = Util.substitute + (matcher, leadingRelativePathRule.pattern, + leadingRelativePathRule.substitution, fileWorkCopy, 1); + newLen = fileWorkCopy.length(); + } + + return fileWorkCopy; } ! /** ! * Class which holds a compiled pattern and its corresponding substition ! * string. ! */ ! private static class Rule { ! public Perl5Pattern pattern; ! public Perl5Substitution substitution; ! } } + ------------------------------------------------------- SF email is sponsored by - The IT Product Guide Read honest & candid reviews on hundreds of IT Products from real users. Discover which products truly live up to the hype. Start reading now. http://productguide.itmanagersjournal.com/ _______________________________________________ Nutch-cvs mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-cvs