Modified: nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java (original) +++ nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -67,52 +67,52 @@ import org.apache.nutch.parse.Parse; */ public class URLMetaIndexingFilter implements IndexingFilter { - private static final Logger LOG = LoggerFactory - .getLogger(URLMetaIndexingFilter.class); - private static final String CONF_PROPERTY = "urlmeta.tags"; - private static String[] urlMetaTags; - private Configuration conf; - - /** - * This will take the metatags that you have listed in your "urlmeta.tags" - * property, and looks for them inside the CrawlDatum object. If they exist, - * this will add it as an attribute inside the NutchDocument. - * - * @see IndexingFilter#filter - */ - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - if (conf != null) - this.setConf(conf); - - if (urlMetaTags == null || doc == null) - return doc; - - for (String metatag : urlMetaTags) { - Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); - - if (metadata != null) - doc.add(metatag, metadata.toString()); - } - - return doc; - } - - /** Boilerplate */ - public Configuration getConf() { - return conf; - } - - /** - * handles conf assignment and pulls the value assignment from the - * "urlmeta.tags" property - */ - public void setConf(Configuration conf) { - this.conf = conf; + private static final Logger LOG = LoggerFactory + .getLogger(URLMetaIndexingFilter.class); + private static final String CONF_PROPERTY = "urlmeta.tags"; + private static String[] urlMetaTags; + private Configuration conf; + + /** + * This will take the metatags that you have listed in your "urlmeta.tags" + * property, and looks for them inside the CrawlDatum object. If they exist, + * this will add it as an attribute inside the NutchDocument. + * + * @see IndexingFilter#filter + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + if (conf != null) + this.setConf(conf); + + if (urlMetaTags == null || doc == null) + return doc; + + for (String metatag : urlMetaTags) { + Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); + + if (metadata != null) + doc.add(metatag, metadata.toString()); + } + + return doc; + } + + /** Boilerplate */ + public Configuration getConf() { + return conf; + } + + /** + * handles conf assignment and pulls the value assignment from the + * "urlmeta.tags" property + */ + public void setConf(Configuration conf) { + this.conf = conf; - if (conf == null) - return; + if (conf == null) + return; - urlMetaTags = conf.getStrings(CONF_PROPERTY); - } + urlMetaTags = conf.getStrings(CONF_PROPERTY); + } }
Modified: nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java (original) +++ nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -43,7 +43,8 @@ import org.apache.nutch.scoring.ScoringF */ public class URLMetaScoringFilter extends Configured implements ScoringFilter { - private static final Logger LOG = LoggerFactory.getLogger(URLMetaScoringFilter.class); + private static final Logger LOG = LoggerFactory + .getLogger(URLMetaScoringFilter.class); private static final String CONF_PROPERTY = "urlmeta.tags"; private static String[] urlMetaTags; private Configuration conf; @@ -73,8 +74,8 @@ public class URLMetaScoringFilter extend if (metaFromParse == null) continue; - nextTarget.getValue().getMetaData().put(new Text(metatag), - new Text(metaFromParse)); + nextTarget.getValue().getMetaData() + .put(new Text(metatag), new Text(metaFromParse)); } } return adjust; Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -39,185 +39,180 @@ import org.apache.oro.text.regex.*; * </ul> */ public class BasicURLNormalizer extends Configured implements URLNormalizer { - public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); + public static final Logger LOG = LoggerFactory + .getLogger(BasicURLNormalizer.class); - private Perl5Compiler compiler = new Perl5Compiler(); - private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() { - protected Perl5Matcher initialValue() { - return new Perl5Matcher(); - } - }; - private final Rule relativePathRule; - private final Rule leadingRelativePathRule; - private final Rule currentPathRule; - private final Rule adjacentSlashRule; - - private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern.compile("/\\.?\\.?/"); - - private Configuration conf; - - public BasicURLNormalizer() { - try { - // this pattern tries to find spots like "/xx/../" in the url, which - // could be replaced by "/" xx consists of chars, different then "/" - // (slash) and needs to have at least one char different from "." - relativePathRule = new Rule(); - relativePathRule.pattern = (Perl5Pattern) - compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", - Perl5Compiler.READ_ONLY_MASK); - relativePathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like leading "/../" in the url, - // which could be replaced by "/" - leadingRelativePathRule = new Rule(); - leadingRelativePathRule.pattern = (Perl5Pattern) - compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); - leadingRelativePathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like "/./" in the url, - // which could be replaced by "/" - currentPathRule = new Rule(); - currentPathRule.pattern = (Perl5Pattern) - compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK); - currentPathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like "xx//yy" in the url, - // which could be replaced by a "/" - adjacentSlashRule = new Rule(); - adjacentSlashRule.pattern = (Perl5Pattern) - compiler.compile("/{2,}", Perl5Compiler.READ_ONLY_MASK); - adjacentSlashRule.substitution = new Perl5Substitution("/"); - - } catch (MalformedPatternException e) { - throw new RuntimeException(e); - } + private Perl5Compiler compiler = new Perl5Compiler(); + private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() { + protected Perl5Matcher initialValue() { + return new Perl5Matcher(); + } + }; + private final Rule relativePathRule; + private final Rule leadingRelativePathRule; + private final Rule currentPathRule; + private final Rule adjacentSlashRule; + + private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern + .compile("/\\.?\\.?/"); + + private Configuration conf; + + public BasicURLNormalizer() { + try { + // this pattern tries to find spots like "/xx/../" in the url, which + // could be replaced by "/" xx consists of chars, different then "/" + // (slash) and needs to have at least one char different from "." + relativePathRule = new Rule(); + relativePathRule.pattern = (Perl5Pattern) compiler.compile( + "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK); + relativePathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like leading "/../" in the url, + // which could be replaced by "/" + leadingRelativePathRule = new Rule(); + leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile( + "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); + leadingRelativePathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like "/./" in the url, + // which could be replaced by "/" + currentPathRule = new Rule(); + currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)", + Perl5Compiler.READ_ONLY_MASK); + currentPathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like "xx//yy" in the url, + // which could be replaced by a "/" + adjacentSlashRule = new Rule(); + adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}", + Perl5Compiler.READ_ONLY_MASK); + adjacentSlashRule.substitution = new Perl5Substitution("/"); + + } catch (MalformedPatternException e) { + throw new RuntimeException(e); } + } - public String normalize(String urlString, String scope) - throws MalformedURLException { - if ("".equals(urlString)) // permit empty - return urlString; - - urlString = urlString.trim(); // remove extra spaces - - URL url = new URL(urlString); - - String protocol = url.getProtocol(); - String host = url.getHost(); - int port = url.getPort(); - String file = url.getFile(); - - boolean changed = false; - - if (!urlString.startsWith(protocol)) // protocol was lowercased - changed = true; - - if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) { - - if (host != null) { - String newHost = host.toLowerCase(); // lowercase host - if (!host.equals(newHost)) { - host = newHost; - changed = true; - } - } - - if (port == url.getDefaultPort()) { // uses default port - port = -1; // so don't specify it - changed = true; - } - - if (file == null || "".equals(file)) { // add a slash - file = "/"; - changed = true; - } - - if (url.getRef() != null) { // remove the ref - changed = true; - } - - // check for unnecessary use of "/../" - String file2 = substituteUnnecessaryRelativePaths(file); - - if (!file.equals(file2)) { - changed = true; - file = file2; - } + public String normalize(String urlString, String scope) + throws MalformedURLException { + if ("".equals(urlString)) // permit empty + return urlString; - } + urlString = urlString.trim(); // remove extra spaces - if (changed) - urlString = new URL(protocol, host, port, file).toString(); + URL url = new URL(urlString); - return urlString; - } + String protocol = url.getProtocol(); + String host = url.getHost(); + int port = url.getPort(); + String file = url.getFile(); + + boolean changed = false; - private String substituteUnnecessaryRelativePaths(String file) { - - if (!hasNormalizablePattern.matcher(file).find()) - return file; - - String fileWorkCopy = file; - int oldLen = file.length(); - int newLen = oldLen - 1; - - // All substitutions will be done step by step, to ensure that certain - // constellations will be normalized, too - // - // For example: "/aa/bb/../../cc/../foo.html will be normalized in the - // following manner: - // "/aa/bb/../../cc/../foo.html" - // "/aa/../cc/../foo.html" - // "/cc/../foo.html" - // "/foo.html" - // - // The normalization also takes care of leading "/../", which will be - // replaced by "/", because this is a rather a sign of bad webserver - // configuration than of a wanted link. For example, urls like - // "http://www.foo.com/../" should return a http 404 error instead of - // redirecting to "http://www.foo.com". - // - Perl5Matcher matcher = (Perl5Matcher)matchers.get(); - - while (oldLen != newLen) { - // substitue first occurence of "/xx/../" by "/" - oldLen = fileWorkCopy.length(); - fileWorkCopy = Util.substitute - (matcher, relativePathRule.pattern, - relativePathRule.substitution, fileWorkCopy, 1); - - // remove leading "/../" - fileWorkCopy = Util.substitute - (matcher, leadingRelativePathRule.pattern, - leadingRelativePathRule.substitution, fileWorkCopy, 1); - - // remove unnecessary "/./" - fileWorkCopy = Util.substitute - (matcher, currentPathRule.pattern, - currentPathRule.substitution, fileWorkCopy, 1); - - - // collapse adjacent slashes with "/" - fileWorkCopy = Util.substitute - (matcher, adjacentSlashRule.pattern, - adjacentSlashRule.substitution, fileWorkCopy, 1); - - newLen = fileWorkCopy.length(); + if (!urlString.startsWith(protocol)) // protocol was lowercased + changed = true; + + if ("http".equals(protocol) || "https".equals(protocol) + || "ftp".equals(protocol)) { + + if (host != null) { + String newHost = host.toLowerCase(); // lowercase host + if (!host.equals(newHost)) { + host = newHost; + changed = true; } + } + + if (port == url.getDefaultPort()) { // uses default port + port = -1; // so don't specify it + changed = true; + } + + if (file == null || "".equals(file)) { // add a slash + file = "/"; + changed = true; + } + + if (url.getRef() != null) { // remove the ref + changed = true; + } + + // check for unnecessary use of "/../" + String file2 = substituteUnnecessaryRelativePaths(file); + + if (!file.equals(file2)) { + changed = true; + file = file2; + } - return fileWorkCopy; } + if (changed) + urlString = new URL(protocol, host, port, file).toString(); + + return urlString; + } + + private String substituteUnnecessaryRelativePaths(String file) { + + if (!hasNormalizablePattern.matcher(file).find()) + return file; - /** - * Class which holds a compiled pattern and its corresponding substition - * string. - */ - private static class Rule { - public Perl5Pattern pattern; - public Perl5Substitution substitution; + String fileWorkCopy = file; + int oldLen = file.length(); + int newLen = oldLen - 1; + + // All substitutions will be done step by step, to ensure that certain + // constellations will be normalized, too + // + // For example: "/aa/bb/../../cc/../foo.html will be normalized in the + // following manner: + // "/aa/bb/../../cc/../foo.html" + // "/aa/../cc/../foo.html" + // "/cc/../foo.html" + // "/foo.html" + // + // The normalization also takes care of leading "/../", which will be + // replaced by "/", because this is a rather a sign of bad webserver + // configuration than of a wanted link. For example, urls like + // "http://www.foo.com/../" should return a http 404 error instead of + // redirecting to "http://www.foo.com". + // + Perl5Matcher matcher = (Perl5Matcher) matchers.get(); + + while (oldLen != newLen) { + // substitue first occurence of "/xx/../" by "/" + oldLen = fileWorkCopy.length(); + fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern, + relativePathRule.substitution, fileWorkCopy, 1); + + // remove leading "/../" + fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern, + leadingRelativePathRule.substitution, fileWorkCopy, 1); + + // remove unnecessary "/./" + fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern, + currentPathRule.substitution, fileWorkCopy, 1); + + // collapse adjacent slashes with "/" + fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern, + adjacentSlashRule.substitution, fileWorkCopy, 1); + + newLen = fileWorkCopy.length(); } + return fileWorkCopy; + } + + /** + * Class which holds a compiled pattern and its corresponding substition + * string. + */ + private static class Rule { + public Perl5Pattern pattern; + public Perl5Substitution substitution; + } public void setConf(Configuration conf) { this.conf = conf; @@ -228,4 +223,3 @@ public class BasicURLNormalizer extends } } - Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * and dot segments in path. */ package org.apache.nutch.net.urlnormalizer.basic; + Modified: nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -26,16 +26,15 @@ import org.junit.Test; /** Unit tests for BasicURLNormalizer. */ public class TestBasicURLNormalizer { private BasicURLNormalizer normalizer; - + private Configuration conf; - + public TestBasicURLNormalizer() { normalizer = new BasicURLNormalizer(); conf = NutchConfiguration.create(); normalizer.setConf(conf); } - @Test public void testNormalizer() throws Exception { // check that leading and trailing spaces are removed @@ -58,59 +57,49 @@ public class TestBasicURLNormalizer { // check that references are removed normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); - // // check that encoding is normalized - // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + // // check that encoding is normalized + // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); // check that unnecessary "../" are removed - normalizeTest("http://foo.com/aa/./foo.html", - "http://foo.com/aa/foo.html" ); - normalizeTest("http://foo.com/aa/../", - "http://foo.com/" ); - normalizeTest("http://foo.com/aa/bb/../", - "http://foo.com/aa/"); - normalizeTest("http://foo.com/aa/..", - "http://foo.com/aa/.."); + normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/../", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/"); + normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/.."); normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", - "http://foo.com/aa/foo.html"); + "http://foo.com/aa/foo.html"); normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", - "http://foo.com/aa/cc/ee/foo.html"); - normalizeTest("http://foo.com/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/aa/../../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/aa/cc/ee/foo.html"); + normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", - "http://foo.com/" ); - normalizeTest("http://foo.com/../aa/foo.html", - "http://foo.com/aa/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/"); + normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/a..a/foo.html", - "http://foo.com/a..a/foo.html" ); - normalizeTest("http://foo.com/a..a/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/a..a/foo.html"); + normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/foo.foo/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/foo.html"); normalizeTest("http://foo.com//aa/bb/foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa//bb/foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa/bb//foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com//aa//bb//foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com////aa////bb////foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); } private void normalizeTest(String weird, String normal) throws Exception { - Assert.assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals(normal, + normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); } - + public static void main(String[] args) throws Exception { new TestBasicURLNormalizer().testNormalizer(); } Modified: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -35,34 +35,36 @@ import org.apache.nutch.plugin.PluginRep import org.apache.nutch.util.URLUtil; /** - * URL normalizer for mapping hosts to their desired form. It takes - * a simple text file as source in the format: - * + * URL normalizer for mapping hosts to their desired form. It takes a simple + * text file as source in the format: + * * example.org www.example.org - * - * mapping all URL's of example.org the the www sub-domain. It also - * allows for wildcards to be used to map all sub-domains to another - * host: - * + * + * mapping all URL's of example.org the the www sub-domain. It also allows for + * wildcards to be used to map all sub-domains to another host: + * * *.example.org www.example.org */ public class HostURLNormalizer implements URLNormalizer { private Configuration conf; - private static final Logger LOG = LoggerFactory.getLogger(HostURLNormalizer.class); + private static final Logger LOG = LoggerFactory + .getLogger(HostURLNormalizer.class); private static String attributeFile = null; private String hostsFile = null; - private static final HashMap<String,String> hostsMap = new HashMap<String,String>(); + private static final HashMap<String, String> hostsMap = new HashMap<String, String>(); - public HostURLNormalizer() {} + public HostURLNormalizer() { + } public HostURLNormalizer(String hostsFile) { this.hostsFile = hostsFile; } - private synchronized void readConfiguration(Reader configReader) throws IOException { + private synchronized void readConfiguration(Reader configReader) + throws IOException { if (hostsMap.size() > 0) { return; } @@ -92,8 +94,8 @@ public class HostURLNormalizer implement // get the extensions for domain urlfilter String pluginName = "urlnormalizer-host"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLNormalizer.class.getName()).getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLNormalizer.class.getName()).getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; if (extension.getDescriptor().getPluginId().equals(pluginName)) { @@ -110,13 +112,12 @@ public class HostURLNormalizer implement if (attributeFile != null) { if (LOG.isInfoEnabled()) { LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); + + " as " + attributeFile); } - } - else { + } else { if (LOG.isWarnEnabled()) { LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); + + pluginName); } } @@ -125,8 +126,7 @@ public class HostURLNormalizer implement String stringRules = conf.get("urlnormalizer.hosts.rules"); if (hostsFile != null) { file = hostsFile; - } - else if (attributeFile != null) { + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; @@ -140,13 +140,13 @@ public class HostURLNormalizer implement reader = new FileReader(file); } readConfiguration(reader); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } - public String normalize(String urlString, String scope) throws MalformedURLException { + public String normalize(String urlString, String scope) + throws MalformedURLException { String host = new URL(urlString).getHost(); // Test static hosts @@ -164,7 +164,7 @@ public class HostURLNormalizer implement String wildCardHost = new String(); // Add the tld to the buffer - hostBuffer.append(hostParts[hostParts.length -1]); + hostBuffer.append(hostParts[hostParts.length - 1]); for (int i = hostParts.length - 2; i > 0; i--) { // Prepend another sub domain Modified: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * configuration file. */ package org.apache.nutch.net.urlnormalizer.host; + Modified: nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -36,14 +36,22 @@ public class TestHostURLNormalizer { normalizer.setConf(conf); // Force www. sub domain when hitting link without sub domain - Assert.assertEquals("http://www.example.org/page.html", normalizer.normalize("http://example.org/page.html", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://www.example.org/page.html", + normalizer.normalize("http://example.org/page.html", + URLNormalizers.SCOPE_DEFAULT)); // Force no sub domain to www. URL's - Assert.assertEquals("http://example.net/path/to/something.html", normalizer.normalize("http://www.example.net/path/to/something.html", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.net/path/to/something.html", normalizer + .normalize("http://www.example.net/path/to/something.html", + URLNormalizers.SCOPE_DEFAULT)); // Force all sub domains to www. - Assert.assertEquals("http://example.com/?does=it&still=work", normalizer.normalize("http://example.com/?does=it&still=work", URLNormalizers.SCOPE_DEFAULT)); - Assert.assertEquals("http://example.com/buh", normalizer.normalize("http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); - Assert.assertEquals("http://example.com/blaat", normalizer.normalize("http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/?does=it&still=work", normalizer + .normalize("http://example.com/?does=it&still=work", + URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/buh", normalizer.normalize( + "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/blaat", normalizer.normalize( + "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); } } Modified: nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -24,15 +24,17 @@ import org.apache.nutch.net.URLNormalize /** * This URLNormalizer doesn't change urls. It is sometimes useful if for a given - * scope at least one normalizer must be defined but no transformations are required. + * scope at least one normalizer must be defined but no transformations are + * required. * * @author Andrzej Bialecki */ public class PassURLNormalizer implements URLNormalizer { private Configuration conf; - - public String normalize(String urlString, String scope) throws MalformedURLException { + + public String normalize(String urlString, String scope) + throws MalformedURLException { return urlString; } @@ -41,7 +43,7 @@ public class PassURLNormalizer implement } public void setConf(Configuration conf) { - this.conf = conf; + this.conf = conf; } } Modified: nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * one URL normalizer must be defined in any scope. */ package org.apache.nutch.net.urlnormalizer.pass; + Modified: nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -29,7 +29,7 @@ public class TestPassURLNormalizer { @Test public void testPassURLNormalizer() { Configuration conf = NutchConfiguration.create(); - + PassURLNormalizer normalizer = new PassURLNormalizer(); normalizer.setConf(conf); String url = "http://www.example.com/test/..//"; @@ -39,7 +39,7 @@ public class TestPassURLNormalizer { } catch (MalformedURLException mue) { Assert.fail(mue.toString()); } - + Assert.assertEquals(url, result); } } Modified: nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -33,18 +33,20 @@ import org.apache.nutch.plugin.PluginRep import org.apache.nutch.util.URLUtil; /** - * URL normalizer plugin for normalizing query strings but sorting - * query string parameters. Not sorting query strings can lead to large - * amounts of duplicate URL's such as ?a=x&b=y vs b=y&a=x. - * + * URL normalizer plugin for normalizing query strings but sorting query string + * parameters. Not sorting query strings can lead to large amounts of duplicate + * URL's such as ?a=x&b=y vs b=y&a=x. + * */ public class QuerystringURLNormalizer implements URLNormalizer { private Configuration conf; - private static final Logger LOG = LoggerFactory.getLogger(QuerystringURLNormalizer.class); + private static final Logger LOG = LoggerFactory + .getLogger(QuerystringURLNormalizer.class); - public QuerystringURLNormalizer() {} + public QuerystringURLNormalizer() { + } public Configuration getConf() { return conf; @@ -54,20 +56,21 @@ public class QuerystringURLNormalizer im this.conf = conf; } - public String normalize(String urlString, String scope) throws MalformedURLException { + public String normalize(String urlString, String scope) + throws MalformedURLException { URL url = new URL(urlString); - + String queryString = url.getQuery(); - + if (queryString == null) { return urlString; } - + List<String> queryStringParts = Arrays.asList(queryString.split("&")); Collections.sort(queryStringParts); - + StringBuilder sb = new StringBuilder(); - + sb.append(url.getProtocol()); sb.append("://"); sb.append(url.getHost()); @@ -82,7 +85,7 @@ public class QuerystringURLNormalizer im sb.append("#"); sb.append(url.getRef()); } - + return sb.toString(); } } Modified: nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * by permutations. */ package org.apache.nutch.net.urlnormalizer.querystring; + Modified: nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -31,12 +31,19 @@ public class TestQuerystringURLNormalize QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer(); normalizer.setConf(conf); - - assertEquals("http://example.com/?a=b&c=d", normalizer.normalize("http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com/a/b/c", normalizer.normalize("http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c", normalizer.normalize("http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize("http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize("http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT)); + + assertEquals("http://example.com/?a=b&c=d", normalizer.normalize( + "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/a/b/c", normalizer.normalize( + "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c", normalizer.normalize( + "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize( + "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", + normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", + URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize( + "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT)); } } Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -51,19 +51,23 @@ import org.xml.sax.InputSource; * Allows users to do regex substitutions on all/any URLs that are encountered, * which is useful for stripping session IDs from URLs. * - * <p>This class uses the <tt>urlnormalizer.regex.file</tt> property. - * It should be set to the file name of an xml file which should contain the - * patterns and substitutions to be done on encountered URLs. + * <p> + * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be + * set to the file name of an xml file which should contain the patterns and + * substitutions to be done on encountered URLs. + * </p> + * <p> + * This class also supports different rules depending on the scope. Please see + * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. * </p> - * <p>This class also supports different rules depending on the scope. Please see - * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.</p> * * @author Luke Baker * @author Andrzej Bialecki */ public class RegexURLNormalizer extends Configured implements URLNormalizer { - private static final Logger LOG = LoggerFactory.getLogger(RegexURLNormalizer.class); + private static final Logger LOG = LoggerFactory + .getLogger(RegexURLNormalizer.class); /** * Class which holds a compiled pattern and its corresponding substition @@ -75,19 +79,18 @@ public class RegexURLNormalizer extends public String substitution; } - private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = - new ThreadLocal<HashMap<String,List<Rule>>>() { - protected java.util.HashMap<String,java.util.List<Rule>> initialValue() { + private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() { + protected java.util.HashMap<String, java.util.List<Rule>> initialValue() { return new HashMap<String, List<Rule>>(); }; }; - + public HashMap<String, List<Rule>> getScopedRules() { return scopedRulesThreadLocal.get(); } - - private List<Rule> defaultRules; - + + private List<Rule> defaultRules; + private static final List<Rule> EMPTY_RULES = Collections.emptyList(); /** @@ -107,7 +110,7 @@ public class RegexURLNormalizer extends * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) - throws IOException, PatternSyntaxException { + throws IOException, PatternSyntaxException { super(conf); List<Rule> rules = readConfigurationFile(filename); if (rules != null) { @@ -117,7 +120,8 @@ public class RegexURLNormalizer extends public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; // the default constructor was called String filename = getConf().get("urlnormalizer.regex.file"); @@ -147,9 +151,10 @@ public class RegexURLNormalizer extends void setConfiguration(Reader reader, String scope) { List<Rule> rules = readConfiguration(reader); getScopedRules().put(scope, rules); - LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); + LOG.debug("Set config for scope '" + scope + "': " + rules.size() + + " rules."); } - + /** * This function does the replacements by iterating through all the regex * patterns. It accepts a string url as input and returns the altered string. @@ -190,7 +195,7 @@ public class RegexURLNormalizer extends } public String normalize(String urlString, String scope) - throws MalformedURLException { + throws MalformedURLException { return regexNormalize(urlString, scope); } @@ -207,17 +212,17 @@ public class RegexURLNormalizer extends return EMPTY_RULES; } } - + private List<Rule> readConfiguration(Reader reader) { List<Rule> rules = new ArrayList<Rule>(); try { // borrowed heavily from code in Configuration.java Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(new InputSource(reader)); + .parse(new InputSource(reader)); Element root = doc.getDocumentElement(); if ((!"regex-normalize".equals(root.getTagName())) - && (LOG.isErrorEnabled())) { + && (LOG.isErrorEnabled())) { LOG.error("bad conf file: top-level element not <regex-normalize>"); } NodeList regexes = root.getChildNodes(); @@ -240,7 +245,7 @@ public class RegexURLNormalizer extends if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) patternValue = ((Text) field.getFirstChild()).getData(); if ("substitution".equals(field.getTagName()) - && field.hasChildNodes()) + && field.hasChildNodes()) subValue = ((Text) field.getFirstChild()).getData(); if (!field.hasChildNodes()) subValue = ""; @@ -251,7 +256,8 @@ public class RegexURLNormalizer extends rule.pattern = Pattern.compile(patternValue); } catch (PatternSyntaxException e) { if (LOG.isErrorEnabled()) { - LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e); + LOG.error("skipped rule: " + patternValue + " -> " + subValue + + " : invalid regular expression pattern: " + e); } continue; } @@ -265,13 +271,14 @@ public class RegexURLNormalizer extends } return EMPTY_RULES; } - if (rules.size() == 0) return EMPTY_RULES; + if (rules.size() == 0) + return EMPTY_RULES; return rules; } /** Spits out patterns and substitutions that are in the configuration file. */ public static void main(String args[]) throws PatternSyntaxException, - IOException { + IOException { RegexURLNormalizer normalizer = new RegexURLNormalizer(); normalizer.setConf(NutchConfiguration.create()); HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules(); @@ -290,9 +297,10 @@ public class RegexURLNormalizer extends Iterator<String> it = scopedRules.keySet().iterator(); while (it.hasNext()) { String scope = it.next(); - if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue; + if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) + continue; System.out.println("* Rules for '" + scope + "' scope:"); - i = ((List<Rule>)scopedRules.get(scope)).iterator(); + i = ((List<Rule>) scopedRules.get(scope)).iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); System.out.print(" " + r.pattern.pattern() + " -> "); @@ -303,10 +311,12 @@ public class RegexURLNormalizer extends if (args.length > 0) { System.out.println("\n---------- Normalizer test -----------"); String scope = URLNormalizers.SCOPE_DEFAULT; - if (args.length > 1) scope = args[1]; + if (args.length > 1) + scope = args[1]; System.out.println("Scope: " + scope); System.out.println("Input url: '" + args[0] + "'"); - System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'"); + System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + + "'"); } System.exit(0); } Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * ({@link java.util.regex.Pattern}). */ package org.apache.nutch.net.urlnormalizer.regex; + Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Thu Jan 29 05:38:59 2015 @@ -36,24 +36,27 @@ import org.apache.nutch.util.NutchConfig /** Unit tests for RegexUrlNormalizer. */ public class TestRegexURLNormalizer { - private static final Logger LOG = LoggerFactory.getLogger(TestRegexURLNormalizer.class); - + private static final Logger LOG = LoggerFactory + .getLogger(TestRegexURLNormalizer.class); + private RegexURLNormalizer normalizer; private Configuration conf; private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>(); - + // This system property is defined in ./src/plugin/build-plugin.xml private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. - + public TestRegexURLNormalizer() throws IOException { normalizer = new RegexURLNormalizer(); conf = NutchConfiguration.create(); normalizer.setConf(conf); File[] configs = new File(sampleDir).listFiles(new FileFilter() { public boolean accept(File f) { - if (f.getName().endsWith(".xml") && f.getName().startsWith("regex-normalize-")) + if (f.getName().endsWith(".xml") + && f.getName().startsWith("regex-normalize-")) return true; return false; } @@ -74,8 +77,8 @@ public class TestRegexURLNormalizer { @Test public void testNormalizerDefault() throws Exception { - normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT), - URLNormalizers.SCOPE_DEFAULT); + normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT), + URLNormalizers.SCOPE_DEFAULT); } @Test @@ -83,33 +86,36 @@ public class TestRegexURLNormalizer { Iterator<String> it = testData.keySet().iterator(); while (it.hasNext()) { String scope = it.next(); - normalizeTest((NormalizedURL[])testData.get(scope), scope); + normalizeTest((NormalizedURL[]) testData.get(scope), scope); } } - private void normalizeTest(NormalizedURL[] urls, String scope) throws Exception { + private void normalizeTest(NormalizedURL[] urls, String scope) + throws Exception { for (int i = 0; i < urls.length; i++) { String url = urls[i].url; String normalized = normalizer.normalize(urls[i].url, scope); String expected = urls[i].expectedURL; - LOG.info("scope: " + scope + " url: " + url + " | normalized: " + normalized + " | expected: " + expected); + LOG.info("scope: " + scope + " url: " + url + " | normalized: " + + normalized + " | expected: " + expected); Assert.assertEquals(urls[i].expectedURL, normalized); } } - + private void bench(int loops, String scope) { long start = System.currentTimeMillis(); try { - NormalizedURL[] expected = (NormalizedURL[])testData.get(scope); - if (expected == null) return; + NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); + if (expected == null) + return; for (int i = 0; i < loops; i++) { normalizeTest(expected, scope); } } catch (Exception e) { Assert.fail(e.toString()); } - LOG.info("bench time (" + loops + ") " + - (System.currentTimeMillis() - start) + "ms"); + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); } private static class NormalizedURL { @@ -126,17 +132,18 @@ public class TestRegexURLNormalizer { private NormalizedURL[] readTestFile(String scope) throws IOException { File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); @SuppressWarnings("resource") - BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); + BufferedReader in = new BufferedReader(new InputStreamReader( + new FileInputStream(f), "UTF-8")); List<NormalizedURL> list = new ArrayList<NormalizedURL>(); String line; - while((line = in.readLine()) != null) { - if ( line.trim().length() == 0 || - line.startsWith("#") || - line.startsWith(" ")) continue; + while ((line = in.readLine()) != null) { + if (line.trim().length() == 0 || line.startsWith("#") + || line.startsWith(" ")) + continue; list.add(new NormalizedURL(line)); } return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); - } + } public static void main(String[] args) throws Exception { if (args.length == 0) { @@ -150,7 +157,8 @@ public class TestRegexURLNormalizer { if (args[i].equals("-bench")) { bench = true; iter = Integer.parseInt(args[++i]); - } else scope = args[i]; + } else + scope = args[i]; } if (scope == null) { System.err.println("Missing required scope name."); @@ -161,11 +169,12 @@ public class TestRegexURLNormalizer { System.exit(-1); } TestRegexURLNormalizer test = new TestRegexURLNormalizer(); - NormalizedURL[] urls = (NormalizedURL[])test.testData.get(scope); + NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope); if (urls == null) { - LOG.warn("Missing test data for scope '" + scope + "', using default scope."); + LOG.warn("Missing test data for scope '" + scope + + "', using default scope."); scope = URLNormalizers.SCOPE_DEFAULT; - urls = (NormalizedURL[])test.testData.get(scope); + urls = (NormalizedURL[]) test.testData.get(scope); } if (bench) { test.bench(iter, scope); @@ -174,6 +183,4 @@ public class TestRegexURLNormalizer { } } - - } Modified: nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java Thu Jan 29 05:38:59 2015 @@ -35,19 +35,26 @@ import org.slf4j.LoggerFactory; /** * Emulate a continuous crawl for one URL. - * + * */ public class ContinuousCrawlTestUtil extends TestCase { - private static final Logger LOG = LoggerFactory.getLogger(ContinuousCrawlTestUtil.class); + private static final Logger LOG = LoggerFactory + .getLogger(ContinuousCrawlTestUtil.class); protected static Text dummyURL = new Text("http://nutch.apache.org/"); protected static Configuration defaultConfig = CrawlDBTestUtil .createConfiguration(); - protected long interval = FetchSchedule.SECONDS_PER_DAY*1000; // (default) launch crawler every day - protected long duration = 2*365L*FetchSchedule.SECONDS_PER_DAY*1000L; // run for two years + protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000; // (default) + // launch + // crawler + // every day + protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L; // run + // for + // two + // years protected Configuration configuration; private FetchSchedule schedule; @@ -62,7 +69,7 @@ public class ContinuousCrawlTestUtil ext protected Content content = new Content(); { - byte[] data = {'n', 'u', 't', 'c', 'h'}; + byte[] data = { 'n', 'u', 't', 'c', 'h' }; content.setContent(data); } @@ -89,17 +96,17 @@ public class ContinuousCrawlTestUtil ext /** set the interval the crawl is relaunched (default: every day) */ protected void setInterval(int seconds) { - interval = seconds*1000L; + interval = seconds * 1000L; } /** set the duration of the continuous crawl (default = 2 years) */ protected void setDuraction(int seconds) { - duration = seconds*1000L; + duration = seconds * 1000L; } /** * default fetch action: set status and time - * + * * @param datum * CrawlDatum to fetch * @param currentTime @@ -124,19 +131,20 @@ public class ContinuousCrawlTestUtil ext * change content to force a changed signature */ protected void changeContent() { - byte [] data = Arrays.copyOf(content.getContent(), content.getContent().length+1); + byte[] data = Arrays.copyOf(content.getContent(), + content.getContent().length + 1); data[content.getContent().length] = '2'; // append one byte content.setContent(data); LOG.info("document content changed"); } - /** * default parse action: add signature if successfully fetched - * + * * @param fetchDatum * fetch datum - * @return list of all datums resulting from parse (status: signature, linked, parse_metadata) + * @return list of all datums resulting from parse (status: signature, linked, + * parse_metadata) */ protected List<CrawlDatum> parse(CrawlDatum fetchDatum) { List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0); @@ -150,7 +158,7 @@ public class ContinuousCrawlTestUtil ext /** * default implementation to check the result state - * + * * @param datum * the CrawlDatum to be checked * @return true if the check succeeds @@ -166,7 +174,7 @@ public class ContinuousCrawlTestUtil ext * <p> * A loop emulates a continuous crawl launched in regular intervals (see * {@link #setInterval(int)} over a longer period ({@link #setDuraction(int)}. - * + * * <ul> * <li>every "round" emulates * <ul> @@ -177,11 +185,11 @@ public class ContinuousCrawlTestUtil ext * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)}) * </ul> * </p> - * + * * @param maxErrors * (if > 0) continue crawl even if the checked CrawlDatum is not * correct, but stop after max. number of errors - * + * * @return false if a check of CrawlDatum failed, true otherwise */ protected boolean run(int maxErrors) { @@ -205,9 +213,11 @@ public class ContinuousCrawlTestUtil ext long lastFetchTime = -1; boolean ok = true; // record failure but keep going CrawlDatum fetchDatum = new CrawlDatum(); - /* Keep copies because CrawlDbReducer.reduce() - * and FetchSchedule.shouldFetch() may alter the references. - * Copies are used for verbose logging in case of an error. */ + /* + * Keep copies because CrawlDbReducer.reduce() and + * FetchSchedule.shouldFetch() may alter the references. Copies are used for + * verbose logging in case of an error. + */ CrawlDatum copyDbDatum = new CrawlDatum(); CrawlDatum copyFetchDatum = new CrawlDatum(); CrawlDatum afterShouldFetch = new CrawlDatum(); Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Thu Jan 29 05:38:59 2015 @@ -36,10 +36,10 @@ import org.mortbay.jetty.bio.SocketConne import org.mortbay.jetty.handler.ContextHandler; import org.mortbay.jetty.handler.ResourceHandler; - public class CrawlDBTestUtil { - private static final Logger LOG = LoggerFactory.getLogger(CrawlDBTestUtil.class); + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDBTestUtil.class); /** * Creates synthetic crawldb @@ -52,12 +52,12 @@ public class CrawlDBTestUtil { * urls to be inserted, objects are of type URLCrawlDatum * @throws Exception */ - public static void createCrawlDb(Configuration conf, FileSystem fs, Path crawldb, List<URLCrawlDatum> init) - throws Exception { + public static void createCrawlDb(Configuration conf, FileSystem fs, + Path crawldb, List<URLCrawlDatum> init) throws Exception { LOG.trace("* creating crawldb: " + crawldb); Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME); - MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir, "part-00000") - .toString(), Text.class, CrawlDatum.class); + MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir, + "part-00000").toString(), Text.class, CrawlDatum.class); Iterator<URLCrawlDatum> it = init.iterator(); while (it.hasNext()) { URLCrawlDatum row = it.next(); @@ -69,25 +69,25 @@ public class CrawlDBTestUtil { /** * For now we need to manually construct our Configuration, because we need to - * override the default one and it is currently not possible to use dynamically - * set values. + * override the default one and it is currently not possible to use + * dynamically set values. * * @return * @deprecated Use {@link #createConfiguration()} instead */ @Deprecated - public static Configuration create(){ + public static Configuration create() { return createConfiguration(); } /** * For now we need to manually construct our Configuration, because we need to - * override the default one and it is currently not possible to use dynamically - * set values. + * override the default one and it is currently not possible to use + * dynamically set values. * * @return */ - public static Configuration createConfiguration(){ + public static Configuration createConfiguration() { Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("crawl-tests.xml"); @@ -108,32 +108,36 @@ public class CrawlDBTestUtil { /** * Generate seedlist - * @throws IOException + * + * @throws IOException */ - public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls) throws IOException{ + public static void generateSeedList(FileSystem fs, Path urlPath, + List<String> urls) throws IOException { generateSeedList(fs, urlPath, urls, new ArrayList<String>()); } - + /** * Generate seedlist - * @throws IOException + * + * @throws IOException */ - public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls, List<String>metadata) throws IOException{ + public static void generateSeedList(FileSystem fs, Path urlPath, + List<String> urls, List<String> metadata) throws IOException { FSDataOutputStream out; - Path file=new Path(urlPath,"urls.txt"); + Path file = new Path(urlPath, "urls.txt"); fs.mkdirs(urlPath); - out=fs.create(file); - - Iterator<String> urls_i=urls.iterator(); - Iterator<String> metadata_i=metadata.iterator(); - + out = fs.create(file); + + Iterator<String> urls_i = urls.iterator(); + Iterator<String> metadata_i = metadata.iterator(); + String url; String md; - while(urls_i.hasNext()){ - url=urls_i.next(); + while (urls_i.hasNext()) { + url = urls_i.next(); out.writeBytes(url); - + if (metadata_i.hasNext()) { md = metadata_i.next(); out.writeBytes(md); @@ -141,19 +145,22 @@ public class CrawlDBTestUtil { out.writeBytes("\n"); } - + out.flush(); out.close(); } - + /** * Creates a new JettyServer with one static root context * - * @param port port to listen to - * @param staticContent folder where static content lives - * @throws UnknownHostException + * @param port + * port to listen to + * @param staticContent + * folder where static content lives + * @throws UnknownHostException */ - public static Server getServer(int port, String staticContent) throws UnknownHostException{ + public static Server getServer(int port, String staticContent) + throws UnknownHostException { Server webServer = new org.mortbay.jetty.Server(); SocketConnector listener = new SocketConnector(); listener.setPort(port); Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java Thu Jan 29 05:38:59 2015 @@ -45,7 +45,8 @@ import org.slf4j.LoggerFactory; */ public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> { - private static final Logger LOG = LoggerFactory.getLogger(CrawlDbUpdateUtil.class); + private static final Logger LOG = LoggerFactory + .getLogger(CrawlDbUpdateUtil.class); private T reducer; @@ -74,9 +75,8 @@ public class CrawlDbUpdateUtil<T extends } /** - * Dummy reporter which does nothing and does not return null for - * getCounter() - * + * Dummy reporter which does nothing and does not return null for getCounter() + * * @see {@link Reporter#NULL} */ private class DummyReporter implements Reporter { @@ -117,8 +117,10 @@ public class CrawlDbUpdateUtil<T extends * run * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} * and return the CrawlDatum(s) which would have been written into CrawlDb - * @param values list of input CrawlDatums - * @return list of resulting CrawlDatum(s) in CrawlDb + * + * @param values + * list of input CrawlDatums + * @return list of resulting CrawlDatum(s) in CrawlDb */ public List<CrawlDatum> update(List<CrawlDatum> values) { if (values == null || values.size() == 0) { @@ -138,12 +140,14 @@ public class CrawlDbUpdateUtil<T extends * run * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} * and return the CrawlDatum(s) which would have been written into CrawlDb - * @param dbDatum previous CrawlDatum in CrawlDb - * @param fetchDatum CrawlDatum resulting from fetching - * @return list of resulting CrawlDatum(s) in CrawlDb + * + * @param dbDatum + * previous CrawlDatum in CrawlDb + * @param fetchDatum + * CrawlDatum resulting from fetching + * @return list of resulting CrawlDatum(s) in CrawlDb */ - public List<CrawlDatum> update(CrawlDatum dbDatum, - CrawlDatum fetchDatum) { + public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) { List<CrawlDatum> values = new ArrayList<CrawlDatum>(); if (dbDatum != null) values.add(dbDatum); Modified: nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java Thu Jan 29 05:38:59 2015 @@ -21,12 +21,12 @@ import org.apache.hadoop.io.IntWritable; public class DummyWritable extends IntWritable { - public DummyWritable() { + public DummyWritable() { - } + } - public DummyWritable(int i) { - super(i); - } + public DummyWritable(int i) { + super(i); + } } Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java Thu Jan 29 05:38:59 2015 @@ -13,11 +13,12 @@ import org.slf4j.LoggerFactory; public class TODOTestCrawlDbStates extends TestCrawlDbStates { - private static final Logger LOG = LoggerFactory.getLogger(TODOTestCrawlDbStates.class); + private static final Logger LOG = LoggerFactory + .getLogger(TODOTestCrawlDbStates.class); /** - * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max is reached. - * Retry counter has to be reset appropriately. + * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max + * is reached. Retry counter has to be reset appropriately. */ @Test public void testCrawlDbReducerPageRetrySchedule() { @@ -86,8 +87,7 @@ public class TODOTestCrawlDbStates exten * <p> * Problem: documents not modified for a longer time are fetched in every * cycle because of an error in the SYNC_DELTA calculation of - * {@link AdaptiveFetchSchedule}. - * <br> + * {@link AdaptiveFetchSchedule}. <br> * The next fetch time should always be in the future, never in the past. * </p> */ @@ -95,15 +95,15 @@ public class TODOTestCrawlDbStates exten public void testAdaptiveFetchScheduleSyncDelta() { LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule"); Configuration conf = CrawlDBTestUtil.createConfiguration(); - conf.setLong("db.fetch.interval.default", 172800); // 2 days - conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day + conf.setLong("db.fetch.interval.default", 172800); // 2 days + conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days - conf.setLong("db.fetch.interval.max", 604800); // 7 days + conf.setLong("db.fetch.interval.max", 604800); // 7 days conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule"); ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime( conf); - crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY/3); + crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3); if (!crawlUtil.run(100)) { fail("failed: sync_delta calculation with AdaptiveFetchSchedule"); } @@ -150,10 +150,10 @@ public class TODOTestCrawlDbStates exten // next fetch time is in less than one minute // (critical: Nutch can hardly be so fast) LOG.error("Less then one minute until next fetch: " + result); - } + } // Next fetch time should be within min. and max. (tolerance: 60 sec.) - if (secondsUntilNextFetch+60 < minInterval - || secondsUntilNextFetch-60 > maxInterval) { + if (secondsUntilNextFetch + 60 < minInterval + || secondsUntilNextFetch - 60 > maxInterval) { LOG.error("Interval until next fetch time (" + TimingUtil.elapsedTime(fetchTime, result.getFetchTime()) + ") is not within min. and max. interval: " + result); Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java Thu Jan 29 05:38:59 2015 @@ -61,19 +61,19 @@ public class TestAdaptiveFetchSchedule e Text url = new Text("http://www.example.com"); changed = FetchSchedule.STATUS_UNKNOWN; - fs.setFetchSchedule(url, p, p.getFetchTime(), - p.getModifiedTime(), curTime, lastModified, changed); + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); validateFetchInterval(changed, p.getFetchInterval()); changed = FetchSchedule.STATUS_MODIFIED; - fs.setFetchSchedule(url, p, p.getFetchTime(), - p.getModifiedTime(), curTime, lastModified, changed); + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); validateFetchInterval(changed, p.getFetchInterval()); p.setFetchInterval(interval); changed = FetchSchedule.STATUS_NOTMODIFIED; - fs.setFetchSchedule(url, p, p.getFetchTime(), - p.getModifiedTime(), curTime, lastModified, changed); + fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime, + lastModified, changed); validateFetchInterval(changed, p.getFetchInterval()); } Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Thu Jan 29 05:38:59 2015 @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + package org.apache.nutch.crawl; import java.io.IOException; @@ -34,9 +34,9 @@ import org.junit.Before; import org.junit.Test; /** - * CrawlDbFiltering test which tests for correct, error free url - * normalization when the CrawlDB includes urls with <code>DB GONE</code> status - * and <code>CRAWLDB_PURGE_404</code> is set to true. + * CrawlDbFiltering test which tests for correct, error free url normalization + * when the CrawlDB includes urls with <code>DB GONE</code> status and + * <code>CRAWLDB_PURGE_404</code> is set to true. * * @author lufeng */ @@ -68,27 +68,27 @@ public class TestCrawlDbFilter { /** * Test url404Purging - * + * * @throws Exception */ @Test public void testUrl404Purging() throws Exception { // create a CrawlDatum with DB GONE status ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); - list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum( - CrawlDatum.STATUS_DB_GONE, 0, 0.0f))); - list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum( - CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f))); - list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum( - CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f))); + list.add(new URLCrawlDatum(new Text("http://www.example.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f))); + list.add(new URLCrawlDatum(new Text("http://www.example1.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f))); + list.add(new URLCrawlDatum(new Text("http://www.example2.com"), + new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f))); dbDir = new Path(testdir, "crawldb"); - newCrawlDb = new Path(testdir,"newcrawldb"); + newCrawlDb = new Path(testdir, "newcrawldb"); // create crawldb CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list); // set CRAWLDB_PURGE_404 to true - conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true); - conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true); - conf.setBoolean(CrawlDbFilter.URL_FILTERING,false); + conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true); + conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true); + conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); conf.setInt("urlnormalizer.loop.count", 2); JobConf job = new NutchJob(conf); job.setJobName("Test CrawlDbFilter"); @@ -105,8 +105,7 @@ public class TestCrawlDbFilter { job.setOutputValueClass(CrawlDatum.class); JobClient.runJob(job); - Path fetchlist = new Path(new Path(newCrawlDb, - "part-00000"), "data"); + Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data"); ArrayList<URLCrawlDatum> l = readContents(fetchlist); @@ -116,11 +115,14 @@ public class TestCrawlDbFilter { /** * Read contents of fetchlist. - * @param fetchlist path to Generated fetchlist + * + * @param fetchlist + * path to Generated fetchlist * @return Generated {@link URLCrawlDatum} objects * @throws IOException */ - private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException { + private ArrayList<URLCrawlDatum> readContents(Path fetchlist) + throws IOException { // verify results SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf); Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original) +++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Thu Jan 29 05:38:59 2015 @@ -34,18 +34,15 @@ import org.junit.Before; import org.junit.Test; public class TestCrawlDbMerger { - private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class.getName()); - + private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class + .getName()); + String url10 = "http://example.com/"; String url11 = "http://example.com/foo"; String url20 = "http://example.com/"; String url21 = "http://example.com/bar"; - String[] urls_expected = new String[] { - url10, - url11, - url21 - }; - + String[] urls_expected = new String[] { url10, url11, url21 }; + TreeSet<String> init1 = new TreeSet<String>(); TreeSet<String> init2 = new TreeSet<String>(); HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>(); @@ -54,7 +51,7 @@ public class TestCrawlDbMerger { FileSystem fs; Path testDir; CrawlDbReader reader; - + @Before public void setUp() throws Exception { init1.add(url10); @@ -81,20 +78,21 @@ public class TestCrawlDbMerger { expected.put(url21, cd2); conf = NutchConfiguration.create(); fs = FileSystem.get(conf); - testDir = new Path("test-crawldb-" + - new java.util.Random().nextInt()); + testDir = new Path("test-crawldb-" + new java.util.Random().nextInt()); fs.mkdirs(testDir); } - + @After public void tearDown() { try { if (fs.exists(testDir)) fs.delete(testDir, true); - } catch (Exception e) { } + } catch (Exception e) { + } try { reader.close(); - } catch (Exception e) { } + } catch (Exception e) { + } } @Test @@ -106,7 +104,7 @@ public class TestCrawlDbMerger { createCrawlDb(conf, fs, crawldb2, init2, cd2); CrawlDbMerger merger = new CrawlDbMerger(conf); LOG.fine("* merging crawldbs to " + output); - merger.merge(output, new Path[]{crawldb1, crawldb2}, false, false); + merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false); LOG.fine("* reading crawldb: " + output); reader = new CrawlDbReader(); String crawlDb = output.toString(); @@ -127,11 +125,13 @@ public class TestCrawlDbMerger { reader.close(); fs.delete(testDir, true); } - - private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, TreeSet<String> init, CrawlDatum cd) throws Exception { + + private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, + TreeSet<String> init, CrawlDatum cd) throws Exception { LOG.fine("* creating crawldb: " + crawldb); Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME); - MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class); + MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, + "part-00000").toString(), Text.class, CrawlDatum.class); Iterator<String> it = init.iterator(); while (it.hasNext()) { String key = it.next();
