Author: snagel
Date: Mon May 12 19:39:43 2014
New Revision: 1594071

URL: http://svn.apache.org/r1594071
Log:
NUTCH-1752 Cache robots.txt rules per protocol:host:port
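As context for this change, here is a minimal, self-contained sketch (hypothetical demo code, not part of the commit) that mirrors the new getCacheKey() logic in the diff below and shows why the port must be part of the cache key: under the old protocol:host key, URLs on different ports of the same host would collide.

import java.net.URL;

/* Hypothetical demo, not part of the commit: mirrors the getCacheKey()
 * logic added below and prints the keys for a few sample URLs. */
public class CacheKeyDemo {

  static String getCacheKey(URL url) {
    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
    String host = url.getHost().toLowerCase();         // normalize to lower case
    int port = url.getPort();
    if (port == -1) {
      port = url.getDefaultPort(); // e.g. 80 for http, 443 for https
    }
    return protocol + ":" + host + ":" + port;
  }

  public static void main(String[] args) throws Exception {
    // Different ports now get different keys (the old protocol:host key
    // mapped both URLs to "http:example.com"):
    System.out.println(getCacheKey(new URL("http://example.com/page.html")));
    // -> http:example.com:80
    System.out.println(getCacheKey(new URL("http://example.com:8080/page.html")));
    // -> http:example.com:8080
    // An explicit default port and an upper-case host normalize to the same key:
    System.out.println(getCacheKey(new URL("http://EXAMPLE.com:80/other.html")));
    // -> http:example.com:80
  }
}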
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
 * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
 
 * NUTCH-1182 fetcher to log hung threads (snagel)

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
     allowForbidden = conf.getBoolean("http.robots.403.allow", false);
   }
 
+  /** Compose a unique key to store and access robot rules in the cache for a given URL */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+    String host = url.getHost().toLowerCase();         // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+    /* Robot rules apply only to the host, protocol, and port where the
+     * robots.txt is hosted (cf. NUTCH-1752). Consequently, the cache key is
+     * composed of all three. */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
   /**
-   * The hosts for which the caching of robots rules is yet to be done,
-   * it sends a Http request to the host corresponding to the {@link URL}
-   * passed, gets robots file, parses the rules and caches the rules object
-   * to avoid re-work in future.
+   * Get the rules from robots.txt which apply to the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, an HTTP request is sent to fetch
+   * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed
+   * and the rules are cached to avoid re-fetching and re-parsing.
   *
-   * @param http The {@link Protocol} object
-   * @param url URL
-   *
-   * @return robotRules A {@link BaseRobotRules} object for the rules
+   * @param http
+   *          The {@link Protocol} object
+   * @param url
+   *          URL which the robots.txt applies to
+   *
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
-    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
-    String host = url.getHost().toLowerCase(); // normalize to lower case
-
-    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
 
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
       }
 
       if (cacheRule) {
-        CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
-        if (redir != null && !redir.getHost().equals(host)) {
+        CACHE.put(cacheKey, robotRules); // cache rules for host
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
-          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+          CACHE.put(getCacheKey(redir), robotRules);
         }
       }
     }

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
 * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
 
 * NUTCH-1766 Generator to unlock crawldb and remove tempdir if generate job fails (Diaa via jnioche)

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
     allowForbidden = conf.getBoolean("http.robots.403.allow", false);
   }
 
+  /** Compose a unique key to store and access robot rules in the cache for a given URL */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+    String host = url.getHost().toLowerCase();         // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+    /* Robot rules apply only to the host, protocol, and port where the
+     * robots.txt is hosted (cf. NUTCH-1752). Consequently, the cache key is
+     * composed of all three. */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
   /**
-   * The hosts for which the caching of robots rules is yet to be done,
-   * it sends a Http request to the host corresponding to the {@link URL}
-   * passed, gets robots file, parses the rules and caches the rules object
-   * to avoid re-work in future.
+   * Get the rules from robots.txt which apply to the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, an HTTP request is sent to fetch
+   * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed
+   * and the rules are cached to avoid re-fetching and re-parsing.
   *
-   * @param http The {@link Protocol} object
-   * @param url URL
-   *
-   * @return robotRules A {@link BaseRobotRules} object for the rules
+   * @param http
+   *          The {@link Protocol} object
+   * @param url
+   *          URL which the robots.txt applies to
+   *
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
-    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
-    String host = url.getHost().toLowerCase(); // normalize to lower case
-
-    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
 
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
       }
 
      if (cacheRule) {
-        CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
-        if (redir != null && !redir.getHost().equals(host)) {
+        CACHE.put(cacheKey, robotRules); // cache rules for host
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
-          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+          CACHE.put(getCacheKey(redir), robotRules);
         }
       }
     }
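To make the redirect handling above concrete, here is a hypothetical, self-contained sketch (a plain HashMap and a String stub stand in for Nutch's CACHE and BaseRobotRules) of how the patched code caches the same rules under both keys when the robots.txt fetch redirects to another host:

import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/* Hypothetical sketch, not Nutch code: illustrates the patched caching
 * behavior when fetching robots.txt follows a cross-host redirect. */
public class RobotsCacheSketch {

  // Stands in for Nutch's CACHE; the String value stands in for BaseRobotRules.
  static final Map<String, String> CACHE = new HashMap<>();

  static String getCacheKey(URL url) {
    int port = url.getPort() == -1 ? url.getDefaultPort() : url.getPort();
    return url.getProtocol().toLowerCase() + ":"
        + url.getHost().toLowerCase() + ":" + port;
  }

  public static void main(String[] args) throws Exception {
    URL url = new URL("http://example.com/");                 // host being crawled
    URL redir = new URL("http://www.example.com/robots.txt"); // assumed redirect target
    String robotRules = "parsed-rules-stub";                  // assume fetch and parse succeeded

    String cacheKey = getCacheKey(url);
    CACHE.put(cacheKey, robotRules); // cache rules for the original host
    if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
      // as in the patch: cache also under the redirect target's key
      CACHE.put(getCacheKey(redir), robotRules);
    }

    System.out.println(CACHE.keySet());
    // e.g. [http:www.example.com:80, http:example.com:80] (order not guaranteed)
  }
}

A later lookup for either host then hits the cache, so robots.txt is neither re-fetched nor re-parsed.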