Author: ab
Date: Mon Aug 14 08:08:14 2006
New Revision: 431366

URL: http://svn.apache.org/viewvc?rev=431366&view=rev
Log:
Apply patches in rev 431364.

Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt lucene/nutch/branches/branch-0.8/conf/nutch-default.xml lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?rev=431366&r1=431365&r2=431366&view=diff ============================================================================== --- lucene/nutch/branches/branch-0.8/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.8/CHANGES.txt Mon Aug 14 08:08:14 2006 @@ -10,6 +10,9 @@ 3. NUTCH-344 - Fix for thread blocking issue (Greg Kim via siren) + 4. Optionally skip pages with abnormally large Crawl-Delay values + (Dennis Kubes via ab) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/branches/branch-0.8/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/conf/nutch-default.xml?rev=431366&r1=431365&r2=431366&view=diff ============================================================================== --- lucene/nutch/branches/branch-0.8/conf/nutch-default.xml (original) +++ lucene/nutch/branches/branch-0.8/conf/nutch-default.xml Mon Aug 14 08:08:14 2006 @@ -380,6 +380,18 @@ </property> <property> + <name>fetcher.max.crawl.delay</name> + <value>30</value> + <description> + If the Crawl-Delay in robots.txt is set to greater than this value (in + seconds) then the fetcher will skip this page, generating an error report. + If set to -1 the fetcher will never skip such pages and will wait the + amount of time retrieved from robots.txt Crawl-Delay, however long that + might be. 
+ </description> +</property> + +<property> <name>fetcher.threads.fetch</name> <value>10</value> <description>The number of FetcherThreads the fetcher should use. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=431366&r1=431365&r2=431366&view=diff ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Aug 14 08:08:14 2006 @@ -199,6 +199,7 @@ case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: + case ProtocolStatus.WOULDBLOCK: case ProtocolStatus.NOTMODIFIED: output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE); break; Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=431366&r1=431365&r2=431366&view=diff ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java Mon Aug 14 08:08:14 2006 @@ -25,7 +25,7 @@ import org.apache.hadoop.io.WritableUtils; /** - * @author Andrzej Bialecki <[EMAIL PROTECTED]> + * @author Andrzej Bialecki */ public class ProtocolStatus extends VersionedWritable { @@ -55,11 +55,15 @@ /** Access denied by robots.txt rules. */ public static final int ROBOTS_DENIED = 18; /** Too many redirects. */ - public static final int REDIR_EXCEEDED = 19; + public static final int REDIR_EXCEEDED = 19; /** Not fetching. 
*/ public static final int NOTFETCHING = 20; /** Unchanged since the last fetch. */ public static final int NOTMODIFIED = 21; + /** Request was refused by protocol plugins, because it would block. + * The expected number of milliseconds to wait before retry may be provided + * in args. */ + public static final int WOULDBLOCK = 22; // Useful static instances for status codes that don't usually require any // additional arguments. @@ -72,6 +76,7 @@ public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED); public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING); public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED); + public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK); private int code; private long lastModified; @@ -93,6 +98,7 @@ codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded"); codeToName.put(new Integer(NOTFETCHING), "notfetching"); codeToName.put(new Integer(NOTMODIFIED), "notmodified"); + codeToName.put(new Integer(WOULDBLOCK), "wouldblock"); } public ProtocolStatus() { @@ -183,6 +189,7 @@ code == REDIR_EXCEEDED || code == RETRY || code == TEMP_MOVED || + code == WOULDBLOCK || code == PROTO_NOT_FOUND; } Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=431366&r1=431365&r2=431366&view=diff ============================================================================== --- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Aug 14 08:08:14 2006 @@ -125,6 +125,9 @@ /** Do we use HTTP/1.1? 
*/ protected boolean useHttp11 = false; + + /** Skip page if Crawl-Delay longer than this value. */ + protected long maxCrawlDelay = -1L; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -152,6 +155,7 @@ this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); + this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000); // backward-compatible default setting this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); this.useHttp11 = conf.getBoolean("http.http11", false); @@ -185,6 +189,14 @@ long crawlDelay = robots.getCrawlDelay(this, u); long delay = crawlDelay > 0 ? crawlDelay : serverDelay; + if (maxCrawlDelay >= 0 && delay > maxCrawlDelay) { + // skip this page, otherwise the thread would block for too long. + LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max=" + + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000)); + Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT, + null, null, this.conf); + return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK); + } String host = blockAddr(u, delay); Response response; try { ------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs