Author: snagel Date: Wed Jan 22 21:13:01 2014 New Revision: 1560512 URL: http://svn.apache.org/r1560512 Log: NUTCH-1413 Record response time
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560512&r1=1560511&r2=1560512&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 22 21:13:01 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel) + * NUTCH-1325 HostDB for Nutch (markus, tejasp) * NUTCH-1680 CrawlDbReader to dump minRetry value (markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1560512&r1=1560511&r2=1560512&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed Jan 22 21:13:01 2014 @@ -266,6 +266,16 @@ </description> </property> +<property> + <name>http.store.responsetime</name> + <value>true</value> + <description>Enables us to record the response time of the + host which is the time period between start connection to end + connection of a pages host. The response time in milliseconds + is stored in CrawlDb in CrawlDatum's meta data under key "_rs_" + </description> +</property> + <!-- FTP properties --> <property> Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1560512&r1=1560511&r2=1560512&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Jan 22 21:13:01 2014 @@ -37,6 +37,7 @@ import org.apache.nutch.util.DeflateUtil // Hadoop imports import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; // crawler-commons imports @@ -47,7 +48,8 @@ import crawlercommons.robots.BaseRobotRu */ public abstract class HttpBase implements Protocol { - + public static final Text RESPONSE_TIME = new Text("_rs_"); + public static final int BUFFER_SIZE = 8 * 1024; private static final byte[] EMPTY_CONTENT = new byte[0]; @@ -92,6 +94,12 @@ public abstract class HttpBase implement /** Do we use HTTP/1.1? */ protected boolean useHttp11 = false; + + /** + * Record response time in CrawlDatum's meta data, see property + * http.store.responsetime. + */ + protected boolean responseTime = true; /** Skip page if Crawl-Delay longer than this value. */ protected long maxCrawlDelay = -1L; @@ -123,6 +131,7 @@ public abstract class HttpBase implement this.accept = conf.get("http.accept", accept); // backward-compatible default setting this.useHttp11 = conf.getBoolean("http.useHttp11", false); + this.responseTime = conf.getBoolean("http.store.responsetime", true); this.robots.setConf(conf); logConf(); } @@ -137,8 +146,15 @@ public abstract class HttpBase implement String urlString = url.toString(); try { URL u = new URL(urlString); + + long startTime = System.currentTimeMillis(); Response response = getResponse(u, datum, false); // make a request + if(this.responseTime) { + int elapsedTime = (int) (System.currentTimeMillis() - startTime); + datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime)); + } + int code = response.getCode(); byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(),