Author: snagel
Date: Wed Jan 22 21:13:01 2014
New Revision: 1560512
URL: http://svn.apache.org/r1560512
Log:
NUTCH-1413 Record response time
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 22 21:13:01 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel)
+
* NUTCH-1325 HostDB for Nutch (markus, tejasp)
* NUTCH-1680 CrawlDbReader to dump minRetry value (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jan 22 21:13:01 2014
@@ -266,6 +266,16 @@
</description>
</property>
+<property>
+ <name>http.store.responsetime</name>
+ <value>true</value>
+ <description>Enables us to record the response time of the
+ host, i.e. the time period between the start and the end of the
+ connection to a page's host. The response time in milliseconds
+ is stored in CrawlDb in CrawlDatum's meta data under key "_rs_".
+ </description>
+</property>
+
<!-- FTP properties -->
<property>
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Wed Jan 22 21:13:01 2014
@@ -37,6 +37,7 @@ import org.apache.nutch.util.DeflateUtil
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
// crawler-commons imports
@@ -47,7 +48,8 @@ import crawlercommons.robots.BaseRobotRu
*/
public abstract class HttpBase implements Protocol {
-
+ public static final Text RESPONSE_TIME = new Text("_rs_");
+
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -92,6 +94,12 @@ public abstract class HttpBase implement
/** Do we use HTTP/1.1? */
protected boolean useHttp11 = false;
+
+ /**
+ * Record response time in CrawlDatum's meta data, see property
+ * http.store.responsetime.
+ */
+ protected boolean responseTime = true;
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
@@ -123,6 +131,7 @@ public abstract class HttpBase implement
this.accept = conf.get("http.accept", accept);
// backward-compatible default setting
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+ this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.robots.setConf(conf);
logConf();
}
@@ -137,8 +146,15 @@ public abstract class HttpBase implement
String urlString = url.toString();
try {
URL u = new URL(urlString);
+
+ long startTime = System.currentTimeMillis();
Response response = getResponse(u, datum, false); // make a request
+ if(this.responseTime) {
+ int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+ datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
+ }
+
int code = response.getCode();
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),