Author: snagel
Date: Wed Jan 22 21:13:01 2014
New Revision: 1560512

URL: http://svn.apache.org/r1560512
Log:
NUTCH-1413 Record response time

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 22 21:13:01 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel)
+
 * NUTCH-1325 HostDB for Nutch (markus, tejasp)
 
 * NUTCH-1680 CrawlDbReader to dump minRetry value (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jan 22 21:13:01 2014
@@ -266,6 +266,16 @@
   </description>
 </property>
 
+<property>
+  <name>http.store.responsetime</name>
+  <value>true</value>
+  <description>Enables recording of the response time of the
+  host, i.e. the time elapsed between sending the request for a
+  page and receiving its response. The response time in milliseconds
+  is stored in the CrawlDb in CrawlDatum's meta data under the key &quot;_rs_&quot;.
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1560512&r1=1560511&r2=1560512&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Jan 22 21:13:01 2014
@@ -37,6 +37,7 @@ import org.apache.nutch.util.DeflateUtil
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
 // crawler-commons imports
@@ -47,7 +48,8 @@ import crawlercommons.robots.BaseRobotRu
  */
 public abstract class HttpBase implements Protocol {
   
-  
+  public static final Text RESPONSE_TIME = new Text("_rs_");
+
   public static final int BUFFER_SIZE = 8 * 1024;
   
   private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -92,6 +94,12 @@ public abstract class HttpBase implement
   
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
+
+  /**
+   * Record response time in CrawlDatum's meta data, see property
+   * http.store.responsetime.
+   */
+  protected boolean responseTime = true;    
   
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
@@ -123,6 +131,7 @@ public abstract class HttpBase implement
       this.accept = conf.get("http.accept", accept);
       // backward-compatible default setting
       this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+      this.responseTime = conf.getBoolean("http.store.responsetime", true);
       this.robots.setConf(conf);
       logConf();
   }
@@ -137,8 +146,15 @@ public abstract class HttpBase implement
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
+      
+      long startTime = System.currentTimeMillis();
       Response response = getResponse(u, datum, false); // make a request
       
+      if(this.responseTime) {
+        int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+        datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
+      }
+      
       int code = response.getCode();
       byte[] content = response.getContent();
       Content c = new Content(u.toString(), u.toString(),


Reply via email to