[ https://issues.apache.org/jira/browse/NUTCH-2618?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16549239#comment-16549239 ]

ASF GitHub Bot commented on NUTCH-2618:
---------------------------------------

sebastian-nagel closed pull request #360: NUTCH-2618 protocol-okhttp not to use http.timeout for max duration to fetch document
URL: https://github.com/apache/nutch/pull/360
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ccce56b20..cb3a2a804 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -223,6 +223,20 @@
   </description>
 </property>
 
+<property>
+  <name>http.time.limit</name>
+  <value>-1</value>
+  <description>The time limit in seconds to fetch a single document.
+  If this value is nonnegative (>=0), the HTTP protocol implementation
+  will stop reading from a socket after http.time.limit seconds have
+  been spent for fetching this document.  The HTTP response is then
+  marked as truncated.  The http.time.limit should be set to a longer
+  time period than http.timeout, as it applies to the entire duration
+  to fetch a document, not only the network timeout of a single I/O
+  operation.  Note: supported only by protocol-okhttp.
+  </description>
+</property>
+
 <property>
   <name>http.proxy.host</name>
   <value></value>
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 7096c934d..92d41e328 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -47,10 +47,30 @@
   public static final String FETCH_TIME = "nutch.fetch.time";
 
   /**
-   * Key to hold boolean whether content has been trimmed because it exceeds
-   * <code>http.content.limit</code>
+   * Key to hold boolean whether content has been truncated, e.g., because it
+   * exceeds <code>http.content.limit</code>
    */
-  public static final String TRIMMED_CONTENT = "http.content.trimmed";
+  public static final String TRUNCATED_CONTENT = "http.content.truncated";
+
+  /**
+   * Key to hold reason why content has been truncated, see
+   * {@link TruncatedContentReason}
+   */
+  public static final String TRUNCATED_CONTENT_REASON = "http.content.truncated.reason";
+
+  public static enum TruncatedContentReason {
+    NOT_TRUNCATED,
+    /** fetch exceeded configured http.content.limit */
+    LENGTH,
+    /** fetch exceeded configured http.time.limit */
+    TIME,
+    /** network disconnect during fetch */
+    DISCONNECT,
+    /** implementation internal reason */
+    INTERNAL,
+    /** unknown reason */
+    UNSPECIFIED
+  };
 
   /** Returns the URL used to retrieve this response. */
   public URL getUrl();
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 0bfbff4cc..83fc8da82 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -88,6 +88,9 @@
   /** The length limit for downloaded content, in bytes. */
   protected int maxContent = 64 * 1024;
 
+  /** The time limit to download the entire content, in seconds. */
+  protected int maxDuration = 300;
+
   /** The Nutch 'User-Agent' request header */
   protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
       "http://nutch.apache.org/bot.html";, "[email protected]");
@@ -186,6 +189,7 @@ public void setConf(Configuration conf) {
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
     this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    this.maxDuration = conf.getInt("http.time.limit", -1);
     this.userAgent = getAgentString(conf.get("http.agent.name"),
         conf.get("http.agent.version"), conf.get("http.agent.description"),
         conf.get("http.agent.url"), conf.get("http.agent.email"));
@@ -442,6 +446,14 @@ public int getMaxContent() {
     return maxContent;
   }
 
+  /**
+   * The time limit to download the entire content, in seconds. See the property
+   * <code>http.time.limit</code>.
+   */
+  public int getMaxDuration() {
+    return maxDuration;
+  }
+
   public String getUserAgent() {
     if (userAgentNames != null) {
       return userAgentNames
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index da24d7ca2..34bff3ae0 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -20,8 +20,8 @@
 import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.util.Base64;
+import java.util.Locale;
 
-import org.apache.commons.lang.mutable.MutableBoolean;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
@@ -46,6 +46,27 @@
   private int code;
   private Metadata headers = new Metadata();
 
+  /** Container to store whether and why content has been truncated */
+  public static class TruncatedContent {
+
+    private TruncatedContentReason value = TruncatedContentReason.NOT_TRUNCATED;
+
+    public TruncatedContent() {
+    }
+
+    public void setReason(TruncatedContentReason val) {
+      value = val;
+    }
+
+    public TruncatedContentReason getReason() {
+      return value;
+    }
+
+    public boolean booleanValue() {
+      return value != TruncatedContentReason.NOT_TRUNCATED;
+    }
+  }
+
   public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
       throws ProtocolException, IOException {
 
@@ -91,16 +112,19 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
     LOG.debug("{} - {} {} {}", url, response.protocol(), response.code(),
         response.message());
 
-    MutableBoolean trimmed = new MutableBoolean();
-    content = toByteArray(response.body(), trimmed, okhttp.getMaxContent(),
-        okhttp.getTimeout());
+    TruncatedContent truncated = new TruncatedContent();
+    content = toByteArray(response.body(), truncated, okhttp.getMaxContent(),
+        okhttp.getMaxDuration());
     responsemetadata.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));
-    if (trimmed.booleanValue()) {
+    if (truncated.booleanValue()) {
       if (!call.isCanceled()) {
         call.cancel();
       }
-      responsemetadata.set(TRIMMED_CONTENT, "true");
-      LOG.debug("HTTP content trimmed to {} bytes", content.length);
+      responsemetadata.set(TRUNCATED_CONTENT, "true");
+      responsemetadata.set(TRUNCATED_CONTENT_REASON,
+          truncated.getReason().toString().toLowerCase(Locale.ROOT));
+      LOG.debug("HTTP content truncated to {} bytes (reason: {})",
+          content.length, truncated.getReason());
     }
 
     code = response.code();
@@ -109,15 +133,15 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
   }
 
   private final byte[] toByteArray(final ResponseBody responseBody,
-      MutableBoolean trimmed, int maxContent, int timeout) throws IOException {
+      TruncatedContent truncated, int maxContent, int maxDuration) throws IOException {
 
     if (responseBody == null) {
       return new byte[] {};
     }
 
     long endDueFor = -1;
-    if (timeout != -1) {
-      endDueFor = System.currentTimeMillis() + timeout;
+    if (maxDuration != -1) {
+      endDueFor = System.currentTimeMillis() + (maxDuration * 1000);
     }
 
     int maxContentBytes = Integer.MAX_VALUE;
@@ -143,19 +167,19 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
         break;
       }
       if (endDueFor != -1 && endDueFor <= System.currentTimeMillis()) {
-        LOG.debug("timeout reached");
-        trimmed.setValue(true);
+        LOG.debug("max. fetch duration reached");
+        truncated.setReason(TruncatedContentReason.TIME);
         break;
       }
       if (contentBytesBuffered > maxContentBytes) {
         LOG.debug("content limit reached");
-        trimmed.setValue(true);
+        truncated.setReason(TruncatedContentReason.LENGTH);
       }
     }
     int bytesToCopy = contentBytesBuffered;
     if (maxContent != -1 && contentBytesBuffered > maxContent) {
       // okhttp's internal buffer is larger than maxContent
-      trimmed.setValue(true);
+      truncated.setReason(TruncatedContentReason.LENGTH);
       bytesToCopy = maxContentBytes;
     }
     byte[] arr = new byte[bytesToCopy];
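
Not part of the patch: a minimal sketch of how downstream code might read the
two metadata keys set above by OkHttpResponse, assuming they are propagated
unchanged with the fetched content. The helper class and method names are
invented for illustration only.

    import org.apache.nutch.metadata.Metadata;
    import org.apache.nutch.net.protocols.Response;

    /** Hypothetical helper, not part of the patch. */
    public class TruncationCheck {

      /** True if the protocol layer marked the content as truncated. */
      public static boolean wasTruncated(Metadata responseMeta) {
        return "true".equals(responseMeta.get(Response.TRUNCATED_CONTENT));
      }

      /** Lower-cased reason ("time", "length", ...) or null if not set. */
      public static String truncationReason(Metadata responseMeta) {
        return responseMeta.get(Response.TRUNCATED_CONTENT_REASON);
      }
    }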


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> protocol-okhttp not to use http.timeout for max duration to fetch document
> --------------------------------------------------------------------------
>
>                 Key: NUTCH-2618
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2618
>             Project: Nutch
>          Issue Type: Bug
>          Components: protocol
>    Affects Versions: 1.15
>            Reporter: Sebastian Nagel
>            Priority: Major
>             Fix For: 1.15
>
>
> Protocol-okhttp (NUTCH-2576) uses the HTTP network timeout ({{http.timeout}})
> as the time limit for the maximum duration to fetch a document. The timeout
> value (default = 10 sec.) is usually too small to fetch larger documents. The
> maximum fetch duration should be separately configurable, e.g., by a property
> {{http.time.limit}} (similar to {{http.content.limit}}).
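
For illustration only (not from the issue or the patch): a sketch of how the
two settings differ once the fix is in place. The values are examples, not
defaults; the snippet merely sets the properties the way HttpBase.setConf()
reads them in the diff above.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    public class TimeLimitExample {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // Per-I/O network timeout, in milliseconds (existing property).
        conf.setInt("http.timeout", 10000);
        // New per-document limit, in seconds; -1 (the default) disables it.
        conf.setInt("http.time.limit", 300);
        System.out.println("http.time.limit = " + conf.getInt("http.time.limit", -1));
      }
    }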



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
