[
https://issues.apache.org/jira/browse/NUTCH-2619?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16549232#comment-16549232
]
ASF GitHub Bot commented on NUTCH-2619:
---------------------------------------
sebastian-nagel closed pull request #361: NUTCH-2619 protocol-okhttp: allow to
keep partially fetched docs as truncated
URL: https://github.com/apache/nutch/pull/361
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ccce56b20..2c07c8a43 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -223,6 +223,32 @@
</description>
</property>
+<property>
+ <name>http.time.limit</name>
+ <value>-1</value>
+ <description>The time limit in seconds to fetch a single document.
+ If this value is nonnegative (>=0), the HTTP protocol implementation
+ will stop reading from the socket after http.time.limit seconds have
+ been spent fetching this document, and the HTTP response is marked
+ as truncated. The http.time.limit should be set to a longer time
+ period than http.timeout, as it applies to the entire fetch of a
+ document, not only to the network timeout of a single I/O operation.
+ Note: supported only by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.partial.truncated</name>
+ <value>false</value>
+ <description>
+ If true, the HTTP protocol implementation may store the content of
+ partial fetches and mark the response as truncated, instead of
+ throwing an exception which causes the fetch to fail. This makes it
+ possible to use the data which has already been fetched, instead of
+ retrying the fetch later. Note: supported only by protocol-okhttp.
+ </description>
+</property>
+
<property>
<name>http.proxy.host</name>
<value></value>
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 7096c934d..a609b8f0f 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -47,10 +47,30 @@
public static final String FETCH_TIME = "nutch.fetch.time";
/**
- * Key to hold boolean whether content has been trimmed because it exceeds
- * <code>http.content.limit</code>
+ * Key to hold boolean whether content has been truncated, e.g., because it
+ * exceeds <code>http.content.limit</code>
*/
- public static final String TRIMMED_CONTENT = "http.content.trimmed";
+ public static final String TRUNCATED_CONTENT = "http.content.truncated";
+
+ /**
+ * Key to hold reason why content has been truncated, see
+ * {@link TruncatedContentReason}
+ */
+ public static final String TRUNCATED_CONTENT_REASON = "http.content.truncated.reason";
+
+ public static enum TruncatedContentReason {
+ NOT_TRUNCATED,
+ /** fetch exceeded configured http.content.limit */
+ LENGTH,
+ /** fetch exceeded configured http.time.limit */
+ TIME,
+ /** network disconnect or timeout during fetch */
+ DISCONNECT,
+ /** implementation internal reason */
+ INTERNAL,
+ /** unknown reason */
+ UNSPECIFIED
+ };
/** Returns the URL used to retrieve this response. */
public URL getUrl();
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 0bfbff4cc..42f479312 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -88,6 +88,12 @@
/** The length limit for downloaded content, in bytes. */
protected int maxContent = 64 * 1024;
+ /** The time limit to download the entire content, in seconds. */
+ protected int maxDuration = 300;
+
+ /** Whether to save partial fetches as truncated content. */
+ protected boolean partialAsTruncated = false;
+
/** The Nutch 'User-Agent' request header */
protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
"http://nutch.apache.org/bot.html", "[email protected]");
@@ -186,6 +192,9 @@ public void setConf(Configuration conf) {
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 10000);
this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+ this.maxDuration = conf.getInt("http.time.limit", -1);
+ this.partialAsTruncated = conf
+ .getBoolean("http.partial.truncated", false);
this.userAgent = getAgentString(conf.get("http.agent.name"),
conf.get("http.agent.version"), conf.get("http.agent.description"),
conf.get("http.agent.url"), conf.get("http.agent.email"));
@@ -195,7 +204,7 @@ public void setConf(Configuration conf) {
this.accept = conf.get("http.accept", accept).trim();
this.mimeTypes = new MimeUtil(conf);
// backward-compatible default setting
- this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+ this.useHttp11 = conf.getBoolean("http.useHttp11", true);
this.useHttp2 = conf.getBoolean("http.useHttp2", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.storeIPAddress = conf.getBoolean("store.ip.address", false);
@@ -442,6 +451,22 @@ public int getMaxContent() {
return maxContent;
}
+ /**
+ * The time limit to download the entire content, in seconds. See the property
+ * <code>http.time.limit</code>.
+ */
+ public int getMaxDuration() {
+ return maxDuration;
+ }
+
+ /**
+ * Whether to save partial fetches as truncated content, cf. the property
+ * <code>http.partial.truncated</code>.
+ */
+ public boolean isStorePartialAsTruncated() {
+ return partialAsTruncated;
+ }
+
public String getUserAgent() {
if (userAgentNames != null) {
return userAgentNames
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index da24d7ca2..b720c287f 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -20,8 +20,8 @@
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.Base64;
+import java.util.Locale;
-import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
@@ -46,6 +46,27 @@
private int code;
private Metadata headers = new Metadata();
+ /** Container to store whether and why content has been truncated */
+ public static class TruncatedContent {
+
+ private TruncatedContentReason value = TruncatedContentReason.NOT_TRUNCATED;
+
+ public TruncatedContent() {
+ }
+
+ public void setReason(TruncatedContentReason val) {
+ value = val;
+ }
+
+ public TruncatedContentReason getReason() {
+ return value;
+ }
+
+ public boolean booleanValue() {
+ return value != TruncatedContentReason.NOT_TRUNCATED;
+ }
+ }
+
public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
@@ -91,16 +112,19 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
LOG.debug("{} - {} {} {}", url, response.protocol(), response.code(),
response.message());
- MutableBoolean trimmed = new MutableBoolean();
- content = toByteArray(response.body(), trimmed, okhttp.getMaxContent(),
- okhttp.getTimeout());
+ TruncatedContent truncated = new TruncatedContent();
+ content = toByteArray(response.body(), truncated, okhttp.getMaxContent(),
+ okhttp.getMaxDuration(), okhttp.isStorePartialAsTruncated());
responsemetadata.add(FETCH_TIME,
Long.toString(System.currentTimeMillis()));
- if (trimmed.booleanValue()) {
+ if (truncated.booleanValue()) {
if (!call.isCanceled()) {
call.cancel();
}
- responsemetadata.set(TRIMMED_CONTENT, "true");
- LOG.debug("HTTP content trimmed to {} bytes", content.length);
+ responsemetadata.set(TRUNCATED_CONTENT, "true");
+ responsemetadata.set(TRUNCATED_CONTENT_REASON,
+ truncated.getReason().toString().toLowerCase(Locale.ROOT));
+ LOG.debug("HTTP content truncated to {} bytes (reason: {})",
+ content.length, truncated.getReason());
}
code = response.code();
@@ -109,15 +133,16 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
}
private final byte[] toByteArray(final ResponseBody responseBody,
- MutableBoolean trimmed, int maxContent, int timeout) throws IOException {
+ TruncatedContent truncated, int maxContent, int maxDuration,
+ boolean partialAsTruncated) throws IOException {
if (responseBody == null) {
return new byte[] {};
}
long endDueFor = -1;
- if (timeout != -1) {
- endDueFor = System.currentTimeMillis() + timeout;
+ if (maxDuration != -1) {
+ endDueFor = System.currentTimeMillis() + (maxDuration * 1000);
}
int maxContentBytes = Integer.MAX_VALUE;
@@ -132,7 +157,17 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
while (contentBytesBuffered < maxContentBytes) {
contentBytesRequested += Math.min(bufferGrowStepBytes,
(maxContentBytes - contentBytesBuffered));
- boolean success = source.request(contentBytesRequested);
+ boolean success = false;
+ try {
+ success = source.request(contentBytesRequested);
+ } catch (IOException e) {
+ if (partialAsTruncated && contentBytesBuffered > 0) {
+ // treat already fetched content as truncated
+ truncated.setReason(TruncatedContentReason.DISCONNECT);
+ } else {
+ throw e;
+ }
+ }
contentBytesBuffered = (int) source.buffer().size();
if (LOG.isDebugEnabled()) {
LOG.debug("total bytes requested = {}, buffered = {}",
@@ -143,19 +178,19 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
break;
}
if (endDueFor != -1 && endDueFor <= System.currentTimeMillis()) {
- LOG.debug("timeout reached");
- trimmed.setValue(true);
+ LOG.debug("max. fetch duration reached");
+ truncated.setReason(TruncatedContentReason.TIME);
break;
}
if (contentBytesBuffered > maxContentBytes) {
LOG.debug("content limit reached");
- trimmed.setValue(true);
+ truncated.setReason(TruncatedContentReason.LENGTH);
}
}
int bytesToCopy = contentBytesBuffered;
if (maxContent != -1 && contentBytesBuffered > maxContent) {
// okhttp's internal buffer is larger than maxContent
- trimmed.setValue(true);
+ truncated.setReason(TruncatedContentReason.LENGTH);
bytesToCopy = maxContentBytes;
}
byte[] arr = new byte[bytesToCopy];
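
For illustration, a minimal sketch of how the new properties could be enabled
in nutch-site.xml (the 150-second limit is an assumed example value, not part
of this patch):

<property>
  <name>http.time.limit</name>
  <value>150</value>
</property>
<property>
  <name>http.partial.truncated</name>
  <value>true</value>
</property>

Per the descriptions in the diff above, both properties are read by HttpBase
but honored only by protocol-okhttp.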
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> protocol-okhttp: allow to keep partially fetched docs as truncated
> ------------------------------------------------------------------
>
> Key: NUTCH-2619
> URL: https://issues.apache.org/jira/browse/NUTCH-2619
> Project: Nutch
> Issue Type: Improvement
> Components: protocol
> Affects Versions: 1.15
> Reporter: Sebastian Nagel
> Priority: Minor
> Fix For: 1.16
>
>
> Sometimes fetching a larger document times out after some content has already
> been downloaded. For some use cases it may be better to save this partially
> fetched document and mark it as truncated, instead of retrying the fetch
> later (which may fail again for the same reason).
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)