Author: snagel
Date: Fri Mar 27 21:42:35 2015
New Revision: 1669692
URL: http://svn.apache.org/r1669692
Log:
NUTCH-1941 Optional rolling http.agent.names
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Mar 27 21:42:35 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.4-SNAPSHOT
+* NUTCH-1941 Optional rolling http.agent.names (Asitang Mishra, lewismc via
snagel)
+
* NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich
via lewismc)
* NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Mar 27 21:42:35 2015
@@ -162,6 +162,26 @@
</property>
<property>
+ <name>http.agent.rotate</name>
+ <value>false</value>
+ <description>
+ If true, instead of http.agent.name, alternating agent names are
+ chosen from a list provided via http.agent.rotate.file.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.rotate.file</name>
+ <value>agents.txt</value>
+ <description>
+ File containing alternative user agent names to be used instead of
+ http.agent.name on a rotating basis if http.agent.rotate is true.
+ Each line of the file should contain exactly one agent
+ specification including name, version, description, URL, etc.
+ </description>
+</property>
+
+<property>
<name>http.agent.host</name>
<value></value>
<description>Name or IP address of the host on which the Nutch crawler
Modified:
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Fri Mar 27 21:42:35 2015
@@ -17,16 +17,22 @@
package org.apache.nutch.protocol.http.api;
// JDK imports
+import java.io.BufferedReader;
import java.io.IOException;
+import java.io.Reader;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -53,6 +59,8 @@ public abstract class HttpBase implement
private HttpRobotRulesParser robots = null;
+ private ArrayList<String> userAgentNames = null;
+
/** The proxy hostname. */
protected String proxyHost = null;
@@ -132,6 +140,45 @@ public abstract class HttpBase implement
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.robots.setConf(conf);
+ // NUTCH-1941: read list of alternating agent names
+ if (conf.getBoolean("http.agent.rotate", false)) {
+ String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+ BufferedReader br = null;
+ try {
+ Reader reader = conf.getConfResourceAsReader(agentsFile);
+ br = new BufferedReader(reader);
+ userAgentNames = new ArrayList<String>();
+ String word = "";
+ while ((word = br.readLine()) != null) {
+ if (!word.trim().isEmpty())
+ userAgentNames.add(word.trim());
+ }
+
+ if (userAgentNames.size() == 0) {
+ logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+ agentsFile);
+ userAgentNames = null;
+ }
+
+ } catch (Exception e) {
+ logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+ StringUtils.stringifyException(e));
+ userAgentNames = null;
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ }
+ if (userAgentNames == null) {
+ logger
+ .warn("Falling back to fixed user agent set via property
http.agent.name");
+ }
+ }
+
String[] protocols = conf.getStrings("http.tls.supported.protocols",
"TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
@@ -298,6 +345,9 @@ public abstract class HttpBase implement
}
+ /**
+ * Returns the user agent string to use for a request. When a list of
+ * alternating agent names has been loaded (userAgentNames is non-null),
+ * one entry is picked at random; otherwise the fixed userAgent is returned.
+ */
public String getUserAgent() {
+ if (userAgentNames!=null) {
+ // nextInt(bound) excludes the bound itself, so the bound must be size():
+ // size()-1 never selects the last entry and throws
+ // IllegalArgumentException for a single-entry list (bound 0).
+ return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+ }
return userAgent;
}
Modified:
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Fri Mar 27 21:42:35 2015
@@ -203,7 +203,7 @@ public class Http extends HttpBase {
HostConfiguration hostConf = client.getHostConfiguration();
ArrayList<Header> headers = new ArrayList<Header>();
// Set the User Agent in the header
- headers.add(new Header("User-Agent", userAgent));
+ //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
// prefer English
headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
// prefer UTF-8
Modified:
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Fri Mar 27 21:42:35 2015
@@ -30,6 +30,8 @@ import org.apache.commons.httpclient.coo
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
// Nutch imports
import org.apache.nutch.metadata.Metadata;
@@ -96,7 +98,9 @@ public class HttpResponse implements Res
// XXX little danger in retrying...
// params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
try {
- code = Http.getClient().executeMethod(get);
+ HttpClient client = Http.getClient();
+ client.getParams().setParameter("http.useragent", http.getUserAgent());
// NUTCH-1941
+ code = client.executeMethod(get);
Header[] heads = get.getResponseHeaders();
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar 27 21:42:35 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1941 Optional rolling http.agent.names (Asitang Mishra, lewismc via
snagel)
+
* NUTCH-1959 Improving CommonCrawlFormat implementations (Giuseppe Totaro via
mattmann)
* NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro
via mattmann)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Mar 27 21:42:35 2015
@@ -161,6 +161,26 @@
</property>
<property>
+ <name>http.agent.rotate</name>
+ <value>false</value>
+ <description>
+ If true, instead of http.agent.name, alternating agent names are
+ chosen from a list provided via http.agent.rotate.file.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.rotate.file</name>
+ <value>agents.txt</value>
+ <description>
+ File containing alternative user agent names to be used instead of
+ http.agent.name on a rotating basis if http.agent.rotate is true.
+ Each line of the file should contain exactly one agent
+ specification including name, version, description, URL, etc.
+ </description>
+</property>
+
+<property>
<name>http.agent.host</name>
<value></value>
<description>Name or IP address of the host on which the Nutch crawler
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Fri Mar 27 21:42:35 2015
@@ -17,12 +17,15 @@
package org.apache.nutch.protocol.http.api;
// JDK imports
+import java.io.BufferedReader;
import java.io.IOException;
+import java.io.Reader;
import java.net.URL;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-
+import java.util.concurrent.ThreadLocalRandom;
// Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,6 +40,7 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.DeflateUtils;
+import org.apache.hadoop.util.StringUtils;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
@@ -56,6 +60,8 @@ public abstract class HttpBase implement
private HttpRobotRulesParser robots = null;
+ private ArrayList<String> userAgentNames = null;
+
/** The proxy hostname. */
protected String proxyHost = null;
@@ -143,6 +149,45 @@ public abstract class HttpBase implement
this.enableIfModifiedsinceHeader =
conf.getBoolean("http.enable.if.modified.since.header", true);
this.robots.setConf(conf);
+ // NUTCH-1941: read list of alternating agent names
+ if (conf.getBoolean("http.agent.rotate", false)) {
+ String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+ BufferedReader br = null;
+ try {
+ Reader reader = conf.getConfResourceAsReader(agentsFile);
+ br = new BufferedReader(reader);
+ userAgentNames = new ArrayList<String>();
+ String word = "";
+ while ((word = br.readLine()) != null) {
+ if (!word.trim().isEmpty())
+ userAgentNames.add(word.trim());
+ }
+
+ if (userAgentNames.size() == 0) {
+ logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+ agentsFile);
+ userAgentNames = null;
+ }
+
+ } catch (Exception e) {
+ logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+ StringUtils.stringifyException(e));
+ userAgentNames = null;
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ }
+ if (userAgentNames == null) {
+ logger
+ .warn("Falling back to fixed user agent set via property
http.agent.name");
+ }
+ }
+
String[] protocols = conf.getStrings("http.tls.supported.protocols",
"TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
@@ -312,6 +357,9 @@ public abstract class HttpBase implement
}
+ /**
+ * Returns the user agent string to use for a request. When a list of
+ * alternating agent names has been loaded (userAgentNames is non-null),
+ * one entry is picked at random; otherwise the fixed userAgent is returned.
+ */
public String getUserAgent() {
+ if (userAgentNames!=null) {
+ // nextInt(bound) excludes the bound itself, so the bound must be size():
+ // size()-1 never selects the last entry and throws
+ // IllegalArgumentException for a single-entry list (bound 0).
+ return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+ }
return userAgent;
}
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
(original)
+++
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Fri Mar 27 21:42:35 2015
@@ -203,7 +203,7 @@ public class Http extends HttpBase {
HostConfiguration hostConf = client.getHostConfiguration();
ArrayList<Header> headers = new ArrayList<Header>();
// Set the User Agent in the header
- headers.add(new Header("User-Agent", userAgent));
+ //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
// prefer English
headers.add(new Header("Accept-Language", acceptLanguage));
// prefer UTF-8
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Fri Mar 27 21:42:35 2015
@@ -29,6 +29,8 @@ import org.apache.commons.httpclient.coo
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
@@ -95,7 +97,9 @@ public class HttpResponse implements Res
// XXX little danger in retrying...
// params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
try {
- code = Http.getClient().executeMethod(get);
+ HttpClient client = Http.getClient();
+ client.getParams().setParameter("http.useragent", http.getUserAgent());
// NUTCH-1941
+ code = client.executeMethod(get);
Header[] heads = get.getResponseHeaders();