Author: snagel Date: Fri Mar 27 21:42:35 2015 New Revision: 1669692 URL: http://svn.apache.org/r1669692 Log: NUTCH-1941 Optional rolling http.agent.names
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/nutch-default.xml nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Mar 27 21:42:35 2015 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.4-SNAPSHOT +* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel) + * NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich via lewismc) * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) Modified: nutch/branches/2.x/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/branches/2.x/conf/nutch-default.xml (original) +++ nutch/branches/2.x/conf/nutch-default.xml Fri Mar 27 21:42:35 2015 @@ -162,6 +162,26 @@ </property> <property> + <name>http.agent.rotate</name> + <value>false</value> + <description> + If true, instead of http.agent.name, alternating agent names are + chosen from a list provided via http.agent.rotate.file. + </description> +</property> + +<property> + <name>http.agent.rotate.file</name> + <value>agents.txt</value> + <description> + File containing alternative user agent names to be used instead of + http.agent.name on a rotating basis if http.agent.rotate is true. + Each line of the file should contain exactly one agent + specification including name, version, description, URL, etc. + </description> +</property> + +<property> <name>http.agent.host</name> <value></value> <description>Name or IP address of the host on which the Nutch crawler Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Mar 27 21:42:35 2015 @@ -17,16 +17,22 @@ package org.apache.nutch.protocol.http.api; // JDK imports +import java.io.BufferedReader; import java.io.IOException; +import java.io.Reader; import java.net.URL; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; @@ -53,6 +59,8 @@ public abstract class HttpBase implement private HttpRobotRulesParser robots = null; + private ArrayList<String> userAgentNames = null; + /** The proxy hostname. */ protected String proxyHost = null; @@ -132,6 +140,45 @@ public abstract class HttpBase implement this.responseTime = conf.getBoolean("http.store.responsetime", true); this.robots.setConf(conf); + // NUTCH-1941: read list of alternating agent names + if (conf.getBoolean("http.agent.rotate", false)) { + String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); + BufferedReader br = null; + try { + Reader reader = conf.getConfResourceAsReader(agentsFile); + br = new BufferedReader(reader); + userAgentNames = new ArrayList<String>(); + String word = ""; + while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) + userAgentNames.add(word.trim()); + } + + if (userAgentNames.size() == 0) { + logger.warn("Empty list of user agents in http.agent.rotate.file {}", + agentsFile); + userAgentNames = null; + } + + } catch (Exception e) { + logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, + StringUtils.stringifyException(e)); + userAgentNames = null; + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + // ignore + } + } + } + if (userAgentNames == null) { + logger + .warn("Falling back to fixed user agent set via property http.agent.name"); + } + } + String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", @@ -298,6 +345,9 @@ public abstract class HttpBase implement } public String getUserAgent() { + if (userAgentNames!=null) { + return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1)); + } return userAgent; } Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Mar 27 21:42:35 2015 @@ -203,7 +203,7 @@ public class Http extends HttpBase { HostConfiguration hostConf = client.getHostConfiguration(); ArrayList<Header> headers = new ArrayList<Header>(); // Set the User Agent in the header - headers.add(new Header("User-Agent", userAgent)); + //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941 // prefer English headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3")); // prefer UTF-8 Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Mar 27 21:42:35 2015 @@ -30,6 +30,8 @@ import org.apache.commons.httpclient.coo import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpClient; + // Nutch imports import org.apache.nutch.metadata.Metadata; @@ -96,7 +98,9 @@ public class HttpResponse implements Res // XXX little danger in retrying... // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); try { - code = Http.getClient().executeMethod(get); + HttpClient client = Http.getClient(); + client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941 + code = client.executeMethod(get); Header[] heads = get.getResponseHeaders(); Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 27 21:42:35 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel) + * NUTCH-1959 Improving CommonCrawlFormat implementations (Giuseppe Totaro via mattmann) * NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Mar 27 21:42:35 2015 @@ -161,6 +161,26 @@ </property> <property> + <name>http.agent.rotate</name> + <value>false</value> + <description> + If true, instead of http.agent.name, alternating agent names are + chosen from a list provided via http.agent.rotate.file. + </description> +</property> + +<property> + <name>http.agent.rotate.file</name> + <value>agents.txt</value> + <description> + File containing alternative user agent names to be used instead of + http.agent.name on a rotating basis if http.agent.rotate is true. + Each line of the file should contain exactly one agent + specification including name, version, description, URL, etc. + </description> +</property> + +<property> <name>http.agent.host</name> <value></value> <description>Name or IP address of the host on which the Nutch crawler Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Mar 27 21:42:35 2015 @@ -17,12 +17,15 @@ package org.apache.nutch.protocol.http.api; // JDK imports +import java.io.BufferedReader; import java.io.IOException; +import java.io.Reader; import java.net.URL; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Set; - +import java.util.concurrent.ThreadLocalRandom; // Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,6 +40,7 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.DeflateUtils; +import org.apache.hadoop.util.StringUtils; // Hadoop imports import org.apache.hadoop.conf.Configuration; @@ -56,6 +60,8 @@ public abstract class HttpBase implement private HttpRobotRulesParser robots = null; + private ArrayList<String> userAgentNames = null; + /** The proxy hostname. */ protected String proxyHost = null; @@ -143,6 +149,45 @@ public abstract class HttpBase implement this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); this.robots.setConf(conf); + // NUTCH-1941: read list of alternating agent names + if (conf.getBoolean("http.agent.rotate", false)) { + String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); + BufferedReader br = null; + try { + Reader reader = conf.getConfResourceAsReader(agentsFile); + br = new BufferedReader(reader); + userAgentNames = new ArrayList<String>(); + String word = ""; + while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) + userAgentNames.add(word.trim()); + } + + if (userAgentNames.size() == 0) { + logger.warn("Empty list of user agents in http.agent.rotate.file {}", + agentsFile); + userAgentNames = null; + } + + } catch (Exception e) { + logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, + StringUtils.stringifyException(e)); + userAgentNames = null; + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + // ignore + } + } + } + if (userAgentNames == null) { + logger + .warn("Falling back to fixed user agent set via property http.agent.name"); + } + } + String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", @@ -312,6 +357,9 @@ public abstract class HttpBase implement } public String getUserAgent() { + if (userAgentNames!=null) { + return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1)); + } return userAgent; } Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Mar 27 21:42:35 2015 @@ -203,7 +203,7 @@ public class Http extends HttpBase { HostConfiguration hostConf = client.getHostConfiguration(); ArrayList<Header> headers = new ArrayList<Header>(); // Set the User Agent in the header - headers.add(new Header("User-Agent", userAgent)); + //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941 // prefer English headers.add(new Header("Accept-Language", acceptLanguage)); // prefer UTF-8 Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Mar 27 21:42:35 2015 @@ -29,6 +29,8 @@ import org.apache.commons.httpclient.coo import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpClient; + // Nutch imports import org.apache.nutch.crawl.CrawlDatum; @@ -95,7 +97,9 @@ public class HttpResponse implements Res // XXX little danger in retrying... // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); try { - code = Http.getClient().executeMethod(get); + HttpClient client = Http.getClient(); + client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941 + code = client.executeMethod(get); Header[] heads = get.getResponseHeaders();