Author: snagel
Date: Fri Mar 27 21:42:35 2015
New Revision: 1669692

URL: http://svn.apache.org/r1669692
Log:
NUTCH-1941 Optional rolling http.agent.names

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Mar 27 21:42:35 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel)
+
 * NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich via lewismc)
 
 * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Mar 27 21:42:35 2015
@@ -162,6 +162,26 @@
 </property>
 
 <property>
+  <name>http.agent.rotate</name>
+  <value>false</value>
+  <description>
+    If true, instead of http.agent.name, alternating agent names are
+    chosen from a list provided via http.agent.rotate.file.
+  </description>
+</property>
+
+<property>
+  <name>http.agent.rotate.file</name>
+  <value>agents.txt</value>
+  <description>
+    File containing alternative user agent names to be used instead of
+    http.agent.name on a rotating basis if http.agent.rotate is true.
+    Each line of the file should contain exactly one agent
+    specification including name, version, description, URL, etc.
+  </description>
+</property>
+
+<property>
   <name>http.agent.host</name>
   <value></value>
   <description>Name or IP address of the host on which the Nutch crawler

Modified: 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Fri Mar 27 21:42:35 2015
@@ -17,16 +17,22 @@
 package org.apache.nutch.protocol.http.api;
 
 // JDK imports
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.Reader;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -53,6 +59,8 @@ public abstract class HttpBase implement
 
   private HttpRobotRulesParser robots = null;
 
+  private ArrayList<String> userAgentNames = null;
+
   /** The proxy hostname. */
   protected String proxyHost = null;
 
@@ -132,6 +140,45 @@ public abstract class HttpBase implement
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
     this.robots.setConf(conf);
 
+    // NUTCH-1941: read list of alternating agent names
+    if (conf.getBoolean("http.agent.rotate", false)) {
+      String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+      BufferedReader br = null;
+      try {
+        Reader reader = conf.getConfResourceAsReader(agentsFile);
+        br = new BufferedReader(reader);
+        userAgentNames = new ArrayList<String>();
+        String word = "";
+        while ((word = br.readLine()) != null) {
+          if (!word.trim().isEmpty())
+            userAgentNames.add(word.trim());
+        }
+
+        if (userAgentNames.size() == 0) {
+          logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+              agentsFile);
+          userAgentNames = null;
+        }
+
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+            StringUtils.stringifyException(e));
+        userAgentNames = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore
+          }
+        }
+      }
+      if (userAgentNames == null) {
+        logger
+            .warn("Falling back to fixed user agent set via property 
http.agent.name");
+      }
+    }
+
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
         "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
     String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
@@ -298,6 +345,9 @@ public abstract class HttpBase implement
   }
 
   public String getUserAgent() {
+    if (userAgentNames != null) { // rotation enabled; nextInt's bound is exclusive, so pass size()
+      return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+    }
     return userAgent;
   }
 

Modified: 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Fri Mar 27 21:42:35 2015
@@ -203,7 +203,7 @@ public class Http extends HttpBase {
     HostConfiguration hostConf = client.getHostConfiguration();
     ArrayList<Header> headers = new ArrayList<Header>();
     // Set the User Agent in the header
-    headers.add(new Header("User-Agent", userAgent));
+    //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
     // prefer English
     headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
     // prefer UTF-8

Modified: 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Fri Mar 27 21:42:35 2015
@@ -30,6 +30,8 @@ import org.apache.commons.httpclient.coo
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpMethodParams;
 import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
 
 // Nutch imports
 import org.apache.nutch.metadata.Metadata;
@@ -96,7 +98,9 @@ public class HttpResponse implements Res
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     try {
-      code = Http.getClient().executeMethod(get);
+      HttpClient client = Http.getClient();
+      client.getParams().setParameter("http.useragent", http.getUserAgent()); 
// NUTCH-1941
+      code = client.executeMethod(get);
 
       Header[] heads = get.getResponseHeaders();
 

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar 27 21:42:35 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1941 Optional rolling http.agent.name's (Asitang Mishra, lewismc via snagel)
+
 * NUTCH-1959 Improving CommonCrawlFormat implementations (Giuseppe Totaro via mattmann)
 
 * NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Mar 27 21:42:35 2015
@@ -161,6 +161,26 @@
 </property>
 
 <property>
+  <name>http.agent.rotate</name>
+  <value>false</value>
+  <description>
+    If true, instead of http.agent.name, alternating agent names are
+    chosen from a list provided via http.agent.rotate.file.
+  </description>
+</property>
+
+<property>
+  <name>http.agent.rotate.file</name>
+  <value>agents.txt</value>
+  <description>
+    File containing alternative user agent names to be used instead of
+    http.agent.name on a rotating basis if http.agent.rotate is true.
+    Each line of the file should contain exactly one agent
+    specification including name, version, description, URL, etc.
+  </description>
+</property>
+
+<property>
   <name>http.agent.host</name>
   <value></value>
   <description>Name or IP address of the host on which the Nutch crawler

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Fri Mar 27 21:42:35 2015
@@ -17,12 +17,15 @@
 package org.apache.nutch.protocol.http.api;
 
 // JDK imports
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.Reader;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
-
+import java.util.concurrent.ThreadLocalRandom;
 // Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,6 +40,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.DeflateUtils;
+import org.apache.hadoop.util.StringUtils;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
@@ -56,6 +60,8 @@ public abstract class HttpBase implement
 
   private HttpRobotRulesParser robots = null;
 
+  private ArrayList<String> userAgentNames = null;
+
   /** The proxy hostname. */
   protected String proxyHost = null;
 
@@ -143,6 +149,45 @@ public abstract class HttpBase implement
     this.enableIfModifiedsinceHeader = 
conf.getBoolean("http.enable.if.modified.since.header", true);
     this.robots.setConf(conf);
 
+    // NUTCH-1941: read list of alternating agent names
+    if (conf.getBoolean("http.agent.rotate", false)) {
+      String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+      BufferedReader br = null;
+      try {
+        Reader reader = conf.getConfResourceAsReader(agentsFile);
+        br = new BufferedReader(reader);
+        userAgentNames = new ArrayList<String>();
+        String word = "";
+        while ((word = br.readLine()) != null) {
+          if (!word.trim().isEmpty())
+            userAgentNames.add(word.trim());
+        }
+
+        if (userAgentNames.size() == 0) {
+          logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+              agentsFile);
+          userAgentNames = null;
+        }
+
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+            StringUtils.stringifyException(e));
+        userAgentNames = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore
+          }
+        }
+      }
+      if (userAgentNames == null) {
+        logger
+            .warn("Falling back to fixed user agent set via property 
http.agent.name");
+      }
+    }
+
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
         "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
     String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
@@ -312,6 +357,9 @@ public abstract class HttpBase implement
   }
 
   public String getUserAgent() {
+    if (userAgentNames != null) { // rotation enabled; nextInt's bound is exclusive, so pass size()
+      return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+    }
     return userAgent;
   }
 

Modified: 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Fri Mar 27 21:42:35 2015
@@ -203,7 +203,7 @@ public class Http extends HttpBase {
     HostConfiguration hostConf = client.getHostConfiguration();
     ArrayList<Header> headers = new ArrayList<Header>();
     // Set the User Agent in the header
-    headers.add(new Header("User-Agent", userAgent));
+    //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
     // prefer English
     headers.add(new Header("Accept-Language", acceptLanguage));
     // prefer UTF-8

Modified: 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1669692&r1=1669691&r2=1669692&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Fri Mar 27 21:42:35 2015
@@ -29,6 +29,8 @@ import org.apache.commons.httpclient.coo
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpMethodParams;
 import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
@@ -95,7 +97,9 @@ public class HttpResponse implements Res
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     try {
-      code = Http.getClient().executeMethod(get);
+      HttpClient client = Http.getClient();
+      client.getParams().setParameter("http.useragent", http.getUserAgent()); 
// NUTCH-1941
+      code = client.executeMethod(get);
 
       Header[] heads = get.getResponseHeaders();
 


Reply via email to