Author: lewismc
Date: Wed May 27 23:28:26 2015
New Revision: 1682136
URL: http://svn.apache.org/r1682136
Log:
NUTCH-208 http: proxy exception list:
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682136&r1=1682135&r2=1682136&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 23:28:26 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-208 http: proxy exception list: (Matthias Günter, siren, markus,
lewismc)
+
* NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1682136&r1=1682135&r2=1682136&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 23:28:26 2015
@@ -278,6 +278,13 @@
</property>
<property>
+ <name>http.proxy.exception.list</name>
+ <value></value>
+ <description>A comma separated list of URL's and hosts that don't use the
proxy
+ (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
<name>http.verbose</name>
<value>false</value>
<description>If true, HTTP will log more verbosely.</description>
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Wed May 27 23:28:26 2015
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
+import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
@@ -67,6 +68,9 @@ public abstract class HttpBase implement
/** The proxy port. */
protected int proxyPort = 8080;
+
+ /** The proxy exception list. */
+ protected HashMap proxyException = new HashMap();
/** Indicates if a proxy is used */
protected boolean useProxy = false;
@@ -135,6 +139,7 @@ public abstract class HttpBase implement
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
this.proxyPort = conf.getInt("http.proxy.port", 8080);
+this.proxyException =
arrayToMap(conf.getStrings("http.proxy.exception.list"));
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 1);
this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
@@ -340,7 +345,12 @@ public abstract class HttpBase implement
return proxyPort;
}
- public boolean useProxy() {
+ public boolean useProxy(URL url) {
+if (!useProxy){
+ return false;
+} else if (proxyException.get(url.getHost())!=null){
+ return false;
+}
return useProxy;
}
@@ -434,6 +444,7 @@ public abstract class HttpBase implement
if (logger.isInfoEnabled()) {
logger.info("http.proxy.host = " + proxyHost);
logger.info("http.proxy.port = " + proxyPort);
+ logger.info("http.proxy.exception.list = " + useProxy);
logger.info("http.timeout = " + timeout);
logger.info("http.content.limit = " + maxContent);
logger.info("http.agent = " + userAgent);
@@ -547,4 +558,22 @@ public abstract class HttpBase implement
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}
+
+ /**
+ * Transforming a String[] into a HashMap for faster searching
+ * @param input String[]
+ * @return a new HashMap
+ */
+ private HashMap arrayToMap(String[]input){
+if (input==null ||input.length==0) {
+ return new HashMap();
+}
+HashMap hm=new HashMap();
+for (int i=0;i<input.length;i++){
+ if (!"".equals(input[i].trim())){
+hm.put(input[i],input[i]);
+ }
+}
+return hm;
+ }
}
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: