Author: lewismc
Date: Wed May 27 23:28:26 2015
New Revision: 1682136
URL: http://svn.apache.org/r1682136
Log:
NUTCH-208 http: proxy exception list:
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 23:28:26 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-208 http: proxy exception list: (Matthias Günter, siren, markus, lewismc)
+
* NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 23:28:26 2015
@@ -278,6 +278,13 @@
</property>
<property>
+ <name>http.proxy.exception.list</name>
+ <value></value>
+ <description>A comma separated list of URL's and hosts that don't use the proxy
+ (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
<name>http.verbose</name>
<value>false</value>
<description>If true, HTTP will log more verbosely.</description>
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed May 27 23:28:26 2015
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
+import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
@@ -67,6 +68,9 @@ public abstract class HttpBase implement
/** The proxy port. */
protected int proxyPort = 8080;
+
+ /** The proxy exception list. */
+ protected HashMap proxyException = new HashMap();
/** Indicates if a proxy is used */
protected boolean useProxy = false;
@@ -135,6 +139,7 @@ public abstract class HttpBase implement
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
this.proxyPort = conf.getInt("http.proxy.port", 8080);
+ this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 10000);
this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
@@ -340,7 +345,12 @@ public abstract class HttpBase implement
return proxyPort;
}
- public boolean useProxy() {
+ public boolean useProxy(URL url) {
+ if (!useProxy){
+ return false;
+ } else if (proxyException.get(url.getHost())!=null){
+ return false;
+ }
return useProxy;
}
@@ -434,6 +444,7 @@ public abstract class HttpBase implement
if (logger.isInfoEnabled()) {
logger.info("http.proxy.host = " + proxyHost);
logger.info("http.proxy.port = " + proxyPort);
+ logger.info("http.proxy.exception.list = " + useProxy);
logger.info("http.timeout = " + timeout);
logger.info("http.content.limit = " + maxContent);
logger.info("http.agent = " + userAgent);
@@ -547,4 +558,22 @@ public abstract class HttpBase implement
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}
+
+ /**
+ * Transforming a String[] into a HashMap for faster searching
+ * @param input String[]
+ * @return a new HashMap
+ */
+ private HashMap arrayToMap(String[]input){
+ if (input==null ||input.length==0) {
+ return new HashMap();
+ }
+ HashMap hm=new HashMap();
+ for (int i=0;i<input.length;i++){
+ if (!"".equals(input[i].trim())){
+ hm.put(input[i],input[i]);
+ }
+ }
+ return hm;
+ }
}
Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed May 27 23:28:26 2015
@@ -117,8 +117,8 @@ public class HttpResponse implements Res
socket.setSoTimeout(http.getTimeout());
// connect
- String sockHost = http.useProxy() ? http.getProxyHost() : host;
- int sockPort = http.useProxy() ? http.getProxyPort() : port;
+ String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+ int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
@@ -158,7 +158,7 @@ public class HttpResponse implements Res
OutputStream req = socket.getOutputStream();
StringBuffer reqStr = new StringBuffer("GET ");
- if (http.useProxy()) {
+ if (http.useProxy(url)) {
reqStr.append(url.getProtocol() + "://" + host + portString + path);
} else {
reqStr.append(path);
@@ -329,7 +329,6 @@ public class HttpResponse implements Res
* @throws HttpException
* @throws IOException
*/
- @SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in, StringBuffer line)
throws HttpException, IOException {
boolean doneChunks = false;
Modified: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Wed May 27 23:28:26 2015
@@ -90,8 +90,8 @@ public class HttpResponse implements Res
socket.setSoTimeout(http.getTimeout());
// connect
- String sockHost = http.useProxy() ? http.getProxyHost() : host;
- int sockPort = http.useProxy() ? http.getProxyPort() : port;
+ String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+ int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
@@ -99,7 +99,7 @@ public class HttpResponse implements Res
OutputStream req = socket.getOutputStream();
StringBuffer reqStr = new StringBuffer("GET ");
- if (http.useProxy()) {
+ if (http.useProxy(url)) {
reqStr.append(url.getProtocol() + "://" + host + portString + path);
} else {
reqStr.append(path);