svn commit: r1682090 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/protocol/RobotRulesParser.java

2015-05-27 Thread totaro
Author: totaro
Date: Wed May 27 18:09:37 2015
New Revision: 1682090

URL: http://svn.apache.org/r1682090
Log:
Fix for NUTCH-1995: The result of conf.getStrings("http.robot.rules.whitelist") 
is now checked for null

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682090&r1=1682089&r2=1682090&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 18:09:37 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
+
 * NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel)
 
 * NUTCH-2014 Fetcher hang-up on completion (snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1682090&r1=1682089&r2=1682090&view=diff
==
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed 
May 27 18:09:37 2015
@@ -129,18 +129,22 @@ public abstract class RobotRulesParser i
 }
 
 String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
-
-for (int i = 0; i < confWhiteList.length; i++) {
-  if (confWhiteList[i].isEmpty()) {
- LOG.info("Empty whitelisted URL skipped!");
- continue;
-  }
-  whiteList.add(confWhiteList[i]);
+if (confWhiteList == null) {
+  LOG.info("robots.txt whitelist not configured.");
 }
-
-if (whiteList.size() > 0) {
-  matcher = new SuffixStringMatcher(whiteList);
-  LOG.info("Whitelisted hosts: " + whiteList);
+else {
+  for (int i = 0; i < confWhiteList.length; i++) {
+if (confWhiteList[i].isEmpty()) {
+ LOG.info("Empty whitelisted URL skipped!");
+ continue;
+}
+whiteList.add(confWhiteList[i]);
+  }
+  
+  if (whiteList.size() > 0) {
+matcher = new SuffixStringMatcher(whiteList);
+LOG.info("Whitelisted hosts: " + whiteList);
+  }
 }
   }
 




svn commit: r1682103 - in /nutch/trunk: CHANGES.txt src/bin/nutch

2015-05-27 Thread snagel
Author: snagel
Date: Wed May 27 19:31:51 2015
New Revision: 1682103

URL: http://svn.apache.org/r1682103
Log:
NUTCH-2007 add test libs to classpath of bin/nutch junit

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682103&r1=1682102&r2=1682103&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 19:31:51 2015
@@ -2,7 +2,9 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
-* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
+* NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
+
+* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)
 
 * NUTCH-2013 Fetcher: missing logs fetching ... on stdout (snagel)
 

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1682103&r1=1682102&r2=1682103&view=diff
==
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed May 27 19:31:51 2015
@@ -270,6 +270,11 @@ elif [ $COMMAND = plugin ] ; then
   CLASS=org.apache.nutch.plugin.PluginRepository
 elif [ $COMMAND = junit ] ; then
   CLASSPATH=$CLASSPATH:$NUTCH_HOME/test/classes/
+  if $local; then
+for f in $NUTCH_HOME/test/lib/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+  fi
   CLASS=org.junit.runner.JUnitCore
 elif [ $COMMAND = startserver ] ; then
   CLASS=org.apache.nutch.service.NutchServer




svn commit: r1682136 - in /nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-

2015-05-27 Thread lewismc
Author: lewismc
Date: Wed May 27 23:28:26 2015
New Revision: 1682136

URL: http://svn.apache.org/r1682136
Log:
NUTCH-208 http: proxy exception list:

Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml

nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682136&r1=1682135&r2=1682136&view=diff
==
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 23:28:26 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-208 http: proxy exception list: (Matthias Günter, siren, markus, lewismc)
+
 * NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
 
 * NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1682136&r1=1682135&r2=1682136&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 23:28:26 2015
@@ -278,6 +278,13 @@
 </property>
 
 <property>
+  <name>http.proxy.exception.list</name>
+  <value></value>
+  <description>A comma separated list of URL's and hosts that don't use the proxy 
+  (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
   <name>http.verbose</name>
   <value>false</value>
   <description>If true, HTTP will log more verbosely.</description>

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Wed May 27 23:28:26 2015
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.net.URL;
+import java.util.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -67,6 +68,9 @@ public abstract class HttpBase implement
 
   /** The proxy port. */
   protected int proxyPort = 8080;
+  
+  /** The proxy exception list. */
+  protected HashMap proxyException = new HashMap(); 
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -135,6 +139,7 @@ public abstract class HttpBase implement
 this.conf = conf;
 this.proxyHost = conf.get("http.proxy.host");
 this.proxyPort = conf.getInt("http.proxy.port", 8080);
+this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
 this.useProxy = (proxyHost != null && proxyHost.length() > 0);
 this.timeout = conf.getInt("http.timeout", 1);
 this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
@@ -340,7 +345,12 @@ public abstract class HttpBase implement
 return proxyPort;
   }
 
-  public boolean useProxy() {
+  public boolean useProxy(URL url) {
+if (!useProxy){
+  return false;
+} else if (proxyException.get(url.getHost())!=null){
+  return false;
+}
 return useProxy;
   }
 
@@ -434,6 +444,7 @@ public abstract class HttpBase implement
 if (logger.isInfoEnabled()) {
   logger.info("http.proxy.host = " + proxyHost);
   logger.info("http.proxy.port = " + proxyPort);
+  logger.info("http.proxy.exception.list = " + useProxy);
   logger.info("http.timeout = " + timeout);
   logger.info("http.content.limit = " + maxContent);
   logger.info("http.agent = " + userAgent);
@@ -547,4 +558,22 @@ public abstract class HttpBase implement
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
 return robots.getRobotRulesSet(this, url);
   }
+  
+  /**
+   * Transforming a String[] into a HashMap for faster searching
+   * @param input String[]
+   * @return a new HashMap
+   */
+  private HashMap arrayToMap(String[]input){
+if (input==null ||input.length==0) {
+  return new HashMap();
+}
+HashMap hm=new HashMap();
+for (int i=0;i<input.length;i++){
+  if (!"".equals(input[i].trim())){
+hm.put(input[i],input[i]);
+  }
+}
+return hm;
+  }
 }

Modified: 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: