[16/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

thammegowda Sat, 16 Jul 2016 12:48:41 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
new file mode 100644
index 0000000..9f616fe
--- /dev/null
+++ 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -0,0 +1,587 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+// Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
+import org.apache.hadoop.util.StringUtils;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
+public abstract class HttpBase implements Protocol {
+
+  public static final Text RESPONSE_TIME = new Text("_rs_");
+
+  public static final int BUFFER_SIZE = 8 * 1024;
+
+  private static final byte[] EMPTY_CONTENT = new byte[0];
+
+  private HttpRobotRulesParser robots = null;
+
+  private ArrayList<String> userAgentNames = null;
+
+  /** The proxy hostname. */
+  protected String proxyHost = null;
+
+  /** The proxy port. */
+  protected int proxyPort = 8080;
+  
+  /** The proxy exception list. */
+  protected HashMap proxyException = new HashMap(); 
+
+  /** Indicates if a proxy is used */
+  protected boolean useProxy = false;
+
+  /** The network timeout in millisecond */
+  protected int timeout = 10000;
+
+  /** The length limit for downloaded content, in bytes. */
+  protected int maxContent = 64 * 1024;
+
+  /** The Nutch 'User-Agent' request header */
+  protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+      "http://nutch.apache.org/bot.html";, "[email protected]");
+
+  /** The "Accept-Language" request header value. */
+  protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
+
+  /** The "Accept" request header value. */
+  protected String accept =
+      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+  /** The default logger */
+  private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
+
+  /** The specified logger */
+  private Logger logger = LOGGER;
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /** Do we use HTTP/1.1? */
+  protected boolean useHttp11 = false;
+
+  /**
+   * Record response time in CrawlDatum's meta data, see property
+   * http.store.responsetime.
+   */
+  protected boolean responseTime = true;
+
+  /** Skip page if Crawl-Delay longer than this value. */
+  protected long maxCrawlDelay = -1L;
+
+  /** Which TLS/SSL protocols to support */
+  protected Set<String> tlsPreferredProtocols;
+
+  /** Which TLS/SSL cipher suites to support */
+  protected Set<String> tlsPreferredCipherSuites;
+  
+  /** Configuration directive for If-Modified-Since HTTP header */
+  public boolean enableIfModifiedsinceHeader = true;
+
+  /**
+   * Creates a new instance of HttpBase that keeps the class-level default
+   * logger, by delegating to the logger-taking constructor with {@code null}.
+   */
+  public HttpBase() {
+    this(null);
+  }
+
+  /**
+   * Creates a new instance of HttpBase with its own robots.txt rules parser.
+   *
+   * @param logger
+   *          logger to use for this instance; when {@code null} the default
+   *          class logger stays in place
+   */
+  public HttpBase(Logger logger) {
+    this.robots = new HttpRobotRulesParser();
+    if (logger == null) {
+      return;
+    }
+    this.logger = logger;
+  }
+
+  /**
+   * Applies the given configuration to this protocol instance: proxy
+   * settings, timeout, content limit, request headers, optional user-agent
+   * rotation (NUTCH-1941) and the preferred TLS protocols and cipher suites.
+   * The configuration is also handed to the robots.txt parser, and the
+   * resulting settings are logged via {@link #logConf()}.
+   *
+   * @param conf
+   *          the configuration to read the {@code http.*} properties from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.proxyHost = conf.get("http.proxy.host");
+    this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.proxyException = arrayToMap(
+        conf.getStrings("http.proxy.exception.list"));
+    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+    this.timeout = conf.getInt("http.timeout", 10000);
+    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    this.userAgent = getAgentString(conf.get("http.agent.name"),
+        conf.get("http.agent.version"), conf.get("http.agent.description"),
+        conf.get("http.agent.url"), conf.get("http.agent.email"));
+    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
+    this.accept = conf.get("http.accept", accept);
+    // backward-compatible default setting
+    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+    this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.enableIfModifiedsinceHeader = conf.getBoolean(
+        "http.enable.if.modified.since.header", true);
+    this.robots.setConf(conf);
+
+    // NUTCH-1941: read list of alternating agent names.
+    // The list is built in a local variable and published to the field only
+    // once it is complete, so a half-filled list is never visible through
+    // getUserAgent(), and a list left over from an earlier configuration is
+    // cleared when rotation is switched off.
+    ArrayList<String> rotatingAgents = null;
+    if (conf.getBoolean("http.agent.rotate", false)) {
+      String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+      BufferedReader br = null;
+      try {
+        // a missing resource surfaces as an exception here and is handled
+        // by the catch block below
+        Reader reader = conf.getConfResourceAsReader(agentsFile);
+        br = new BufferedReader(reader);
+        rotatingAgents = new ArrayList<String>();
+        String word;
+        while ((word = br.readLine()) != null) {
+          String trimmed = word.trim();
+          if (!trimmed.isEmpty()) {
+            rotatingAgents.add(trimmed);
+          }
+        }
+
+        if (rotatingAgents.isEmpty()) {
+          logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+              agentsFile);
+          rotatingAgents = null;
+        }
+
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+            StringUtils.stringifyException(e));
+        rotatingAgents = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore: the agent names have already been read (or reading
+            // failed and was reported above)
+          }
+        }
+      }
+      if (rotatingAgents == null) {
+        logger.warn(
+            "Falling back to fixed user agent set via property http.agent.name");
+      }
+    }
+    userAgentNames = rotatingAgents;
+
+    String[] protocols = conf.getStrings("http.tls.supported.protocols",
+        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+        "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+        "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+        "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+        "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+        "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+        "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+        "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+        "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+        "TLS_KRB5_WITH_DES_CBC_MD5");
+
+    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+
+    logConf();
+  }
+
+  /**
+   * Returns the configuration most recently passed to
+   * {@link #setConf(Configuration)}, or {@code null} if none was set yet.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Fetches the given URL once (without following redirects) and maps the
+   * HTTP status code of the response to a {@link ProtocolOutput}.
+   * <p>
+   * The HTTP status code is always stored in the datum's metadata; the
+   * elapsed request time is stored as well when response-time recording is
+   * enabled. Any failure while building the URL or fetching is logged and
+   * reported as a {@link ProtocolStatus} wrapping the throwable, with no
+   * content.
+   *
+   * @param url
+   *          the URL to fetch
+   * @param datum
+   *          crawl datum passed on to the request and updated with metadata
+   * @return the fetched content together with the derived protocol status
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+
+    final String urlString = url.toString();
+    try {
+      URL target = new URL(urlString);
+
+      final long requestStart = System.currentTimeMillis();
+      final Response response = getResponse(target, datum, false);
+
+      if (this.responseTime) {
+        long elapsedMillis = System.currentTimeMillis() - requestStart;
+        datum.getMetaData().put(RESPONSE_TIME,
+            new IntWritable((int) elapsedMillis));
+      }
+
+      final int code = response.getCode();
+      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+          new Text(Integer.toString(code)));
+
+      // a missing body is represented by an empty byte array
+      byte[] body = response.getContent();
+      if (body == null) {
+        body = EMPTY_CONTENT;
+      }
+      final Content c = new Content(target.toString(), target.toString(),
+          body, response.getHeader("Content-Type"), response.getHeaders(),
+          this.conf);
+
+      if (code == 200) {
+        // got a good response, return it
+        return new ProtocolOutput(c);
+      }
+
+      if (code >= 300 && code < 400) {
+        // redirect: resolve the target, the higher layer decides what to do
+        String location = response.getHeader("Location");
+        if (location == null) {
+          // some broken servers, such as MS IIS, use lowercase header name...
+          location = response.getHeader("location");
+        }
+        if (location == null) {
+          location = "";
+        }
+        target = new URL(target, location);
+
+        final int protocolStatusCode;
+        switch (code) {
+        case 302: // found (temporarily moved)
+        case 303: // see other (redirect after POST)
+        case 307: // temporary redirect
+          protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+          break;
+        case 304: // not modified
+          protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+          break;
+        default:
+          // 300 (multiple choices), 301 (moved permanently), 305 (use proxy)
+          // and every other 3xx code
+          protocolStatusCode = ProtocolStatus.MOVED;
+          break;
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode,
+            target));
+      }
+
+      switch (code) {
+      case 400: // bad request, mark as GONE
+        if (logger.isTraceEnabled()) {
+          logger.trace("400 Bad request: " + target);
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
+            target));
+      case 401: // requires authorization, but no valid auth provided
+        if (logger.isTraceEnabled()) {
+          logger.trace("401 Authentication Required");
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                + urlString));
+      case 404:
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.NOTFOUND, target));
+      case 410: // permanently GONE
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
+            "Http: " + code + " url=" + target));
+      default:
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + target));
+      }
+    } catch (Throwable e) {
+      logger.error("Failed to get protocol output", e);
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /*
+   * -------------------------- * </implementation:Protocol> *
+   * --------------------------
+   */
+
+  /**
+   * Returns the proxy hostname read from {@code http.proxy.host}, or
+   * {@code null} if none is configured.
+   */
+  public String getProxyHost() {
+    return proxyHost;
+  }
+
+  /**
+   * Returns the proxy port read from {@code http.proxy.port} (8080 unless
+   * configured otherwise).
+   */
+  public int getProxyPort() {
+    return proxyPort;
+  }
+
+  /**
+   * Tells whether requests to the given URL should go through the proxy.
+   *
+   * @param url
+   *          the URL about to be requested
+   * @return {@code true} only if a proxy is configured and the URL's host is
+   *         not a key of the proxy exception list
+   */
+  public boolean useProxy(URL url) {
+    // short-circuit: the URL is only inspected when a proxy is configured
+    return useProxy && proxyException.get(url.getHost()) == null;
+  }
+
+  /**
+   * Returns the network timeout in milliseconds, as read from
+   * {@code http.timeout}.
+   */
+  public int getTimeout() {
+    return timeout;
+  }
+  
+  /**
+   * Returns the value of the If-Modified-Since configuration directive
+   * ({@code http.enable.if.modified.since.header}).
+   */
+  public boolean isIfModifiedSinceEnabled() {
+    return enableIfModifiedsinceHeader;
+  }
+
+  /**
+   * Returns the length limit for downloaded content in bytes, as read from
+   * {@code http.content.limit}. The decompression helpers of this class pass
+   * no limit at all when this value is negative.
+   */
+  public int getMaxContent() {
+    return maxContent;
+  }
+
+  /**
+   * Returns the "User-Agent" request header value to use for the next
+   * request.
+   * <p>
+   * When a list of rotating agent names is loaded (NUTCH-1941), one entry is
+   * picked uniformly at random on every call; otherwise the fixed agent
+   * string is returned.
+   *
+   * @return a randomly chosen rotating agent name, or the fixed user agent
+   */
+  public String getUserAgent() {
+    // read the field once so the null/empty check and the lookup see the
+    // same list
+    ArrayList<String> agents = userAgentNames;
+    if (agents != null && !agents.isEmpty()) {
+      // nextInt's bound is exclusive: size() covers every index, whereas
+      // size() - 1 would never return the last entry and is an illegal
+      // (non-positive) bound for a single-entry list
+      return agents.get(ThreadLocalRandom.current().nextInt(agents.size()));
+    }
+    return userAgent;
+  }
+
+  /**
+   * Value of the "Accept-Language" request header sent by Nutch, as read from
+   * {@code http.accept.language}.
+   *
+   * @return the value of the "Accept-Language" header
+   */
+  public String getAcceptLanguage() {
+    return acceptLanguage;
+  }
+
+  /**
+   * Value of the "Accept" request header, as read from {@code http.accept}.
+   *
+   * @return the value of the "Accept" header
+   */
+  public String getAccept() {
+    return accept;
+  }
+
+  /**
+   * Tells whether HTTP/1.1 is to be used, as read from
+   * {@code http.useHttp11} (default {@code false}).
+   */
+  public boolean getUseHttp11() {
+    return useHttp11;
+  }
+
+  /**
+   * Returns the TLS/SSL cipher suite names read from
+   * {@code http.tls.supported.cipher.suites}; {@code null} until
+   * {@link #setConf(Configuration)} has been called.
+   */
+  public Set<String> getTlsPreferredCipherSuites() {
+    return tlsPreferredCipherSuites;
+  }
+
+  /**
+   * Returns the TLS/SSL protocol names read from
+   * {@code http.tls.supported.protocols}; {@code null} until
+   * {@link #setConf(Configuration)} has been called.
+   */
+  public Set<String> getTlsPreferredProtocols() {
+    return tlsPreferredProtocols;
+  }
+
+  /**
+   * Builds the User-Agent string in the form
+   * {@code name[/version] [(description; url; email)]}.
+   * <p>
+   * Only non-empty detail parts are listed inside the parentheses and the
+   * "; " separator is written strictly between two listed parts, so an empty
+   * or missing part never leaves a dangling separator behind. A missing agent
+   * name is only reported as an error (see NUTCH-258); the string is still
+   * built.
+   *
+   * @param agentName
+   *          agent name ({@code http.agent.name})
+   * @param agentVersion
+   *          agent version, appended after a "/" unless {@code null}
+   * @param agentDesc
+   *          agent description, may be {@code null} or empty
+   * @param agentURL
+   *          agent URL, may be {@code null} or empty
+   * @param agentEmail
+   *          agent contact e-mail, may be {@code null} or empty
+   * @return the assembled User-Agent string
+   */
+  private static String getAgentString(String agentName, String agentVersion,
+      String agentDesc, String agentURL, String agentEmail) {
+
+    if ((agentName == null) || (agentName.trim().length() == 0)) {
+      // TODO : NUTCH-258
+      if (LOGGER.isErrorEnabled()) {
+        LOGGER.error("No User-Agent string set (http.agent.name)!");
+      }
+    }
+
+    StringBuilder buf = new StringBuilder();
+
+    buf.append(agentName);
+    if (agentVersion != null) {
+      buf.append("/");
+      buf.append(agentVersion);
+    }
+
+    // append the non-empty details, opening the parenthesis before the first
+    // one and separating the following ones with "; "
+    boolean anyDetail = false;
+    for (String detail : new String[] { agentDesc, agentURL, agentEmail }) {
+      if (detail == null || detail.length() == 0) {
+        continue;
+      }
+      buf.append(anyDetail ? "; " : " (");
+      buf.append(detail);
+      anyDetail = true;
+    }
+    if (anyDetail) {
+      buf.append(")");
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Logs the effective proxy, timeout, content-limit and request-header
+   * settings at INFO level. Called at the end of setConf.
+   */
+  protected void logConf() {
+    if (logger.isInfoEnabled()) {
+      logger.info("http.proxy.host = " + proxyHost);
+      logger.info("http.proxy.port = " + proxyPort);
+      // log the configured exception hosts themselves (previously the
+      // useProxy flag was printed under this property name)
+      Object proxyExceptions = (proxyException == null) ? null
+          : proxyException.keySet();
+      logger.info("http.proxy.exception.list = " + proxyExceptions);
+      logger.info("http.timeout = " + timeout);
+      logger.info("http.content.limit = " + maxContent);
+      logger.info("http.agent = " + userAgent);
+      logger.info("http.accept.language = " + acceptLanguage);
+      logger.info("http.accept = " + accept);
+    }
+  }
+
+  /**
+   * Decompresses gzip-encoded content on a best-effort basis.
+   *
+   * @param compressed
+   *          the gzip-encoded bytes; an empty array is returned unchanged
+   * @param url
+   *          the URL the content came from, used for trace logging only
+   * @return the decompressed bytes
+   * @throws IOException
+   *           if the decompression helper returns {@code null}
+   */
+  public byte[] processGzipEncoded(byte[] compressed, URL url)
+      throws IOException {
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("uncompressing....");
+    }
+
+    // content can be empty (i.e. redirection) in which case
+    // there is nothing to unzip
+    if (compressed.length == 0) {
+      return compressed;
+    }
+
+    // a negative limit means: decompress without passing a size limit
+    final int limit = getMaxContent();
+    final byte[] content = (limit >= 0)
+        ? GZIPUtils.unzipBestEffort(compressed, limit)
+        : GZIPUtils.unzipBestEffort(compressed);
+
+    if (content == null) {
+      throw new IOException("unzipBestEffort returned null");
+    }
+
+    if (LOGGER.isTraceEnabled()) {
+      String summary = "fetched " + compressed.length
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url;
+      LOGGER.trace(summary);
+    }
+    return content;
+  }
+
+  /**
+   * Inflates deflate-encoded content on a best-effort basis.
+   *
+   * @param compressed
+   *          the deflate-encoded bytes; an empty array is returned unchanged
+   * @param url
+   *          the URL the content came from, used for trace logging only
+   * @return the inflated bytes
+   * @throws IOException
+   *           if the inflation helper returns {@code null}
+   */
+  public byte[] processDeflateEncoded(byte[] compressed, URL url)
+      throws IOException {
+
+    // content can be empty (i.e. redirection) in which case
+    // there is nothing to deflate
+    if (compressed.length == 0) {
+      return compressed;
+    }
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("inflating....");
+    }
+
+    // a negative limit means: inflate without passing a size limit
+    final int limit = getMaxContent();
+    final byte[] content = (limit >= 0)
+        ? DeflateUtils.inflateBestEffort(compressed, limit)
+        : DeflateUtils.inflateBestEffort(compressed);
+
+    if (content == null) {
+      throw new IOException("inflateBestEffort returned null");
+    }
+
+    if (LOGGER.isTraceEnabled()) {
+      String summary = "fetched " + compressed.length
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url;
+      LOGGER.trace(summary);
+    }
+    return content;
+  }
+
+  /**
+   * Command-line helper for concrete protocol implementations: fetches one
+   * URL with the given instance and prints status, content type, content
+   * length and content to standard output.
+   * <p>
+   * Usage: {@code Http [-verbose] [-timeout N] url}, where N is the timeout
+   * in seconds and the URL must be the last argument. On malformed arguments
+   * (no arguments, {@code -timeout} without a value, or no URL) the usage
+   * line is printed to standard error and the JVM exits with status -1.
+   *
+   * @param http
+   *          the protocol instance to fetch with
+   * @param args
+   *          the command-line arguments
+   * @throws Exception
+   *           if the timeout value is not a number
+   */
+  protected static void main(HttpBase http, String[] args) throws Exception {
+    // parsed for command-line compatibility; currently has no effect
+    boolean verbose = false;
+    String url = null;
+
+    String usage = "Usage: Http [-verbose] [-timeout N] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) { // parse command line
+      if (args[i].equals("-timeout")) { // found -timeout option
+        if (i + 1 >= args.length) {
+          // -timeout was given without its value
+          System.err.println(usage);
+          System.exit(-1);
+        }
+        http.timeout = Integer.parseInt(args[++i]) * 1000;
+      } else if (args[i].equals("-verbose")) { // found -verbose option
+        verbose = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        // root is required parameter
+        url = args[i];
+      }
+    }
+
+    if (url == null) {
+      // only options were given, there is nothing to fetch
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    // if (verbose) {
+    // LOGGER.setLevel(Level.FINE);
+    // }
+
+    ProtocolOutput out = http
+        .getProtocolOutput(new Text(url), new CrawlDatum());
+    Content content = out.getContent();
+
+    System.out.println("Status: " + out.getStatus());
+    if (content != null) {
+      System.out.println("Content Type: " + content.getContentType());
+      System.out.println("Content Length: "
+          + content.getMetadata().get(Response.CONTENT_LENGTH));
+      System.out.println("Content:");
+      String text = new String(content.getContent());
+      System.out.println(text);
+    }
+  }
+
+  /**
+   * Performs the request for the given URL; to be implemented by subclasses.
+   * Within this package it is called with {@code followRedirects == false}
+   * for page fetches and with {@code true} when fetching robots.txt.
+   *
+   * @param url
+   *          the URL to request
+   * @param datum
+   *          the crawl datum associated with the request
+   * @param followRedirects
+   *          redirect-handling flag; NOTE(review): the exact semantics are
+   *          defined by the implementing subclass - confirm there
+   * @return the response to the request
+   * @throws ProtocolException
+   *           declared for implementations to signal protocol-level failures
+   * @throws IOException
+   *           declared for implementations to signal I/O failures
+   */
+  protected abstract Response getResponse(URL url, CrawlDatum datum,
+      boolean followRedirects) throws ProtocolException, IOException;
+
+  /**
+   * Returns the robots.txt rules for the given URL by delegating to this
+   * instance's robots.txt rules parser.
+   *
+   * @param url
+   *          the URL to look up the rules for
+   * @param datum
+   *          not used by this implementation
+   * @return the rules as returned by the parser
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return robots.getRobotRulesSet(this, url);
+  }
+  
+  /**
+   * Transforms a String[] into a HashMap for faster searching. Every entry is
+   * trimmed and stored as both key and value under its trimmed form, so
+   * whitespace around a configured entry does not prevent a later lookup by
+   * host name. {@code null} and blank entries are skipped.
+   *
+   * @param input
+   *          the entries to index, may be {@code null} or empty
+   * @return a new HashMap, never {@code null}
+   */
+  private HashMap arrayToMap(String[] input) {
+    HashMap<String, String> hm = new HashMap<String, String>();
+    if (input == null) {
+      return hm;
+    }
+    for (String entry : input) {
+      if (entry == null) {
+        continue;
+      }
+      String host = entry.trim();
+      if (!host.isEmpty()) {
+        hm.put(host, host);
+      }
+    }
+    return hm;
+  }
+}


http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
new file mode 100644
index 0000000..ff7ef5b
--- /dev/null
+++ 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// Nutch imports
+import org.apache.nutch.protocol.ProtocolException;
+
+/**
+ * Exception type of the HTTP protocol API package. It extends
+ * {@link ProtocolException} without adding any state; every constructor
+ * simply delegates to the matching superclass constructor.
+ */
+public class HttpException extends ProtocolException {
+
+  /** Creates an exception without message or cause. */
+  public HttpException() {
+    super();
+  }
+
+  /** Creates an exception with the given message. */
+  public HttpException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception with the given message and cause. */
+  public HttpException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /** Creates an exception wrapping the given cause. */
+  public HttpException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
new file mode 100644
index 0000000..185ca15
--- /dev/null
+++ 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpRobotRulesParser.class);
+  protected boolean allowForbidden = false;
+
+  /**
+   * Package-private constructor that applies no configuration; the owner is
+   * expected to call {@link #setConf(Configuration)} afterwards.
+   */
+  HttpRobotRulesParser() {
+  }
+
+  /**
+   * Creates a parser and immediately applies the given configuration via
+   * {@link #setConf(Configuration)}.
+   *
+   * @param conf
+   *          the configuration to apply
+   */
+  public HttpRobotRulesParser(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Applies the configuration to the superclass and reads
+   * {@code http.robots.403.allow} (default {@code true}) into
+   * {@code allowForbidden}. Note that the field itself is initialized to
+   * {@code false}, so the default only takes effect once this method ran.
+   *
+   * @param conf
+   *          the configuration to read from
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+  }
+
+  /** Compose unique key to store and access robot rules in cache for given 
URL */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+                                                       // case
+    String host = url.getHost().toLowerCase(); // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+    /*
+     * Robot rules apply only to host, protocol, and port where robots.txt is
+     * hosted (cf. NUTCH-1752). Consequently
+     */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
+  /**
+   * Get the rules from robots.txt which apply to the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, an HTTP request is sent to fetch
+   * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
+   * the rules are cached to avoid re-fetching and re-parsing it again.
+   * <p>
+   * Outcome by case, on a cache miss: a whitelisted host gets the empty rule
+   * set without any request; a 200 response is parsed; a 403 response yields
+   * the forbid-all rule set unless {@code allowForbidden} is set; every other
+   * response yields the empty rule set. For status codes of 500 and above, and
+   * when fetching or parsing throws, the empty rule set is returned but not
+   * cached. One 301/302 redirect is followed explicitly; if it leads to a
+   * different host, the rules are cached under that host's key as well.
+   * 
+   * @param http
+   *          The {@link Protocol} object; it is cast to {@link HttpBase} to
+   *          issue the request
+   * @param url
+   *          URL robots.txt applies to
+   * 
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+    }
+
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = CACHE.get(cacheKey);
+
+    if (robotRules != null) {
+      return robotRules; // cached rule
+    } else if (LOG.isTraceEnabled()) {
+      LOG.trace("cache miss " + url);
+    }
+
+    boolean cacheRule = true;
+    URL redir = null;
+
+    if (isWhiteListed(url)) {
+      // check in advance whether a host is whitelisted
+      // (we do not need to fetch robots.txt)
+      robotRules = EMPTY_RULES;
+      LOG.info("Whitelisted host found for: {}", url);
+      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+          url.getHost());
+
+    } else {
+      try {
+        Response response = ((HttpBase) http).getResponse(new URL(url,
+            "/robots.txt"), new CrawlDatum(), true);
+        // try one level of redirection ?
+        if (response.getCode() == 301 || response.getCode() == 302) {
+          String redirection = response.getHeader("Location");
+          if (redirection == null) {
+            // some versions of MS IIS are known to mangle this header
+            redirection = response.getHeader("location");
+          }
+          if (redirection != null) {
+            if (!redirection.startsWith("http")) {
+              // RFC says it should be absolute, but apparently it isn't
+              redir = new URL(url, redirection);
+            } else {
+              redir = new URL(redirection);
+            }
+
+            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
+                true);
+          }
+        }
+
+        if (response.getCode() == 200) // found rules: parse them
+          robotRules = parseRules(url.toString(), response.getContent(),
+              response.getHeader("Content-Type"), agentNames);
+
+        else if ((response.getCode() == 403) && (!allowForbidden))
+          robotRules = FORBID_ALL_RULES; // use forbid all
+        else if (response.getCode() >= 500) {
+          cacheRule = false; // try again later to fetch robots.txt
+          robotRules = EMPTY_RULES;
+        } else
+          robotRules = EMPTY_RULES; // use default rules
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false; // try again later to fetch robots.txt
+        robotRules = EMPTY_RULES;
+      }
+    }
+
+    if (cacheRule) {
+      CACHE.put(cacheKey, robotRules); // cache rules for host
+      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+        // cache also for the redirected host
+        CACHE.put(getCacheKey(redir), robotRules);
+      }
+    }
+
+    return robotRules;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
new file mode 100644
index 0000000..972bb3c
--- /dev/null
+++ 
b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http},
+{@link org.apache.nutch.protocol.httpclient httpclient})</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 
b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
new file mode 100644
index 0000000..23e4ef6
--- /dev/null
+++ 
b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ * 
+ */
+public class TestRobotRulesParser {
+
+  private static final String CONTENT_TYPE = "text/plain";
+  private static final String SINGLE_AGENT = "Agent1";
+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+  private static final String UNKNOWN_AGENT = "AgentABC";
+  private static final String CR = "\r";
+
+  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+      + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+      + CR
+      + "Crawl-delay: 10"
+      + CR // set crawl delay for Agent1 as 10 sec
+      + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+      + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+      + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+                                                                          // crawl
+                                                                          // delay
+                                                                          // for
+                                                                          // other
+                                                                          // agents
+
+  private static final String[] TEST_PATHS = new String[] {
+      "http://example.com/a", "http://example.com/a/bloh/foo.html",
+      "http://example.com/b", "http://example.com/c",
+      "http://example.com/b/a/index.html",
+      "http://example.com/foo/bar/baz.html" };
+
+  private static final boolean[] RESULTS = new boolean[] { false, // /a
+      false, // /a/bloh/foo.html
+      true, // /b
+      true, // /c
+      false, // /b/a/index.html
+      true // /foo/bar/baz.html
+  };
+
+  private HttpRobotRulesParser parser;
+  private BaseRobotRules rules;
+
+  public TestRobotRulesParser() {
+    parser = new HttpRobotRulesParser();
+  }
+
+  /**
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
+   */
+  @Test
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    }
+
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, MULTIPLE_AGENTS);
+
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    }
+  }
+
+  /**
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
+   */
+  @Test
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+        (rules.getCrawlDelay() == 10000));
+
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, UNKNOWN_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/build.xml 
b/nutch-plugins/lib-nekohtml/build.xml
new file mode 100644
index 0000000..4bca1af
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-nekohtml" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+  <target name="compile" depends="init, resolve-default"/>
+
+  <target name="jar" depends="compile"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/ivy.xml 
b/nutch-plugins/lib-nekohtml/ivy.xml
new file mode 100644
index 0000000..ed70b80
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/plugin.xml 
b/nutch-plugins/lib-nekohtml/plugin.xml
new file mode 100644
index 0000000..513c9a7
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! NekoHTML is a simple HTML scanner and tag balancer that enables 
+ ! application programmers to parse HTML documents and access the 
+ ! information using standard XML interfaces.
+ ! (http://sourceforge.net/projects/nekohtml/)
+ ! 
+ ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
+ !-->
+<plugin
+   id="lib-nekohtml"
+   name="CyberNeko HTML Parser"
+   version="1.9.19"
+   provider-name="net.sourceforge.nekohtml">
+
+   <runtime>
+     <library name="nekohtml-1.9.19.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/pom.xml 
b/nutch-plugins/lib-nekohtml/pom.xml
new file mode 100644
index 0000000..df544bb
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-nekohtml</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-nekohtml</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>net.sourceforge.nekohtml</groupId>
+            <artifactId>nekohtml</artifactId>
+            <version>1.9.22</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/build.xml 
b/nutch-plugins/lib-regex-filter/build.xml
new file mode 100644
index 0000000..9702ca2
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-regex-filter" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/ivy.xml 
b/nutch-plugins/lib-regex-filter/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/plugin.xml 
b/nutch-plugins/lib-regex-filter/plugin.xml
new file mode 100644
index 0000000..42de8f1
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/plugin.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for RegExp based URL filters
+ !-->
+<plugin
+   id="lib-regex-filter"
+   name="Regex URL Filter Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-regex-filter.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/pom.xml 
b/nutch-plugins/lib-regex-filter/pom.xml
new file mode 100644
index 0000000..1074ad7
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/pom.xml
@@ -0,0 +1,54 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-regex-filter</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-regex-filter</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>2.6</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
new file mode 100644
index 0000000..e408586
--- /dev/null
+++ 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+/**
+ * A generic regular expression rule.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexRule {
+
+  private final boolean sign;
+  
+  private final String hostOrDomain;
+  
+  private final String regex;
+
+  /**
+   * Constructs a new regular expression rule.
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this(sign, regex, null);
+  }
+  
+  /**
+   * Constructs a new regular expression rule.
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
+   * @param hostOrDomain
+   *          the host or domain to which this regex belongs
+   */
+  protected RegexRule(boolean sign, String regex, String hostOrDomain) {
+    this.sign = sign;
+    this.hostOrDomain = hostOrDomain;
+    this.regex = regex;
+  }
+
+  /**
+   * Return if this rule is used for filtering-in or out.
+   * 
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() {
+    return sign;
+  }
+
+  /**
+   * Return the host or domain to which this rule belongs, if any.
+   *
+   * @return host or domain this regex rule belongs to
+   */
+  protected String hostOrDomain() { return hostOrDomain; }
+  
+  /**
+   * Return this rule's regular expression.
+   *
+   * @return this regex
+   */
+  protected String regex() { return regex; }
+
+  /**
+   * Checks if a url matches this rule.
+   * 
+   * @param url
+   *          is the url to check.
+   * @return <code>true</code> if the specified url matches this rule, otherwise
+   *         <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
new file mode 100644
index 0000000..f5cc081
--- /dev/null
+++ 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.File;
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.ArrayList;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ * 
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * determined for each implementation using the
+ * {@link #getRulesReader(Configuration conf)} method.
+ * </p>
+ * 
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>) means go ahead and index it and minus
+ * (<code>-</code>) means no.
+ * </p>
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+  /** My logger */
+  private final static Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBase.class);
+
+  /** The list of applicable rules */
+  private List<RegexRule> rules;
+
+  /** The current configuration */
+  private Configuration conf;
+
+  /**
+   * Constructs a new empty RegexURLFilterBase
+   */
+  public RegexURLFilterBase() {
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and init it with a file of rules.
+   * 
+   * @param filename
+   *          is the name of rules file.
+   */
+  public RegexURLFilterBase(File filename) throws IOException,
+      IllegalArgumentException {
+    this(new FileReader(filename));
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and inits it with a list of rules.
+   * 
+   * @param rules
+   *          string with a list of rules, one rule per line
+   * @throws IOException
+   * @throws IllegalArgumentException
+   */
+  public RegexURLFilterBase(String rules) throws IOException,
+      IllegalArgumentException {
+    this(new StringReader(rules));
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and init it with a Reader of rules.
+   * 
+   * @param reader
+   *          is a reader of rules.
+   */
+  protected RegexURLFilterBase(Reader reader) throws IOException,
+      IllegalArgumentException {
+    rules = readRules(reader);
+  }
+
+  /**
+   * Creates a new {@link RegexRule}.
+   * 
+   * @param sign
+   *          of the regular expression. A <code>true</code> value means that
+   *          any URL matching this rule must be included, whereas a
+   *          <code>false</code> value means that any URL matching this rule
+   *          must be excluded.
+   * @param regex
+   *          is the regular expression associated to this rule.
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex);
+  
+  /**
+   * Creates a new {@link RegexRule}.
+   * @param 
+   *        sign of the regular expression.
+   *        A <code>true</code> value means that any URL matching this rule
+   *        must be included, whereas a <code>false</code>
+   *        value means that any URL matching this rule must be excluded.
+   * @param regex
+   *        is the regular expression associated to this rule.
+   * @param hostOrDomain
+   *        the host or domain to which this regex belongs
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
+
+  /**
+   * Returns the name of the file of rules to use for a particular
+   * implementation.
+   * 
+   * @param conf
+   *          is the current configuration.
+   * @return the name of the resource containing the rules to use.
+   */
+  protected abstract Reader getRulesReader(Configuration conf)
+      throws IOException;
+
+  /*
+   * -------------------------- * <implementation:URLFilter> *
+   * --------------------------
+   */
+
+  // Inherited Javadoc
+  public String filter(String url) {
+    String host = URLUtil.getHost(url);
+    String domain = null;
+    
+    try {
+      domain = URLUtil.getDomainName(url);
+    } catch (MalformedURLException e) {
+      // shouldnt happen here right?
+    }
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("URL belongs to host " + host + " and domain " + domain);
+    }
+
+    for (RegexRule rule : rules) {
+      // Skip rules bound to a host or domain that matches neither this URL's host nor its domain
+      if (rule.hostOrDomain() != null &&
+            !rule.hostOrDomain().equals(host) &&
+            !rule.hostOrDomain().equals(domain)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
+        }
+
+        continue;
+      }
+    
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
+      }
+
+      if (rule.match(url)) {
+        return rule.accept() ? url : null;
+      }
+    }
+    ;
+    return null;
+  }
+
+  /*
+   * --------------------------- * </implementation:URLFilter> *
+   * ---------------------------
+   */
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    Reader reader = null;
+    try {
+      reader = getRulesReader(conf);
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    }
+    try {
+      rules = readRules(reader);
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
+
+  /**
+   * Read the specified file of rules.
+   * 
+   * @param reader
+   *          is a reader of regular expressions rules.
+   * @return the corresponding {@link RegexRule rules}.
+   */
+  private List<RegexRule> readRules(Reader reader) throws IOException,
+      IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List<RegexRule> rules = new ArrayList<RegexRule>();
+    String line;
+    String hostOrDomain = null;
+    
+    while ((line = in.readLine()) != null) {
+      if (line.length() == 0) {
+        continue;
+      }
+      char first = line.charAt(0);
+      boolean sign = false;
+      switch (first) {
+      case '+':
+        sign = true;
+        break;
+      case '-':
+        sign = false;
+        break;
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        continue;
+      case '>':
+        hostOrDomain = line.substring(1).trim();
+        continue;
+      case '<':
+        hostOrDomain = null;
+        continue;
+      default:
+        throw new IOException("Invalid first character: " + line);
+      }
+
+      String regex = line.substring(1);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
+      }
+      RegexRule rule = createRule(sign, regex, hostOrDomain);
+      rules.add(rule);
+    }
+    return rules;
+  }
+
+  /**
+   * Filter the standard input using a RegexURLFilterBase.
+   * 
+   * @param filter
+   *          is the RegexURLFilterBase to use for filtering the standard input.
+   * @param args
+   *          some optional parameters (not used).
+   */
+  public static void main(RegexURLFilterBase filter, String args[])
+      throws IOException, IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.print("+");
+        System.out.println(out);
+      } else {
+        System.out.print("-");
+        System.out.println(line);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
new file mode 100644
index 0000000..b849353
--- /dev/null
+++ 
b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} library,
+ * abstracting away from regular expression implementations.
+ */
+package org.apache.nutch.urlfilter.api;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
 
b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
new file mode 100644
index 0000000..0b58231
--- /dev/null
+++ 
b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.net.URLFilter;
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBaseTest {
+
+  /** My logger */
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBaseTest.class);
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  protected abstract URLFilter getURLFilter(Reader rules);
+
+  protected void bench(int loops, String file) {
+    try {
+      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void bench(int loops, Reader rules, Reader urls) {
+    long start = System.currentTimeMillis();
+    try {
+      URLFilter filter = getURLFilter(rules);
+      FilteredURL[] expected = readURLFile(urls);
+      for (int i = 0; i < loops; i++) {
+        test(filter, expected);
+      }
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
+  }
+
+  protected void test(String file) {
+    try {
+      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(Reader rules, Reader urls) {
+    try {
+      test(getURLFilter(rules), readURLFile(urls));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(URLFilter filter, FilteredURL[] expected) {
+    for (int i = 0; i < expected.length; i++) {
+      String result = filter.filter(expected[i].url);
+      if (result != null) {
+        Assert.assertTrue(expected[i].url, expected[i].sign);
+      } else {
+        Assert.assertFalse(expected[i].url, expected[i].sign);
+      }
+    }
+  }
+
+  private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List<FilteredURL> list = new ArrayList<FilteredURL>();
+    String line;
+    while ((line = in.readLine()) != null) {
+      if (line.length() != 0) {
+        list.add(new FilteredURL(line));
+      }
+    }
+    return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
+  }
+
+  private static class FilteredURL {
+
+    boolean sign;
+    String url;
+
+    FilteredURL(String line) {
+      switch (line.charAt(0)) {
+      case '+':
+        sign = true;
+        break;
+      case '-':
+        sign = false;
+        break;
+      default:
+        // Simply ignore...
+      }
+      url = line.substring(1);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build-ivy.xml 
b/nutch-plugins/lib-selenium/build-ivy.xml
new file mode 100644
index 0000000..3abcf6d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build.xml 
b/nutch-plugins/lib-selenium/build.xml
new file mode 100644
index 0000000..7c6d98d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt 
b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
new file mode 100644
index 0000000..1892a62
--- /dev/null
+++ b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
@@ -0,0 +1,15 @@
+1. Upgrade the driver version dependencies in src/plugin/lib-selenium/ivy.xml
+
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
+
+   To get a list of dependencies and their versions execute:
+    $ ant -f ./build-ivy.xml
+    $ ls lib | sed 's/^/     <library name="/g' | sed 's/$/">\n       <export name="*"\/>\n     <\/library>/g'
+
+   Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
+
+   N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows:
+
+   $ brew install gnu-sed --with-default-names
+
+   You can then restart your terminal and the Regex + Sed command should work just fine!

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/ivy.xml 
b/nutch-plugins/lib-selenium/ivy.xml
new file mode 100644
index 0000000..701b725
--- /dev/null
+++ b/nutch-plugins/lib-selenium/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/plugin.xml 
b/nutch-plugins/lib-selenium/plugin.xml
new file mode 100644
index 0000000..a86d665
--- /dev/null
+++ b/nutch-plugins/lib-selenium/plugin.xml
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! Library plugin that exports Selenium and the libraries it depends on
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-selenium.jar">
+        <export name="*"/>
+     </library>
+     <!-- all classes from dependent libraries are exported -->
+     <library name="cglib-nodep-2.1_3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-codec-1.10.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-collections-3.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-exec-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-jxpath-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-lang3-3.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-logging-1.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="cssparser-0.9.16.jar">
+       <export name="*"/>
+     </library>
+     <library name="gson-2.3.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="guava-18.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-2.18.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-core-js-2.17.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.5.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.4.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="ini4j-0.5.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-io-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-util-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-4.1.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-platform-4.1.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="nekohtml-1.9.22.jar">
+       <export name="*"/>
+     </library>
+     <library name="netty-3.5.2.Final.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="operalaunchers-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="phantomjsdriver-1.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="protobuf-java-2.4.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="sac-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-api-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-chrome-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-edge-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-firefox-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-htmlunit-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-ie-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-java-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-leg-rc-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-remote-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-safari-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-support-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="serializer-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="webbit-0.4.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-api-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-client-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-common-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="xml-apis-1.4.01.jar">
+       <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/pom.xml 
b/nutch-plugins/lib-selenium/pom.xml
new file mode 100644
index 0000000..fed912d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/pom.xml
@@ -0,0 +1,49 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-selenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-selenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> <version>2.48.2</version>
+        </dependency>
+        <dependency>
+            <groupId>com.opera</groupId> <artifactId>operadriver</artifactId> <version>1.5</version>
+        </dependency>
+        <dependency>
+            <groupId>com.codeborne</groupId> <artifactId>phantomjsdriver</artifactId> <version>1.2.1</version>
+        </dependency>
+    </dependencies>
+
+</project>

[16/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Reply via email to