http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java new file mode 100644 index 0000000..9f616fe --- /dev/null +++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -0,0 +1,587 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.http.api; + +// JDK imports +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.net.URL; +import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; +// Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.util.GZIPUtils; +import org.apache.nutch.util.DeflateUtils; +import org.apache.hadoop.util.StringUtils; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; + +// crawler-commons imports +import crawlercommons.robots.BaseRobotRules; + +public abstract class HttpBase implements Protocol { + + public static final Text RESPONSE_TIME = new Text("_rs_"); + + public static final int BUFFER_SIZE = 8 * 1024; + + private static final byte[] EMPTY_CONTENT = new byte[0]; + + private HttpRobotRulesParser robots = null; + + private ArrayList<String> userAgentNames = null; + + /** The proxy hostname. */ + protected String proxyHost = null; + + /** The proxy port. */ + protected int proxyPort = 8080; + + /** The proxy exception list. */ + protected HashMap proxyException = new HashMap(); + + /** Indicates if a proxy is used */ + protected boolean useProxy = false; + + /** The network timeout in millisecond */ + protected int timeout = 10000; + + /** The length limit for downloaded content, in bytes. 
*/ + protected int maxContent = 64 * 1024; + + /** The Nutch 'User-Agent' request header */ + protected String userAgent = getAgentString("NutchCVS", null, "Nutch", + "http://nutch.apache.org/bot.html", "[email protected]"); + + /** The "Accept-Language" request header value. */ + protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; + + /** The "Accept" request header value. */ + protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; + + /** The default logger */ + private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class); + + /** The specified logger */ + private Logger logger = LOGGER; + + /** The nutch configuration */ + private Configuration conf = null; + + /** Do we use HTTP/1.1? */ + protected boolean useHttp11 = false; + + /** + * Record response time in CrawlDatum's meta data, see property + * http.store.responsetime. + */ + protected boolean responseTime = true; + + /** Skip page if Crawl-Delay longer than this value. 
*/ + protected long maxCrawlDelay = -1L; + + /** Which TLS/SSL protocols to support */ + protected Set<String> tlsPreferredProtocols; + + /** Which TLS/SSL cipher suites to support */ + protected Set<String> tlsPreferredCipherSuites; + + /** Configuration directive for If-Modified-Since HTTP header */ + public boolean enableIfModifiedsinceHeader = true; + + /** Creates a new instance of HttpBase */ + public HttpBase() { + this(null); + } + + /** Creates a new instance of HttpBase */ + public HttpBase(Logger logger) { + if (logger != null) { + this.logger = logger; + } + robots = new HttpRobotRulesParser(); + } + + // Inherited Javadoc + public void setConf(Configuration conf) { + this.conf = conf; + this.proxyHost = conf.get("http.proxy.host"); + this.proxyPort = conf.getInt("http.proxy.port", 8080); + this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list")); + this.useProxy = (proxyHost != null && proxyHost.length() > 0); + this.timeout = conf.getInt("http.timeout", 10000); + this.maxContent = conf.getInt("http.content.limit", 64 * 1024); + this.userAgent = getAgentString(conf.get("http.agent.name"), + conf.get("http.agent.version"), conf.get("http.agent.description"), + conf.get("http.agent.url"), conf.get("http.agent.email")); + this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); + this.accept = conf.get("http.accept", accept); + // backward-compatible default setting + this.useHttp11 = conf.getBoolean("http.useHttp11", false); + this.responseTime = conf.getBoolean("http.store.responsetime", true); + this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); + this.robots.setConf(conf); + + // NUTCH-1941: read list of alternating agent names + if (conf.getBoolean("http.agent.rotate", false)) { + String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); + BufferedReader br = null; + try { + Reader reader = conf.getConfResourceAsReader(agentsFile); + br = new 
BufferedReader(reader); + userAgentNames = new ArrayList<String>(); + String word = ""; + while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) + userAgentNames.add(word.trim()); + } + + if (userAgentNames.size() == 0) { + logger.warn("Empty list of user agents in http.agent.rotate.file {}", + agentsFile); + userAgentNames = null; + } + + } catch (Exception e) { + logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, + StringUtils.stringifyException(e)); + userAgentNames = null; + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + // ignore + } + } + } + if (userAgentNames == null) { + logger + .warn("Falling back to fixed user agent set via property http.agent.name"); + } + } + + String[] protocols = conf.getStrings("http.tls.supported.protocols", + "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); + String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", + "TLS_RSA_WITH_AES_256_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_RSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", + 
"TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", + "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDH_RSA_WITH_RC4_128_SHA", + "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", + "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", + "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", + "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", + "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", + "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", + "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", + "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", + "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", + "TLS_KRB5_WITH_DES_CBC_MD5"); + + tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); + tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); + + logConf(); + } + + // Inherited Javadoc + public Configuration getConf() { + return this.conf; + } + + public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { + + String urlString = url.toString(); + try { + URL u = new URL(urlString); + + long startTime = System.currentTimeMillis(); + Response response = getResponse(u, datum, false); // make a request + + if (this.responseTime) { + int elapsedTime = (int) (System.currentTimeMillis() - startTime); + datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime)); + } + + int code = response.getCode(); + datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, + new Text(Integer.toString(code))); + + byte[] content = response.getContent(); + Content c = new Content(u.toString(), 
u.toString(), + (content == null ? EMPTY_CONTENT : content), + response.getHeader("Content-Type"), response.getHeaders(), this.conf); + + if (code == 200) { // got a good response + return new ProtocolOutput(c); // return it + + } else if (code >= 300 && code < 400) { // handle redirect + String location = response.getHeader("Location"); + // some broken servers, such as MS IIS, use lowercase header name... + if (location == null) + location = response.getHeader("location"); + if (location == null) + location = ""; + u = new URL(u, location); + int protocolStatusCode; + switch (code) { + case 300: // multiple choices, preferred value in Location + protocolStatusCode = ProtocolStatus.MOVED; + break; + case 301: // moved permanently + case 305: // use proxy (Location is URL of proxy) + protocolStatusCode = ProtocolStatus.MOVED; + break; + case 302: // found (temporarily moved) + case 303: // see other (redirect after POST) + case 307: // temporary redirect + protocolStatusCode = ProtocolStatus.TEMP_MOVED; + break; + case 304: // not modified + protocolStatusCode = ProtocolStatus.NOTMODIFIED; + break; + default: + protocolStatusCode = ProtocolStatus.MOVED; + } + // handle this in the higher layer. + return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); + } else if (code == 400) { // bad request, mark as GONE + if (logger.isTraceEnabled()) { + logger.trace("400 Bad request: " + u); + } + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); + } else if (code == 401) { // requires authorization, but no valid auth + // provided. 
+ if (logger.isTraceEnabled()) { + logger.trace("401 Authentication Required"); + } + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.ACCESS_DENIED, "Authentication required: " + + urlString)); + } else if (code == 404) { + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.NOTFOUND, u)); + } else if (code == 410) { // permanently GONE + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, + "Http: " + code + " url=" + u)); + } else { + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u)); + } + } catch (Throwable e) { + logger.error("Failed to get protocol output", e); + return new ProtocolOutput(null, new ProtocolStatus(e)); + } + } + + /* + * -------------------------- * </implementation:Protocol> * + * -------------------------- + */ + + public String getProxyHost() { + return proxyHost; + } + + public int getProxyPort() { + return proxyPort; + } + + public boolean useProxy(URL url) { + if (!useProxy){ + return false; + } else if (proxyException.get(url.getHost())!=null){ + return false; + } + return useProxy; + } + + public int getTimeout() { + return timeout; + } + + public boolean isIfModifiedSinceEnabled() { + return enableIfModifiedsinceHeader; + } + + public int getMaxContent() { + return maxContent; + } + + public String getUserAgent() { + if (userAgentNames!=null) { + return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1)); + } + return userAgent; + } + + /** + * Value of "Accept-Language" request header sent by Nutch. + * + * @return The value of the header "Accept-Language" header. 
+ */ + public String getAcceptLanguage() { + return acceptLanguage; + } + + public String getAccept() { + return accept; + } + + public boolean getUseHttp11() { + return useHttp11; + } + + public Set<String> getTlsPreferredCipherSuites() { + return tlsPreferredCipherSuites; + } + + public Set<String> getTlsPreferredProtocols() { + return tlsPreferredProtocols; + } + + private static String getAgentString(String agentName, String agentVersion, + String agentDesc, String agentURL, String agentEmail) { + + if ((agentName == null) || (agentName.trim().length() == 0)) { + // TODO : NUTCH-258 + if (LOGGER.isErrorEnabled()) { + LOGGER.error("No User-Agent string set (http.agent.name)!"); + } + } + + StringBuffer buf = new StringBuffer(); + + buf.append(agentName); + if (agentVersion != null) { + buf.append("/"); + buf.append(agentVersion); + } + if (((agentDesc != null) && (agentDesc.length() != 0)) + || ((agentEmail != null) && (agentEmail.length() != 0)) + || ((agentURL != null) && (agentURL.length() != 0))) { + buf.append(" ("); + + if ((agentDesc != null) && (agentDesc.length() != 0)) { + buf.append(agentDesc); + if ((agentURL != null) || (agentEmail != null)) + buf.append("; "); + } + + if ((agentURL != null) && (agentURL.length() != 0)) { + buf.append(agentURL); + if (agentEmail != null) + buf.append("; "); + } + + if ((agentEmail != null) && (agentEmail.length() != 0)) + buf.append(agentEmail); + + buf.append(")"); + } + return buf.toString(); + } + + protected void logConf() { + if (logger.isInfoEnabled()) { + logger.info("http.proxy.host = " + proxyHost); + logger.info("http.proxy.port = " + proxyPort); + logger.info("http.proxy.exception.list = " + useProxy); + logger.info("http.timeout = " + timeout); + logger.info("http.content.limit = " + maxContent); + logger.info("http.agent = " + userAgent); + logger.info("http.accept.language = " + acceptLanguage); + logger.info("http.accept = " + accept); + } + } + + public byte[] processGzipEncoded(byte[] compressed, 
URL url) + throws IOException { + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("uncompressing...."); + } + + // content can be empty (i.e. redirection) in which case + // there is nothing to unzip + if (compressed.length == 0) + return compressed; + + byte[] content; + if (getMaxContent() >= 0) { + content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); + } else { + content = GZIPUtils.unzipBestEffort(compressed); + } + + if (content == null) + throw new IOException("unzipBestEffort returned null"); + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("fetched " + compressed.length + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); + } + return content; + } + + public byte[] processDeflateEncoded(byte[] compressed, URL url) + throws IOException { + + // content can be empty (i.e. redirection) in which case + // there is nothing to deflate + if (compressed.length == 0) + return compressed; + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("inflating...."); + } + + byte[] content; + if (getMaxContent() >= 0) { + content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); + } else { + content = DeflateUtils.inflateBestEffort(compressed); + } + + if (content == null) + throw new IOException("inflateBestEffort returned null"); + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("fetched " + compressed.length + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); + } + return content; + } + + protected static void main(HttpBase http, String[] args) throws Exception { + boolean verbose = false; + String url = null; + + String usage = "Usage: Http [-verbose] [-timeout N] url"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + for (int i = 0; i < args.length; i++) { // parse command line + if (args[i].equals("-timeout")) { // found -timeout option + http.timeout = Integer.parseInt(args[++i]) * 1000; + } else if (args[i].equals("-verbose")) { 
// found -verbose option + verbose = true; + } else if (i != args.length - 1) { + System.err.println(usage); + System.exit(-1); + } else + // root is required parameter + url = args[i]; + } + + // if (verbose) { + // LOGGER.setLevel(Level.FINE); + // } + + ProtocolOutput out = http + .getProtocolOutput(new Text(url), new CrawlDatum()); + Content content = out.getContent(); + + System.out.println("Status: " + out.getStatus()); + if (content != null) { + System.out.println("Content Type: " + content.getContentType()); + System.out.println("Content Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.out.println("Content:"); + String text = new String(content.getContent()); + System.out.println(text); + } + } + + protected abstract Response getResponse(URL url, CrawlDatum datum, + boolean followRedirects) throws ProtocolException, IOException; + + public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { + return robots.getRobotRulesSet(this, url); + } + + /** + * Transforming a String[] into a HashMap for faster searching + * @param input String[] + * @return a new HashMap + */ + private HashMap arrayToMap(String[]input){ + if (input==null ||input.length==0) { + return new HashMap(); + } + HashMap hm=new HashMap(); + for (int i=0;i<input.length;i++){ + if (!"".equals(input[i].trim())){ + hm.put(input[i],input[i]); + } + } + return hm; + } +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java new file mode 100644 index 0000000..ff7ef5b --- /dev/null +++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.http.api; + +// Nutch imports +import org.apache.nutch.protocol.ProtocolException; + +public class HttpException extends ProtocolException { + + public HttpException() { + super(); + } + + public HttpException(String message) { + super(message); + } + + public HttpException(String message, Throwable cause) { + super(message, cause); + } + + public HttpException(Throwable cause) { + super(cause); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java new file mode 100644 index 0000000..185ca15 --- /dev/null +++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.http.api; + +import java.net.URL; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.RobotRulesParser; + +import crawlercommons.robots.BaseRobotRules; + +/** + * This class is used for parsing robots for urls belonging to HTTP protocol. It + * extends the generic {@link RobotRulesParser} class and contains Http protocol + * specific implementation for obtaining the robots file. + */ +public class HttpRobotRulesParser extends RobotRulesParser { + + public static final Logger LOG = LoggerFactory + .getLogger(HttpRobotRulesParser.class); + protected boolean allowForbidden = false; + + HttpRobotRulesParser() { + } + + public HttpRobotRulesParser(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + super.setConf(conf); + allowForbidden = conf.getBoolean("http.robots.403.allow", true); + } + + /** Compose unique key to store and access robot rules in cache for given URL */ + protected static String getCacheKey(URL url) { + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case + int port = url.getPort(); + if (port == -1) { + port = url.getDefaultPort(); + } + /* + * Robot rules apply only to host, protocol, and port where robots.txt is + * hosted (cf. NUTCH-1752). Consequently + */ + String cacheKey = protocol + ":" + host + ":" + port; + return cacheKey; + } + + /** + * Get the rules from robots.txt which applies for the given {@code url}. + * Robot rules are cached for a unique combination of host, protocol, and + * port. If no rules are found in the cache, a HTTP request is send to fetch + * {{protocol://host:port/robots.txt}}. 
The robots.txt is then parsed and the + * rules are cached to avoid re-fetching and re-parsing it again. + * + * @param http + * The {@link Protocol} object + * @param url + * URL robots.txt applies to + * + * @return {@link BaseRobotRules} holding the rules from robots.txt + */ + public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { + + if (LOG.isTraceEnabled() && isWhiteListed(url)) { + LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); + } + + String cacheKey = getCacheKey(url); + BaseRobotRules robotRules = CACHE.get(cacheKey); + + if (robotRules != null) { + return robotRules; // cached rule + } else if (LOG.isTraceEnabled()) { + LOG.trace("cache miss " + url); + } + + boolean cacheRule = true; + URL redir = null; + + if (isWhiteListed(url)) { + // check in advance whether a host is whitelisted + // (we do not need to fetch robots.txt) + robotRules = EMPTY_RULES; + LOG.info("Whitelisted host found for: {}", url); + LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", + url.getHost()); + + } else { + try { + Response response = ((HttpBase) http).getResponse(new URL(url, + "/robots.txt"), new CrawlDatum(), true); + // try one level of redirection ? 
+ if (response.getCode() == 301 || response.getCode() == 302) { + String redirection = response.getHeader("Location"); + if (redirection == null) { + // some versions of MS IIS are known to mangle this header + redirection = response.getHeader("location"); + } + if (redirection != null) { + if (!redirection.startsWith("http")) { + // RFC says it should be absolute, but apparently it isn't + redir = new URL(url, redirection); + } else { + redir = new URL(redirection); + } + + response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), + true); + } + } + + if (response.getCode() == 200) // found rules: parse them + robotRules = parseRules(url.toString(), response.getContent(), + response.getHeader("Content-Type"), agentNames); + + else if ((response.getCode() == 403) && (!allowForbidden)) + robotRules = FORBID_ALL_RULES; // use forbid all + else if (response.getCode() >= 500) { + cacheRule = false; // try again later to fetch robots.txt + robotRules = EMPTY_RULES; + } else + robotRules = EMPTY_RULES; // use default rules + } catch (Throwable t) { + if (LOG.isInfoEnabled()) { + LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); + } + cacheRule = false; // try again later to fetch robots.txt + robotRules = EMPTY_RULES; + } + } + + if (cacheRule) { + CACHE.put(cacheKey, robotRules); // cache rules for host + if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { + // cache also for the redirected host + CACHE.put(getCacheKey(redir), robotRules); + } + } + + return robotRules; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html new file mode 100644 index 0000000..972bb3c --- 
/dev/null +++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html @@ -0,0 +1,6 @@ +<html> +<body> +<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http}, +{@link org.apache.nutch.protocol.httpclient httpclient})</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java new file mode 100644 index 0000000..23e4ef6 --- /dev/null +++ b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java @@ -0,0 +1,123 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.http.api; + +import org.junit.Assert; +import org.junit.Test; + +import crawlercommons.robots.BaseRobotRules; + +/** + * JUnit test case which tests 1. that robots filtering is performed correctly + * as per the agent name 2. 
that crawl delay is extracted correctly from the + * robots file + * + */ +public class TestRobotRulesParser { + + private static final String CONTENT_TYPE = "text/plain"; + private static final String SINGLE_AGENT = "Agent1"; + private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; + private static final String UNKNOWN_AGENT = "AgentABC"; + private static final String CR = "\r"; + + private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR + + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + + CR + + "Crawl-delay: 10" + + CR // set crawl delay for Agent1 as 10 sec + + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh" + + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20" + + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no + // crawl + // delay + // for + // other + // agents + + private static final String[] TEST_PATHS = new String[] { + "http://example.com/a", "http://example.com/a/bloh/foo.html", + "http://example.com/b", "http://example.com/c", + "http://example.com/b/a/index.html", + "http://example.com/foo/bar/baz.html" }; + + private static final boolean[] RESULTS = new boolean[] { false, // /a + false, // /a/bloh/foo.html + true, // /b + true, // /c + false, // /b/a/index.html + true // /foo/bar/baz.html + }; + + private HttpRobotRulesParser parser; + private BaseRobotRules rules; + + public TestRobotRulesParser() { + parser = new HttpRobotRulesParser(); + } + + /** + * Test that the robots rules are interpreted correctly by the robots rules + * parser. 
+ */ + @Test + public void testRobotsAgent() { + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); + + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + Assert.assertTrue( + "testing on agent (" + SINGLE_AGENT + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + } + + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, MULTIPLE_AGENTS); + + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + Assert.assertTrue( + "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + } + } + + /** + * Test that the crawl delay is extracted from the robots file for respective + * agent. If its not specified for a given agent, default value must be + * returned. + */ + @Test + public void testCrawlDelay() { + // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be + // returned by the parser + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); + Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", + (rules.getCrawlDelay() == 10000)); + + // for UNKNOWN_AGENT, the default crawl delay must be returned. 
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, UNKNOWN_AGENT); + Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", + (rules.getCrawlDelay() == Long.MIN_VALUE)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-nekohtml/build.xml b/nutch-plugins/lib-nekohtml/build.xml new file mode 100644 index 0000000..4bca1af --- /dev/null +++ b/nutch-plugins/lib-nekohtml/build.xml @@ -0,0 +1,30 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="lib-nekohtml" default="jar"> + + <import file="../build-plugin.xml"/> + + <!-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! 
--> + <target name="compile" depends="init, resolve-default"/> + + <target name="jar" depends="compile"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-nekohtml/ivy.xml b/nutch-plugins/lib-nekohtml/ivy.xml new file mode 100644 index 0000000..ed70b80 --- /dev/null +++ b/nutch-plugins/lib-nekohtml/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-nekohtml/plugin.xml b/nutch-plugins/lib-nekohtml/plugin.xml new file mode 100644 index 0000000..513c9a7 --- /dev/null +++ b/nutch-plugins/lib-nekohtml/plugin.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + ! NekoHTML is a simple HTML scanner and tag balancer that enables + ! application programmers to parse HTML documents and access the + ! 
information using standard XML interfaces. + ! (http://sourceforge.net/projects/nekohtml/) + ! + ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt + !--> +<plugin + id="lib-nekohtml" + name="CyberNeko HTML Parser" + version="1.9.19" + provider-name="net.sourceforge.nekohtml"> + + <runtime> + <library name="nekohtml-1.9.19.jar"> + <export name="*"/> + </library> + </runtime> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-nekohtml/pom.xml b/nutch-plugins/lib-nekohtml/pom.xml new file mode 100644 index 0000000..df544bb --- /dev/null +++ b/nutch-plugins/lib-nekohtml/pom.xml @@ -0,0 +1,45 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>lib-nekohtml</artifactId> + <packaging>jar</packaging> + + <name>lib-nekohtml</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>net.sourceforge.nekohtml</groupId> + <artifactId>nekohtml</artifactId> + <version>1.9.22</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/build.xml b/nutch-plugins/lib-regex-filter/build.xml new file mode 100644 index 0000000..9702ca2 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="lib-regex-filter" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/ivy.xml b/nutch-plugins/lib-regex-filter/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/plugin.xml b/nutch-plugins/lib-regex-filter/plugin.xml new file mode 100644 index 0000000..42de8f1 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/plugin.xml @@ -0,0 +1,33 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + ! 
A common framework for RegExp based URL filters + !--> +<plugin + id="lib-regex-filter" + name="Regex URL Filter Framework" + version="1.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="lib-regex-filter.jar"> + <export name="*"/> + </library> + </runtime> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/pom.xml b/nutch-plugins/lib-regex-filter/pom.xml new file mode 100644 index 0000000..1074ad7 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/pom.xml @@ -0,0 +1,54 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>lib-regex-filter</artifactId> + <packaging>jar</packaging> + + <name>lib-regex-filter</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.6</version> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java new file mode 100644 index 0000000..e408586 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.api; + +/** + * A generic regular expression rule. + * + * @author Jérôme Charron + */ +public abstract class RegexRule { + + private final boolean sign; + + private final String hostOrDomain; + + private final String regex; + + /** + * Constructs a new regular expression rule. + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * <code>true</code> value means that any url matching this rule must + * be accepted, a <code>false</code> value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). + */ + protected RegexRule(boolean sign, String regex) { + this(sign, regex, null); + } + + /** + * Constructs a new regular expression rule. + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * <code>true</code> value means that any url matching this rule must + * be accepted, a <code>false</code> value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). + * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected RegexRule(boolean sign, String regex, String hostOrDomain) { + this.sign = sign; + this.hostOrDomain = hostOrDomain; + this.regex = regex; + } + + /** + * Return if this rule is used for filtering-in or out. 
+ * + * @return <code>true</code> if any url matching this rule must be accepted, + * otherwise <code>false</code>. + */ + protected boolean accept() { + return sign; + } + + /** + * Return the host or domain this rule is restricted to. + * + * @return host or domain this regex rule belongs to, or <code>null</code> if none was given + */ + protected String hostOrDomain() { return hostOrDomain; } + + /** + * Return this rule's regex. + * + * @return this regex + */ + protected String regex() { return regex; } + + /** + * Checks if a url matches this rule. + * + * @param url + * is the url to check. + * @return <code>true</code> if the specified url matches this rule, otherwise + * <code>false</code>. + */ + protected abstract boolean match(String url); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java new file mode 100644 index 0000000..f5cc081 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -0,0 +1,315 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.api; + +// JDK imports +import java.io.File; +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.util.List; +import java.util.ArrayList; + +// SLF4J logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.net.*; +import org.apache.nutch.util.URLUtil; + +/** + * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular + * expressions. + * + * <p> + * The regular expression rules are expressed in a file. The file of rules is + * determined for each implementation using the + * {@link #getRulesReader(Configuration conf)} method. + * </p> + * + * <p> + * The format of this file is made of many rules (one per line):<br/> + * <code> + * [+-]&lt;regex&gt; + * </code><br/> + * where plus (<code>+</code>) means accept the URL and minus ( + * <code>-</code>) means reject it. 
+ * </p> + * + * @author Jérôme Charron + */ +public abstract class RegexURLFilterBase implements URLFilter { + + /** My logger */ + private final static Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBase.class); + + /** The list of applicable rules; the first matching rule decides */ + private List<RegexRule> rules; + + /** The current configuration */ + private Configuration conf; + + /** + * Constructs a new empty RegexURLFilterBase + */ + public RegexURLFilterBase() { + } + + /** + * Constructs a new RegexURLFilterBase and initializes it with a file of rules. + * + * @param filename + * is the name of the rules file. + */ + public RegexURLFilterBase(File filename) throws IOException, + IllegalArgumentException { + this(new FileReader(filename)); + } + + /** + * Constructs a new RegexURLFilterBase and initializes it with a list of rules. + * + * @param rules + * string with a list of rules, one rule per line + * @throws IOException if the rules cannot be read or a line starts with an invalid character + * @throws IllegalArgumentException + */ + public RegexURLFilterBase(String rules) throws IOException, + IllegalArgumentException { + this(new StringReader(rules)); + } + + /** + * Constructs a new RegexURLFilterBase and initializes it with a Reader of rules. + * + * @param reader + * is a reader of regular expression rules. + */ + protected RegexURLFilterBase(Reader reader) throws IOException, + IllegalArgumentException { + rules = readRules(reader); + } + + /** + * Creates a new {@link RegexRule}. + * + * @param sign + * of the regular expression. A <code>true</code> value means that + * any URL matching this rule must be included, whereas a + * <code>false</code> value means that any URL matching this rule + * must be excluded. + * @param regex + * is the regular expression associated to this rule. + */ + protected abstract RegexRule createRule(boolean sign, String regex); + + /** + * Creates a new {@link RegexRule}. 
+ * @param + * sign of the regular expression. 
+ * A <code>true</code> value means that any URL matching this rule + * must be included, whereas a <code>false</code> + * value means that any URL matching this rule must be excluded. + * @param regex + * is the regular expression associated to this rule. + * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); + + /** + * Returns a reader over the rules to use for a particular + * implementation. + * + * @param conf + * is the current configuration. + * @return a reader over the rules to use. + */ + protected abstract Reader getRulesReader(Configuration conf) + throws IOException; + + /* + * -------------------------- * <implementation:URLFilter> * + * -------------------------- + */ + + // Inherited Javadoc + public String filter(String url) { + String host = URLUtil.getHost(url); + String domain = null; + + try { + domain = URLUtil.getDomainName(url); + } catch (MalformedURLException e) { + // ignored: domain stays null, so only the host is compared below + } + + if (LOG.isDebugEnabled()) { + LOG.debug("URL belongs to host " + host + " and domain " + domain); + } + + for (RegexRule rule : rules) { + // Skip rules bound to a host or domain that is neither this URL's host nor its domain + if (rule.hostOrDomain() != null && + !rule.hostOrDomain().equals(host) && + !rule.hostOrDomain().equals(domain)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); + } + + continue; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); + } + + if (rule.match(url)) { + return rule.accept() ? 
url : null; + } + } + ; + return null; + } + + /* + * --------------------------- * </implementation:URLFilter> * + * --------------------------- + */ + + /* + * ----------------------------- * <implementation:Configurable> * + * ----------------------------- + */ + + public void setConf(Configuration conf) { + this.conf = conf; + Reader reader = null; + try { + reader = getRulesReader(conf); + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + throw new RuntimeException(e.getMessage(), e); + } + try { + rules = readRules(reader); + } catch (IOException e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + throw new RuntimeException(e.getMessage(), e); + } + } + + public Configuration getConf() { + return this.conf; + } + + /* + * ------------------------------ * </implementation:Configurable> * + * ------------------------------ + */ + + /** + * Read rules from the specified reader. + * + * @param reader + * is a reader of regular expression rules. + * @return the corresponding {@link RegexRule rules}. 
+ */ + private List<RegexRule> readRules(Reader reader) throws IOException, + IllegalArgumentException { + + BufferedReader in = new BufferedReader(reader); + List<RegexRule> rules = new ArrayList<RegexRule>(); + String line; + String hostOrDomain = null; + + while ((line = in.readLine()) != null) { + if (line.length() == 0) { + continue; + } + char first = line.charAt(0); + boolean sign = false; + switch (first) { + case '+': + sign = true; + break; + case '-': + sign = false; + break; + case ' ': + case '\n': + case '#': // skip blank & comment lines + continue; + case '>': + hostOrDomain = line.substring(1).trim(); + continue; + case '<': + hostOrDomain = null; + continue; + default: + throw new IOException("Invalid first character: " + line); + } + + String regex = line.substring(1); + if (LOG.isTraceEnabled()) { + LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain); + } + RegexRule rule = createRule(sign, regex, hostOrDomain); + rules.add(rule); + } + return rules; + } + + /** + * Filter the standard input using a RegexURLFilterBase. + * + * @param filter + * is the RegexURLFilterBase to use for filtering the standard input. + * @param args + * some optional parameters (not used). 
+ */ + public static void main(RegexURLFilterBase filter, String args[]) + throws IOException, IllegalArgumentException { + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { + System.out.print("+"); + System.out.println(out); + } else { + System.out.print("-"); + System.out.println(line); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java new file mode 100644 index 0000000..b849353 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Generic {@link org.apache.nutch.net.URLFilter URL filter} library, + * abstracting away from regular expression implementations. + */ +package org.apache.nutch.urlfilter.api; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java new file mode 100644 index 0000000..0b58231 --- /dev/null +++ b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.urlfilter.api; + +// JDK imports +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Nutch imports +import org.apache.nutch.net.URLFilter; + +/** + * JUnit based test of class <code>RegexURLFilterBase</code>. + * + * @author Jérôme Charron + */ +public abstract class RegexURLFilterBaseTest { + + /** My logger */ + protected static final Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBaseTest.class); + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + protected abstract URLFilter getURLFilter(Reader rules); + + protected void bench(int loops, String file) { + try { + bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), + new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + } catch (Exception e) { + Assert.fail(e.toString()); + } + } + + protected void bench(int loops, Reader rules, Reader urls) { + long start = System.currentTimeMillis(); + try { + URLFilter filter = getURLFilter(rules); + FilteredURL[] expected = readURLFile(urls); + for (int i = 0; i < loops; i++) { + test(filter, expected); + } + } catch (Exception e) { + Assert.fail(e.toString()); + } + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); + } + + protected void test(String file) { + try { + test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), + new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + } catch (Exception e) { + Assert.fail(e.toString()); + } + } + + protected void test(Reader rules, Reader urls) { + try { + test(getURLFilter(rules), readURLFile(urls)); + } catch (Exception e) { + Assert.fail(e.toString()); + } + } + + protected void test(URLFilter filter, FilteredURL[] 
expected) { + for (int i = 0; i < expected.length; i++) { + String result = filter.filter(expected[i].url); + if (result != null) { + Assert.assertTrue(expected[i].url, expected[i].sign); + } else { + Assert.assertFalse(expected[i].url, expected[i].sign); + } + } + } + + private static FilteredURL[] readURLFile(Reader reader) throws IOException { + BufferedReader in = new BufferedReader(reader); + List<FilteredURL> list = new ArrayList<FilteredURL>(); + String line; + while ((line = in.readLine()) != null) { + if (line.length() != 0) { + list.add(new FilteredURL(line)); + } + } + return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]); + } + + private static class FilteredURL { + + boolean sign; + String url; + + FilteredURL(String line) { + switch (line.charAt(0)) { + case '+': + sign = true; + break; + case '-': + sign = false; + break; + default: + // Simply ignore... + } + url = line.substring(1); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build-ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/build-ivy.xml b/nutch-plugins/lib-selenium/build-ivy.xml new file mode 100644 index 0000000..3abcf6d --- /dev/null +++ b/nutch-plugins/lib-selenium/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/build.xml b/nutch-plugins/lib-selenium/build.xml new file mode 100644 index 0000000..7c6d98d --- /dev/null +++ b/nutch-plugins/lib-selenium/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="lib-selenium" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + </fileset> + </path> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt new file mode 100644 index 0000000..1892a62 --- /dev/null +++ b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt @@ -0,0 +1,15 @@ +1. Upgrade the various driver version dependencies in nutch-plugins/lib-selenium/ivy.xml + +2. Upgrade Selenium's own dependencies in nutch-plugins/lib-selenium/plugin.xml + + To get a list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g' + + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows + + $ brew install gnu-sed --with-default-names + + You can then restart your terminal and the Regex + Sed command should work just fine! http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/ivy.xml b/nutch-plugins/lib-selenium/ivy.xml new file mode 100644 index 0000000..701b725 --- /dev/null +++ b/nutch-plugins/lib-selenium/ivy.xml @@ -0,0 +1,52 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements.
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <!-- begin selenium dependencies --> + <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" /> + + <dependency org="com.opera" name="operadriver" rev="1.5"> + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + </dependency> + <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" > + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + <exclude org="org.seleniumhq.selenium" name="selenium-java" /> + </dependency> + <!-- end selenium dependencies --> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/plugin.xml b/nutch-plugins/lib-selenium/plugin.xml new 
file mode 100644 index 0000000..a86d665 --- /dev/null +++ b/nutch-plugins/lib-selenium/plugin.xml @@ -0,0 +1,175 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + ! A library plugin bundling Selenium and the libraries it depends on + !--> +<plugin + id="lib-selenium" + name="HTTP Framework" + version="1.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="lib-selenium.jar"> + <export name="*"/> + </library> + <!-- all classes from dependent libraries are exported --> + <library name="cglib-nodep-2.1_3.jar"> + <export name="*"/> + </library> + <library name="commons-codec-1.10.jar"> + <export name="*"/> + </library> + <library name="commons-collections-3.2.1.jar"> + <export name="*"/> + </library> + <library name="commons-exec-1.3.jar"> + <export name="*"/> + </library> + <library name="commons-io-2.4.jar"> + <export name="*"/> + </library> + <library name="commons-jxpath-1.3.jar"> + <export name="*"/> + </library> + <library name="commons-lang3-3.4.jar"> + <export name="*"/> + </library> + <library name="commons-logging-1.2.jar"> + <export name="*"/> + </library> + <library name="cssparser-0.9.16.jar"> + <export name="*"/> + </library> + <library name="gson-2.3.1.jar"> + <export name="*"/>
+ </library> + <library name="guava-18.0.jar"> + <export name="*"/> + </library> + <library name="htmlunit-2.18.jar"> + <export name="*"/> + </library> + <library name="htmlunit-core-js-2.17.jar"> + <export name="*"/> + </library> + <library name="httpclient-4.5.1.jar"> + <export name="*"/> + </library> + <library name="httpcore-4.4.3.jar"> + <export name="*"/> + </library> + <library name="httpmime-4.5.jar"> + <export name="*"/> + </library> + <library name="ini4j-0.5.2.jar"> + <export name="*"/> + </library> + <library name="jetty-io-9.2.12.v20150709.jar"> + <export name="*"/> + </library> + <library name="jetty-util-9.2.12.v20150709.jar"> + <export name="*"/> + </library> + <library name="jna-4.1.0.jar"> + <export name="*"/> + </library> + <library name="jna-platform-4.1.0.jar"> + <export name="*"/> + </library> + <library name="nekohtml-1.9.22.jar"> + <export name="*"/> + </library> + <library name="netty-3.5.2.Final.jar"> + <export name="*"/> + </library> + <library name="operadriver-1.5.jar"> + <export name="*"/> + </library> + <library name="operalaunchers-1.1.jar"> + <export name="*"/> + </library> + <library name="phantomjsdriver-1.2.1.jar"> + <export name="*"/> + </library> + <library name="protobuf-java-2.4.1.jar"> + <export name="*"/> + </library> + <library name="sac-1.3.jar"> + <export name="*"/> + </library> + <library name="selenium-api-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-chrome-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-edge-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-firefox-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-htmlunit-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-ie-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-java-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-leg-rc-2.48.2.jar"> + <export name="*"/> 
+ </library> + <library name="selenium-remote-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-safari-driver-2.48.2.jar"> + <export name="*"/> + </library> + <library name="selenium-support-2.48.2.jar"> + <export name="*"/> + </library> + <library name="serializer-2.7.2.jar"> + <export name="*"/> + </library> + <library name="webbit-0.4.14.jar"> + <export name="*"/> + </library> + <library name="websocket-api-9.2.12.v20150709.jar"> + <export name="*"/> + </library> + <library name="websocket-client-9.2.12.v20150709.jar"> + <export name="*"/> + </library> + <library name="websocket-common-9.2.12.v20150709.jar"> + <export name="*"/> + </library> + <library name="xalan-2.7.2.jar"> + <export name="*"/> + </library> + <library name="xercesImpl-2.11.0.jar"> + <export name="*"/> + </library> + <library name="xml-apis-1.4.01.jar"> + <export name="*"/> + </library> + </runtime> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/pom.xml b/nutch-plugins/lib-selenium/pom.xml new file mode 100644 index 0000000..fed912d --- /dev/null +++ b/nutch-plugins/lib-selenium/pom.xml @@ -0,0 +1,49 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>lib-selenium</artifactId> + <packaging>jar</packaging> + + <name>lib-selenium</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> <version>2.48.2</version> + </dependency> + <dependency> + <groupId>com.opera</groupId> <artifactId>operadriver</artifactId> <version>1.5</version> + </dependency> + <dependency> + <groupId>com.codeborne</groupId> <artifactId>phantomjsdriver</artifactId> <version>1.2.1</version> + </dependency> + </dependencies> + +</project>
