http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java deleted file mode 100644 index 9f616fe..0000000 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ /dev/null @@ -1,587 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.protocol.http.api; - -// JDK imports -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Reader; -import java.net.URL; -import java.util.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; -import java.util.concurrent.ThreadLocalRandom; -// Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolOutput; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.util.GZIPUtils; -import org.apache.nutch.util.DeflateUtils; -import org.apache.hadoop.util.StringUtils; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; - -// crawler-commons imports -import crawlercommons.robots.BaseRobotRules; - -public abstract class HttpBase implements Protocol { - - public static final Text RESPONSE_TIME = new Text("_rs_"); - - public static final int BUFFER_SIZE = 8 * 1024; - - private static final byte[] EMPTY_CONTENT = new byte[0]; - - private HttpRobotRulesParser robots = null; - - private ArrayList<String> userAgentNames = null; - - /** The proxy hostname. */ - protected String proxyHost = null; - - /** The proxy port. */ - protected int proxyPort = 8080; - - /** The proxy exception list. */ - protected HashMap proxyException = new HashMap(); - - /** Indicates if a proxy is used */ - protected boolean useProxy = false; - - /** The network timeout in millisecond */ - protected int timeout = 10000; - - /** The length limit for downloaded content, in bytes. 
*/ - protected int maxContent = 64 * 1024; - - /** The Nutch 'User-Agent' request header */ - protected String userAgent = getAgentString("NutchCVS", null, "Nutch", - "http://nutch.apache.org/bot.html", "[email protected]"); - - /** The "Accept-Language" request header value. */ - protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; - - /** The "Accept" request header value. */ - protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; - - /** The default logger */ - private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class); - - /** The specified logger */ - private Logger logger = LOGGER; - - /** The nutch configuration */ - private Configuration conf = null; - - /** Do we use HTTP/1.1? */ - protected boolean useHttp11 = false; - - /** - * Record response time in CrawlDatum's meta data, see property - * http.store.responsetime. - */ - protected boolean responseTime = true; - - /** Skip page if Crawl-Delay longer than this value. 
*/ - protected long maxCrawlDelay = -1L; - - /** Which TLS/SSL protocols to support */ - protected Set<String> tlsPreferredProtocols; - - /** Which TLS/SSL cipher suites to support */ - protected Set<String> tlsPreferredCipherSuites; - - /** Configuration directive for If-Modified-Since HTTP header */ - public boolean enableIfModifiedsinceHeader = true; - - /** Creates a new instance of HttpBase */ - public HttpBase() { - this(null); - } - - /** Creates a new instance of HttpBase */ - public HttpBase(Logger logger) { - if (logger != null) { - this.logger = logger; - } - robots = new HttpRobotRulesParser(); - } - - // Inherited Javadoc - public void setConf(Configuration conf) { - this.conf = conf; - this.proxyHost = conf.get("http.proxy.host"); - this.proxyPort = conf.getInt("http.proxy.port", 8080); - this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list")); - this.useProxy = (proxyHost != null && proxyHost.length() > 0); - this.timeout = conf.getInt("http.timeout", 10000); - this.maxContent = conf.getInt("http.content.limit", 64 * 1024); - this.userAgent = getAgentString(conf.get("http.agent.name"), - conf.get("http.agent.version"), conf.get("http.agent.description"), - conf.get("http.agent.url"), conf.get("http.agent.email")); - this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); - this.accept = conf.get("http.accept", accept); - // backward-compatible default setting - this.useHttp11 = conf.getBoolean("http.useHttp11", false); - this.responseTime = conf.getBoolean("http.store.responsetime", true); - this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); - this.robots.setConf(conf); - - // NUTCH-1941: read list of alternating agent names - if (conf.getBoolean("http.agent.rotate", false)) { - String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); - BufferedReader br = null; - try { - Reader reader = conf.getConfResourceAsReader(agentsFile); - br = new 
BufferedReader(reader); - userAgentNames = new ArrayList<String>(); - String word = ""; - while ((word = br.readLine()) != null) { - if (!word.trim().isEmpty()) - userAgentNames.add(word.trim()); - } - - if (userAgentNames.size() == 0) { - logger.warn("Empty list of user agents in http.agent.rotate.file {}", - agentsFile); - userAgentNames = null; - } - - } catch (Exception e) { - logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, - StringUtils.stringifyException(e)); - userAgentNames = null; - } finally { - if (br != null) { - try { - br.close(); - } catch (IOException e) { - // ignore - } - } - } - if (userAgentNames == null) { - logger - .warn("Falling back to fixed user agent set via property http.agent.name"); - } - } - - String[] protocols = conf.getStrings("http.tls.supported.protocols", - "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); - String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", - "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", - "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", - "TLS_RSA_WITH_AES_256_CBC_SHA256", - "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", - "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", - "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", - "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", - "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", - "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", - "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", - "TLS_RSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", - "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", - "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", - "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", - "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", - 
"TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", - "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDH_RSA_WITH_RC4_128_SHA", - "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", - "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", - "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", - "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", - "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", - "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", - "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", - "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", - "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", - "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", - "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", - "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", - "TLS_KRB5_WITH_DES_CBC_MD5"); - - tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); - tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); - - logConf(); - } - - // Inherited Javadoc - public Configuration getConf() { - return this.conf; - } - - public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { - - String urlString = url.toString(); - try { - URL u = new URL(urlString); - - long startTime = System.currentTimeMillis(); - Response response = getResponse(u, datum, false); // make a request - - if (this.responseTime) { - int elapsedTime = (int) (System.currentTimeMillis() - startTime); - datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime)); - } - - int code = response.getCode(); - datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, - new Text(Integer.toString(code))); - - byte[] content = response.getContent(); - Content c = new Content(u.toString(), 
u.toString(), - (content == null ? EMPTY_CONTENT : content), - response.getHeader("Content-Type"), response.getHeaders(), this.conf); - - if (code == 200) { // got a good response - return new ProtocolOutput(c); // return it - - } else if (code >= 300 && code < 400) { // handle redirect - String location = response.getHeader("Location"); - // some broken servers, such as MS IIS, use lowercase header name... - if (location == null) - location = response.getHeader("location"); - if (location == null) - location = ""; - u = new URL(u, location); - int protocolStatusCode; - switch (code) { - case 300: // multiple choices, preferred value in Location - protocolStatusCode = ProtocolStatus.MOVED; - break; - case 301: // moved permanently - case 305: // use proxy (Location is URL of proxy) - protocolStatusCode = ProtocolStatus.MOVED; - break; - case 302: // found (temporarily moved) - case 303: // see other (redirect after POST) - case 307: // temporary redirect - protocolStatusCode = ProtocolStatus.TEMP_MOVED; - break; - case 304: // not modified - protocolStatusCode = ProtocolStatus.NOTMODIFIED; - break; - default: - protocolStatusCode = ProtocolStatus.MOVED; - } - // handle this in the higher layer. - return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); - } else if (code == 400) { // bad request, mark as GONE - if (logger.isTraceEnabled()) { - logger.trace("400 Bad request: " + u); - } - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); - } else if (code == 401) { // requires authorization, but no valid auth - // provided. 
- if (logger.isTraceEnabled()) { - logger.trace("401 Authentication Required"); - } - return new ProtocolOutput(c, new ProtocolStatus( - ProtocolStatus.ACCESS_DENIED, "Authentication required: " - + urlString)); - } else if (code == 404) { - return new ProtocolOutput(c, new ProtocolStatus( - ProtocolStatus.NOTFOUND, u)); - } else if (code == 410) { // permanently GONE - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, - "Http: " + code + " url=" + u)); - } else { - return new ProtocolOutput(c, new ProtocolStatus( - ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u)); - } - } catch (Throwable e) { - logger.error("Failed to get protocol output", e); - return new ProtocolOutput(null, new ProtocolStatus(e)); - } - } - - /* - * -------------------------- * </implementation:Protocol> * - * -------------------------- - */ - - public String getProxyHost() { - return proxyHost; - } - - public int getProxyPort() { - return proxyPort; - } - - public boolean useProxy(URL url) { - if (!useProxy){ - return false; - } else if (proxyException.get(url.getHost())!=null){ - return false; - } - return useProxy; - } - - public int getTimeout() { - return timeout; - } - - public boolean isIfModifiedSinceEnabled() { - return enableIfModifiedsinceHeader; - } - - public int getMaxContent() { - return maxContent; - } - - public String getUserAgent() { - if (userAgentNames!=null) { - return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1)); - } - return userAgent; - } - - /** - * Value of "Accept-Language" request header sent by Nutch. - * - * @return The value of the header "Accept-Language" header. 
- */ - public String getAcceptLanguage() { - return acceptLanguage; - } - - public String getAccept() { - return accept; - } - - public boolean getUseHttp11() { - return useHttp11; - } - - public Set<String> getTlsPreferredCipherSuites() { - return tlsPreferredCipherSuites; - } - - public Set<String> getTlsPreferredProtocols() { - return tlsPreferredProtocols; - } - - private static String getAgentString(String agentName, String agentVersion, - String agentDesc, String agentURL, String agentEmail) { - - if ((agentName == null) || (agentName.trim().length() == 0)) { - // TODO : NUTCH-258 - if (LOGGER.isErrorEnabled()) { - LOGGER.error("No User-Agent string set (http.agent.name)!"); - } - } - - StringBuffer buf = new StringBuffer(); - - buf.append(agentName); - if (agentVersion != null) { - buf.append("/"); - buf.append(agentVersion); - } - if (((agentDesc != null) && (agentDesc.length() != 0)) - || ((agentEmail != null) && (agentEmail.length() != 0)) - || ((agentURL != null) && (agentURL.length() != 0))) { - buf.append(" ("); - - if ((agentDesc != null) && (agentDesc.length() != 0)) { - buf.append(agentDesc); - if ((agentURL != null) || (agentEmail != null)) - buf.append("; "); - } - - if ((agentURL != null) && (agentURL.length() != 0)) { - buf.append(agentURL); - if (agentEmail != null) - buf.append("; "); - } - - if ((agentEmail != null) && (agentEmail.length() != 0)) - buf.append(agentEmail); - - buf.append(")"); - } - return buf.toString(); - } - - protected void logConf() { - if (logger.isInfoEnabled()) { - logger.info("http.proxy.host = " + proxyHost); - logger.info("http.proxy.port = " + proxyPort); - logger.info("http.proxy.exception.list = " + useProxy); - logger.info("http.timeout = " + timeout); - logger.info("http.content.limit = " + maxContent); - logger.info("http.agent = " + userAgent); - logger.info("http.accept.language = " + acceptLanguage); - logger.info("http.accept = " + accept); - } - } - - public byte[] processGzipEncoded(byte[] compressed, 
URL url) - throws IOException { - - if (LOGGER.isTraceEnabled()) { - LOGGER.trace("uncompressing...."); - } - - // content can be empty (i.e. redirection) in which case - // there is nothing to unzip - if (compressed.length == 0) - return compressed; - - byte[] content; - if (getMaxContent() >= 0) { - content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); - } else { - content = GZIPUtils.unzipBestEffort(compressed); - } - - if (content == null) - throw new IOException("unzipBestEffort returned null"); - - if (LOGGER.isTraceEnabled()) { - LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " + content.length - + " bytes) from " + url); - } - return content; - } - - public byte[] processDeflateEncoded(byte[] compressed, URL url) - throws IOException { - - // content can be empty (i.e. redirection) in which case - // there is nothing to deflate - if (compressed.length == 0) - return compressed; - - if (LOGGER.isTraceEnabled()) { - LOGGER.trace("inflating...."); - } - - byte[] content; - if (getMaxContent() >= 0) { - content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); - } else { - content = DeflateUtils.inflateBestEffort(compressed); - } - - if (content == null) - throw new IOException("inflateBestEffort returned null"); - - if (LOGGER.isTraceEnabled()) { - LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " + content.length - + " bytes) from " + url); - } - return content; - } - - protected static void main(HttpBase http, String[] args) throws Exception { - boolean verbose = false; - String url = null; - - String usage = "Usage: Http [-verbose] [-timeout N] url"; - - if (args.length == 0) { - System.err.println(usage); - System.exit(-1); - } - - for (int i = 0; i < args.length; i++) { // parse command line - if (args[i].equals("-timeout")) { // found -timeout option - http.timeout = Integer.parseInt(args[++i]) * 1000; - } else if (args[i].equals("-verbose")) { 
// found -verbose option - verbose = true; - } else if (i != args.length - 1) { - System.err.println(usage); - System.exit(-1); - } else - // root is required parameter - url = args[i]; - } - - // if (verbose) { - // LOGGER.setLevel(Level.FINE); - // } - - ProtocolOutput out = http - .getProtocolOutput(new Text(url), new CrawlDatum()); - Content content = out.getContent(); - - System.out.println("Status: " + out.getStatus()); - if (content != null) { - System.out.println("Content Type: " + content.getContentType()); - System.out.println("Content Length: " - + content.getMetadata().get(Response.CONTENT_LENGTH)); - System.out.println("Content:"); - String text = new String(content.getContent()); - System.out.println(text); - } - } - - protected abstract Response getResponse(URL url, CrawlDatum datum, - boolean followRedirects) throws ProtocolException, IOException; - - public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { - return robots.getRobotRulesSet(this, url); - } - - /** - * Transforming a String[] into a HashMap for faster searching - * @param input String[] - * @return a new HashMap - */ - private HashMap arrayToMap(String[]input){ - if (input==null ||input.length==0) { - return new HashMap(); - } - HashMap hm=new HashMap(); - for (int i=0;i<input.length;i++){ - if (!"".equals(input[i].trim())){ - hm.put(input[i],input[i]); - } - } - return hm; - } -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java deleted file mode 100644 index ff7ef5b..0000000 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.protocol.http.api; - -// Nutch imports -import org.apache.nutch.protocol.ProtocolException; - -public class HttpException extends ProtocolException { - - public HttpException() { - super(); - } - - public HttpException(String message) { - super(message); - } - - public HttpException(String message, Throwable cause) { - super(message, cause); - } - - public HttpException(Throwable cause) { - super(cause); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java deleted file mode 100644 index 185ca15..0000000 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ /dev/null @@ -1,167 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.protocol.http.api; - -import java.net.URL; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.RobotRulesParser; - -import crawlercommons.robots.BaseRobotRules; - -/** - * This class is used for parsing robots for urls belonging to HTTP protocol. It - * extends the generic {@link RobotRulesParser} class and contains Http protocol - * specific implementation for obtaining the robots file. - */ -public class HttpRobotRulesParser extends RobotRulesParser { - - public static final Logger LOG = LoggerFactory - .getLogger(HttpRobotRulesParser.class); - protected boolean allowForbidden = false; - - HttpRobotRulesParser() { - } - - public HttpRobotRulesParser(Configuration conf) { - setConf(conf); - } - - public void setConf(Configuration conf) { - super.setConf(conf); - allowForbidden = conf.getBoolean("http.robots.403.allow", true); - } - - /** Compose unique key to store and access robot rules in cache for given URL */ - protected static String getCacheKey(URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower - // case - String host = url.getHost().toLowerCase(); // normalize to lower case - int port = url.getPort(); - if (port == -1) { - port = url.getDefaultPort(); - } - /* - * Robot rules apply only to host, protocol, and port where robots.txt is - * hosted (cf. NUTCH-1752). Consequently - */ - String cacheKey = protocol + ":" + host + ":" + port; - return cacheKey; - } - - /** - * Get the rules from robots.txt which applies for the given {@code url}. - * Robot rules are cached for a unique combination of host, protocol, and - * port. If no rules are found in the cache, a HTTP request is send to fetch - * {{protocol://host:port/robots.txt}}. 
The robots.txt is then parsed and the - * rules are cached to avoid re-fetching and re-parsing it again. - * - * @param http - * The {@link Protocol} object - * @param url - * URL robots.txt applies to - * - * @return {@link BaseRobotRules} holding the rules from robots.txt - */ - public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { - - if (LOG.isTraceEnabled() && isWhiteListed(url)) { - LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); - } - - String cacheKey = getCacheKey(url); - BaseRobotRules robotRules = CACHE.get(cacheKey); - - if (robotRules != null) { - return robotRules; // cached rule - } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss " + url); - } - - boolean cacheRule = true; - URL redir = null; - - if (isWhiteListed(url)) { - // check in advance whether a host is whitelisted - // (we do not need to fetch robots.txt) - robotRules = EMPTY_RULES; - LOG.info("Whitelisted host found for: {}", url); - LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", - url.getHost()); - - } else { - try { - Response response = ((HttpBase) http).getResponse(new URL(url, - "/robots.txt"), new CrawlDatum(), true); - // try one level of redirection ? 
- if (response.getCode() == 301 || response.getCode() == 302) { - String redirection = response.getHeader("Location"); - if (redirection == null) { - // some versions of MS IIS are known to mangle this header - redirection = response.getHeader("location"); - } - if (redirection != null) { - if (!redirection.startsWith("http")) { - // RFC says it should be absolute, but apparently it isn't - redir = new URL(url, redirection); - } else { - redir = new URL(redirection); - } - - response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), - true); - } - } - - if (response.getCode() == 200) // found rules: parse them - robotRules = parseRules(url.toString(), response.getContent(), - response.getHeader("Content-Type"), agentNames); - - else if ((response.getCode() == 403) && (!allowForbidden)) - robotRules = FORBID_ALL_RULES; // use forbid all - else if (response.getCode() >= 500) { - cacheRule = false; // try again later to fetch robots.txt - robotRules = EMPTY_RULES; - } else - robotRules = EMPTY_RULES; // use default rules - } catch (Throwable t) { - if (LOG.isInfoEnabled()) { - LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); - } - cacheRule = false; // try again later to fetch robots.txt - robotRules = EMPTY_RULES; - } - } - - if (cacheRule) { - CACHE.put(cacheKey, robotRules); // cache rules for host - if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { - // cache also for the redirected host - CACHE.put(getCacheKey(redir), robotRules); - } - } - - return robotRules; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html deleted file mode 100644 index 972bb3c..0000000 --- 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http}, -{@link org.apache.nutch.protocol.httpclient httpclient})</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java deleted file mode 100644 index 23e4ef6..0000000 --- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.http.api; - -import org.junit.Assert; -import org.junit.Test; - -import crawlercommons.robots.BaseRobotRules; - -/** - * JUnit test case which tests 1. that robots filtering is performed correctly - * as per the agent name 2. 
that crawl delay is extracted correctly from the - * robots file - * - */ -public class TestRobotRulesParser { - - private static final String CONTENT_TYPE = "text/plain"; - private static final String SINGLE_AGENT = "Agent1"; - private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; - private static final String UNKNOWN_AGENT = "AgentABC"; - private static final String CR = "\r"; - - private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR - + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" - + CR - + "Crawl-delay: 10" - + CR // set crawl delay for Agent1 as 10 sec - + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh" - + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20" - + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no - // crawl - // delay - // for - // other - // agents - - private static final String[] TEST_PATHS = new String[] { - "http://example.com/a", "http://example.com/a/bloh/foo.html", - "http://example.com/b", "http://example.com/c", - "http://example.com/b/a/index.html", - "http://example.com/foo/bar/baz.html" }; - - private static final boolean[] RESULTS = new boolean[] { false, // /a - false, // /a/bloh/foo.html - true, // /b - true, // /c - false, // /b/a/index.html - true // /foo/bar/baz.html - }; - - private HttpRobotRulesParser parser; - private BaseRobotRules rules; - - public TestRobotRulesParser() { - parser = new HttpRobotRulesParser(); - } - - /** - * Test that the robots rules are interpreted correctly by the robots rules - * parser. 
- */ - @Test - public void testRobotsAgent() { - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), - CONTENT_TYPE, SINGLE_AGENT); - - for (int counter = 0; counter < TEST_PATHS.length; counter++) { - Assert.assertTrue( - "testing on agent (" + SINGLE_AGENT + "), and " + "path " - + TEST_PATHS[counter] + " got " - + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); - } - - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), - CONTENT_TYPE, MULTIPLE_AGENTS); - - for (int counter = 0; counter < TEST_PATHS.length; counter++) { - Assert.assertTrue( - "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path " - + TEST_PATHS[counter] + " got " - + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); - } - } - - /** - * Test that the crawl delay is extracted from the robots file for respective - * agent. If its not specified for a given agent, default value must be - * returned. - */ - @Test - public void testCrawlDelay() { - // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be - // returned by the parser - rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), - CONTENT_TYPE, SINGLE_AGENT); - Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", - (rules.getCrawlDelay() == 10000)); - - // for UNKNOWN_AGENT, the default crawl delay must be returned. 
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), - CONTENT_TYPE, UNKNOWN_AGENT); - Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", - (rules.getCrawlDelay() == Long.MIN_VALUE)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-nekohtml/build.xml b/src/plugin/lib-nekohtml/build.xml deleted file mode 100644 index 4bca1af..0000000 --- a/src/plugin/lib-nekohtml/build.xml +++ /dev/null @@ -1,30 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="lib-nekohtml" default="jar"> - - <import file="../build-plugin.xml"/> - - <!-- - ! Override the compile and jar targets, - ! since there is nothing to compile here. - ! 
--> - <target name="compile" depends="init, resolve-default"/> - - <target name="jar" depends="compile"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-nekohtml/ivy.xml b/src/plugin/lib-nekohtml/ivy.xml deleted file mode 100644 index ed70b80..0000000 --- a/src/plugin/lib-nekohtml/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-nekohtml/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-nekohtml/plugin.xml b/src/plugin/lib-nekohtml/plugin.xml deleted file mode 100644 index 513c9a7..0000000 --- a/src/plugin/lib-nekohtml/plugin.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- - ! NekoHTML is a simple HTML scanner and tag balancer that enables - ! application programmers to parse HTML documents and access the - ! 
information using standard XML interfaces. - ! (http://sourceforge.net/projects/nekohtml/) - ! - ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt - !--> -<plugin - id="lib-nekohtml" - name="CyberNeko HTML Parser" - version="1.9.19" - provider-name="net.sourceforge.nekohtml"> - - <runtime> - <library name="nekohtml-1.9.19.jar"> - <export name="*"/> - </library> - </runtime> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/build.xml b/src/plugin/lib-regex-filter/build.xml deleted file mode 100644 index 9702ca2..0000000 --- a/src/plugin/lib-regex-filter/build.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="lib-regex-filter" default="jar-core"> - - <import file="../build-plugin.xml"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/ivy.xml b/src/plugin/lib-regex-filter/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/lib-regex-filter/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/plugin.xml b/src/plugin/lib-regex-filter/plugin.xml deleted file mode 100644 index 42de8f1..0000000 --- a/src/plugin/lib-regex-filter/plugin.xml +++ /dev/null @@ -1,33 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- - ! 
A common framework for RegExp based URL filters - !--> -<plugin - id="lib-regex-filter" - name="Regex URL Filter Framework" - version="1.0" - provider-name="org.apache.nutch"> - - <runtime> - <library name="lib-regex-filter.jar"> - <export name="*"/> - </library> - </runtime> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java deleted file mode 100644 index e408586..0000000 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.api; - -/** - * A generic regular expression rule. - * - * @author Jérôme Charron - */ -public abstract class RegexRule { - - private final boolean sign; - - private final String hostOrDomain; - - private final String regex; - - /** - * Constructs a new regular expression rule. 
- * - * @param sign - * specifies if this rule must filter-in or filter-out. A - * <code>true</code> value means that any url matching this rule must - * be accepted, a <code>false</code> value means that any url - * matching this rule must be rejected. - * @param regex - * is the regular expression used for matching (see - * {@link #match(String)} method). - */ - protected RegexRule(boolean sign, String regex) { - this(sign, regex, null); - } - - /** - * Constructs a new regular expression rule. - * - * @param sign - * specifies if this rule must filter-in or filter-out. A - * <code>true</code> value means that any url matching this rule must - * be accepted, a <code>false</code> value means that any url - * matching this rule must be rejected. - * @param regex - * is the regular expression used for matching (see - * {@link #match(String)} method). - * @param hostOrDomain - * the host or domain to which this regex belongs - */ - protected RegexRule(boolean sign, String regex, String hostOrDomain) { - this.sign = sign; - this.hostOrDomain = hostOrDomain; - this.regex = regex; - } - - /** - * Return if this rule is used for filtering-in or out. - * - * @return <code>true</code> if any url matching this rule must be accepted, - * otherwise <code>false</code>. - */ - protected boolean accept() { - return sign; - } - - /** - * Return if this rule is used for filtering-in or out. - * - * @return host or domain this regex rule belongs to - */ - protected String hostOrDomain() { return hostOrDomain; } - - /** - * Return if this rule's regex. - * - * @return this regex - */ - protected String regex() { return regex; } - - /** - * Checks if a url matches this rule. - * - * @param url - * is the url to check. - * @return <code>true</code> if the specified url matches this rule, otherwise - * <code>false</code>. 
- */ - protected abstract boolean match(String url); - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java deleted file mode 100644 index f5cc081..0000000 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ /dev/null @@ -1,315 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.urlfilter.api; - -// JDK imports -import java.io.File; -import java.io.Reader; -import java.io.FileReader; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.IOException; -import java.io.StringReader; -import java.net.MalformedURLException; -import java.util.List; -import java.util.ArrayList; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; - -// Nutch imports -import org.apache.nutch.net.*; -import org.apache.nutch.util.URLUtil; - -/** - * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular - * expressions. - * - * <p> - * The regular expressions rules are expressed in a file. The file of rules is - * determined for each implementation using the - * {@link #getRulesReader(Configuration conf)} method. - * </p> - * - * <p> - * The format of this file is made of many rules (one per line):<br/> - * <code> - * [+-]<regex> - * </code><br/> - * where plus (<code>+</code>)means go ahead and index it and minus ( - * <code>-</code>)means no. - * </p> - * - * @author Jérôme Charron - */ -public abstract class RegexURLFilterBase implements URLFilter { - - /** My logger */ - private final static Logger LOG = LoggerFactory - .getLogger(RegexURLFilterBase.class); - - /** An array of applicable rules */ - private List<RegexRule> rules; - - /** The current configuration */ - private Configuration conf; - - /** - * Constructs a new empty RegexURLFilterBase - */ - public RegexURLFilterBase() { - } - - /** - * Constructs a new RegexURLFilter and init it with a file of rules. - * - * @param filename - * is the name of rules file. - */ - public RegexURLFilterBase(File filename) throws IOException, - IllegalArgumentException { - this(new FileReader(filename)); - } - - /** - * Constructs a new RegexURLFilter and inits it with a list of rules. 
- * - * @param rules - * string with a list of rules, one rule per line - * @throws IOException - * @throws IllegalArgumentException - */ - public RegexURLFilterBase(String rules) throws IOException, - IllegalArgumentException { - this(new StringReader(rules)); - } - - /** - * Constructs a new RegexURLFilter and init it with a Reader of rules. - * - * @param reader - * is a reader of rules. - */ - protected RegexURLFilterBase(Reader reader) throws IOException, - IllegalArgumentException { - rules = readRules(reader); - } - - /** - * Creates a new {@link RegexRule}. - * - * @param sign - * of the regular expression. A <code>true</code> value means that - * any URL matching this rule must be included, whereas a - * <code>false</code> value means that any URL matching this rule - * must be excluded. - * @param regex - * is the regular expression associated to this rule. - */ - protected abstract RegexRule createRule(boolean sign, String regex); - - /** - * Creates a new {@link RegexRule}. - * @param - * sign of the regular expression. - * A <code>true</code> value means that any URL matching this rule - * must be included, whereas a <code>false</code> - * value means that any URL matching this rule must be excluded. - * @param regex - * is the regular expression associated to this rule. - * @param hostOrDomain - * the host or domain to which this regex belongs - */ - protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); - - /** - * Returns the name of the file of rules to use for a particular - * implementation. - * - * @param conf - * is the current configuration. - * @return the name of the resource containing the rules to use. 
- */ - protected abstract Reader getRulesReader(Configuration conf) - throws IOException; - - /* - * -------------------------- * <implementation:URLFilter> * - * -------------------------- - */ - - // Inherited Javadoc - public String filter(String url) { - String host = URLUtil.getHost(url); - String domain = null; - - try { - domain = URLUtil.getDomainName(url); - } catch (MalformedURLException e) { - // shouldnt happen here right? - } - - if (LOG.isDebugEnabled()) { - LOG.debug("URL belongs to host " + host + " and domain " + domain); - } - - for (RegexRule rule : rules) { - // Skip the skip for rules that don't share the same host and domain - if (rule.hostOrDomain() != null && - !rule.hostOrDomain().equals(host) && - !rule.hostOrDomain().equals(domain)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); - } - - continue; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); - } - - if (rule.match(url)) { - return rule.accept() ? 
url : null; - } - } - ; - return null; - } - - /* - * --------------------------- * </implementation:URLFilter> * - * --------------------------- - */ - - /* - * ----------------------------- * <implementation:Configurable> * - * ----------------------------- - */ - - public void setConf(Configuration conf) { - this.conf = conf; - Reader reader = null; - try { - reader = getRulesReader(conf); - } catch (Exception e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage()); - } - throw new RuntimeException(e.getMessage(), e); - } - try { - rules = readRules(reader); - } catch (IOException e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage()); - } - throw new RuntimeException(e.getMessage(), e); - } - } - - public Configuration getConf() { - return this.conf; - } - - /* - * ------------------------------ * </implementation:Configurable> * - * ------------------------------ - */ - - /** - * Read the specified file of rules. - * - * @param reader - * is a reader of regular expressions rules. - * @return the corresponding {@RegexRule rules}. 
- */ - private List<RegexRule> readRules(Reader reader) throws IOException, - IllegalArgumentException { - - BufferedReader in = new BufferedReader(reader); - List<RegexRule> rules = new ArrayList<RegexRule>(); - String line; - String hostOrDomain = null; - - while ((line = in.readLine()) != null) { - if (line.length() == 0) { - continue; - } - char first = line.charAt(0); - boolean sign = false; - switch (first) { - case '+': - sign = true; - break; - case '-': - sign = false; - break; - case ' ': - case '\n': - case '#': // skip blank & comment lines - continue; - case '>': - hostOrDomain = line.substring(1).trim(); - continue; - case '<': - hostOrDomain = null; - continue; - default: - throw new IOException("Invalid first character: " + line); - } - - String regex = line.substring(1); - if (LOG.isTraceEnabled()) { - LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain); - } - RegexRule rule = createRule(sign, regex, hostOrDomain); - rules.add(rule); - } - return rules; - } - - /** - * Filter the standard input using a RegexURLFilterBase. - * - * @param filter - * is the RegexURLFilterBase to use for filtering the standard input. - * @param args - * some optional parameters (not used). 
- */ - public static void main(RegexURLFilterBase filter, String args[]) - throws IOException, IllegalArgumentException { - - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - String line; - while ((line = in.readLine()) != null) { - String out = filter.filter(line); - if (out != null) { - System.out.print("+"); - System.out.println(out); - } else { - System.out.print("-"); - System.out.println(line); - } - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java deleted file mode 100644 index b849353..0000000 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Generic {@link org.apache.nutch.net.URLFilter URL filter} library, - * abstracting away from regular expression implementations. 
- */ -package org.apache.nutch.urlfilter.api; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java deleted file mode 100644 index 0b58231..0000000 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ /dev/null @@ -1,134 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.api; - -// JDK imports -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; - -import org.junit.Assert; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Nutch imports -import org.apache.nutch.net.URLFilter; - -/** - * JUnit based test of class <code>RegexURLFilterBase</code>. 
- * - * @author Jérôme Charron - */ -public abstract class RegexURLFilterBaseTest { - - /** My logger */ - protected static final Logger LOG = LoggerFactory - .getLogger(RegexURLFilterBaseTest.class); - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - protected abstract URLFilter getURLFilter(Reader rules); - - protected void bench(int loops, String file) { - try { - bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); - } catch (Exception e) { - Assert.fail(e.toString()); - } - } - - protected void bench(int loops, Reader rules, Reader urls) { - long start = System.currentTimeMillis(); - try { - URLFilter filter = getURLFilter(rules); - FilteredURL[] expected = readURLFile(urls); - for (int i = 0; i < loops; i++) { - test(filter, expected); - } - } catch (Exception e) { - Assert.fail(e.toString()); - } - LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); - } - - protected void test(String file) { - try { - test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); - } catch (Exception e) { - Assert.fail(e.toString()); - } - } - - protected void test(Reader rules, Reader urls) { - try { - test(getURLFilter(rules), readURLFile(urls)); - } catch (Exception e) { - Assert.fail(e.toString()); - } - } - - protected void test(URLFilter filter, FilteredURL[] expected) { - for (int i = 0; i < expected.length; i++) { - String result = filter.filter(expected[i].url); - if (result != null) { - Assert.assertTrue(expected[i].url, expected[i].sign); - } else { - Assert.assertFalse(expected[i].url, expected[i].sign); - } - } - } - - private static FilteredURL[] readURLFile(Reader reader) throws IOException { - BufferedReader in = new BufferedReader(reader); - List<FilteredURL> list = new 
ArrayList<FilteredURL>(); - String line; - while ((line = in.readLine()) != null) { - if (line.length() != 0) { - list.add(new FilteredURL(line)); - } - } - return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]); - } - - private static class FilteredURL { - - boolean sign; - String url; - - FilteredURL(String line) { - switch (line.charAt(0)) { - case '+': - sign = true; - break; - case '-': - sign = false; - break; - default: - // Simply ignore... - } - url = line.substring(1); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/build-ivy.xml b/src/plugin/lib-selenium/build-ivy.xml deleted file mode 100644 index 3abcf6d..0000000 --- a/src/plugin/lib-selenium/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. 
--> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/build.xml b/src/plugin/lib-selenium/build.xml deleted file mode 100644 index 7c6d98d..0000000 --- a/src/plugin/lib-selenium/build.xml +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project name="lib-selenium" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - </fileset> - </path> -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/howto_upgrade_selenium.txt ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt deleted file mode 100644 index 1892a62..0000000 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt +++ /dev/null @@ -1,15 +0,0 @@ -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - -2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g' - - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). - - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows - - $ brew install gnu-sed --with-default-names - - You can then restart your terminal and the Regex + Sed command should work just fine! http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml deleted file mode 100644 index 701b725..0000000 --- a/src/plugin/lib-selenium/ivy.xml +++ /dev/null @@ -1,52 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. 
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <!-- begin selenium dependencies --> - <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" /> - - <dependency org="com.opera" name="operadriver" rev="1.5"> - <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> - </dependency> - <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" > - <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> - <exclude org="org.seleniumhq.selenium" name="selenium-java" /> - </dependency> - <!-- end selenium dependencies --> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml deleted file 
mode 100644 index a86d665..0000000 --- a/src/plugin/lib-selenium/plugin.xml +++ /dev/null @@ -1,175 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- - ! A common framework for http protocol implementations - !--> -<plugin - id="lib-selenium" - name="HTTP Framework" - version="1.0" - provider-name="org.apache.nutch"> - - <runtime> - <library name="lib-selenium.jar"> - <export name="*"/> - </library> - <!-- all classes from dependent libraries are exported --> - <library name="cglib-nodep-2.1_3.jar"> - <export name="*"/> - </library> - <library name="commons-codec-1.10.jar"> - <export name="*"/> - </library> - <library name="commons-collections-3.2.1.jar"> - <export name="*"/> - </library> - <library name="commons-exec-1.3.jar"> - <export name="*"/> - </library> - <library name="commons-io-2.4.jar"> - <export name="*"/> - </library> - <library name="commons-jxpath-1.3.jar"> - <export name="*"/> - </library> - <library name="commons-lang3-3.4.jar"> - <export name="*"/> - </library> - <library name="commons-logging-1.2.jar"> - <export name="*"/> - </library> - <library name="cssparser-0.9.16.jar"> - <export name="*"/> - </library> - <library name="gson-2.3.1.jar"> - <export name="*"/> - 
</library> - <library name="guava-18.0.jar"> - <export name="*"/> - </library> - <library name="htmlunit-2.18.jar"> - <export name="*"/> - </library> - <library name="htmlunit-core-js-2.17.jar"> - <export name="*"/> - </library> - <library name="httpclient-4.5.1.jar"> - <export name="*"/> - </library> - <library name="httpcore-4.4.3.jar"> - <export name="*"/> - </library> - <library name="httpmime-4.5.jar"> - <export name="*"/> - </library> - <library name="ini4j-0.5.2.jar"> - <export name="*"/> - </library> - <library name="jetty-io-9.2.12.v20150709.jar"> - <export name="*"/> - </library> - <library name="jetty-util-9.2.12.v20150709.jar"> - <export name="*"/> - </library> - <library name="jna-4.1.0.jar"> - <export name="*"/> - </library> - <library name="jna-platform-4.1.0.jar"> - <export name="*"/> - </library> - <library name="nekohtml-1.9.22.jar"> - <export name="*"/> - </library> - <library name="netty-3.5.2.Final.jar"> - <export name="*"/> - </library> - <library name="operadriver-1.5.jar"> - <export name="*"/> - </library> - <library name="operalaunchers-1.1.jar"> - <export name="*"/> - </library> - <library name="phantomjsdriver-1.2.1.jar"> - <export name="*"/> - </library> - <library name="protobuf-java-2.4.1.jar"> - <export name="*"/> - </library> - <library name="sac-1.3.jar"> - <export name="*"/> - </library> - <library name="selenium-api-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-chrome-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-edge-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-firefox-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-htmlunit-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-ie-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-java-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-leg-rc-2.48.2.jar"> - <export name="*"/> - 
</library> - <library name="selenium-remote-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-safari-driver-2.48.2.jar"> - <export name="*"/> - </library> - <library name="selenium-support-2.48.2.jar"> - <export name="*"/> - </library> - <library name="serializer-2.7.2.jar"> - <export name="*"/> - </library> - <library name="webbit-0.4.14.jar"> - <export name="*"/> - </library> - <library name="websocket-api-9.2.12.v20150709.jar"> - <export name="*"/> - </library> - <library name="websocket-client-9.2.12.v20150709.jar"> - <export name="*"/> - </library> - <library name="websocket-common-9.2.12.v20150709.jar"> - <export name="*"/> - </library> - <library name="xalan-2.7.2.jar"> - <export name="*"/> - </library> - <library name="xercesImpl-2.11.0.jar"> - <export name="*"/> - </library> - <library name="xml-apis-1.4.01.jar"> - <export name="*"/> - </library> - </runtime> - -</plugin>
