Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Thu Jan 29 05:38:59 2015 @@ -47,20 +47,20 @@ import org.apache.hadoop.io.Text; import crawlercommons.robots.BaseRobotRules; public abstract class HttpBase implements Protocol { - + public static final Text RESPONSE_TIME = new Text("_rs_"); public static final int BUFFER_SIZE = 8 * 1024; - + private static final byte[] EMPTY_CONTENT = new byte[0]; private HttpRobotRulesParser robots = null; - - /** The proxy hostname. */ + + /** The proxy hostname. */ protected String proxyHost = null; /** The proxy port. */ - protected int proxyPort = 8080; + protected int proxyPort = 8080; /** Indicates if a proxy is used */ protected boolean useProxy = false; @@ -69,29 +69,27 @@ public abstract class HttpBase implement protected int timeout = 10000; /** The length limit for downloaded content, in bytes. */ - protected int maxContent = 64 * 1024; + protected int maxContent = 64 * 1024; /** The Nutch 'User-Agent' request header */ - protected String userAgent = getAgentString( - "NutchCVS", null, "Nutch", - "http://nutch.apache.org/bot.html", - "agent@nutch.apache.org"); + protected String userAgent = getAgentString("NutchCVS", null, "Nutch", + "http://nutch.apache.org/bot.html", "agent@nutch.apache.org"); /** The "Accept-Language" request header value. */ protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; /** The "Accept" request header value. 
*/ protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; - + /** The default logger */ private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class); /** The specified logger */ private Logger logger = LOGGER; - + /** The nutch configuration */ private Configuration conf = null; - + /** Do we use HTTP/1.1? */ protected boolean useHttp11 = false; @@ -99,14 +97,14 @@ public abstract class HttpBase implement * Record response time in CrawlDatum's meta data, see property * http.store.responsetime. */ - protected boolean responseTime = true; - + protected boolean responseTime = true; + /** Skip page if Crawl-Delay longer than this value. */ protected long maxCrawlDelay = -1L; - + /** Which TLS/SSL protocols to support */ protected Set<String> tlsPreferredProtocols; - + /** Which TLS/SSL cipher suites to support */ protected Set<String> tlsPreferredCipherSuites; @@ -114,7 +112,7 @@ public abstract class HttpBase implement public HttpBase() { this(null); } - + /** Creates a new instance of HttpBase */ public HttpBase(Logger logger) { if (logger != null) { @@ -122,134 +120,168 @@ public abstract class HttpBase implement } robots = new HttpRobotRulesParser(); } - + // Inherited Javadoc public void setConf(Configuration conf) { - this.conf = conf; - this.proxyHost = conf.get("http.proxy.host"); - this.proxyPort = conf.getInt("http.proxy.port", 8080); - this.useProxy = (proxyHost != null && proxyHost.length() > 0); - this.timeout = conf.getInt("http.timeout", 10000); - this.maxContent = conf.getInt("http.content.limit", 64 * 1024); - this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf - .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); - this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); - this.accept = conf.get("http.accept", accept); - // backward-compatible default setting - this.useHttp11 = 
conf.getBoolean("http.useHttp11", false); - this.responseTime = conf.getBoolean("http.store.responsetime", true); - this.robots.setConf(conf); - - String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); - String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", - "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", - "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", - "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", - "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", - "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", - "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5", - "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA", - 
"TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA", - "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA", - "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5", - "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5"); + this.conf = conf; + this.proxyHost = conf.get("http.proxy.host"); + this.proxyPort = conf.getInt("http.proxy.port", 8080); + this.useProxy = (proxyHost != null && proxyHost.length() > 0); + this.timeout = conf.getInt("http.timeout", 10000); + this.maxContent = conf.getInt("http.content.limit", 64 * 1024); + this.userAgent = getAgentString(conf.get("http.agent.name"), + conf.get("http.agent.version"), conf.get("http.agent.description"), + conf.get("http.agent.url"), conf.get("http.agent.email")); + this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); + this.accept = conf.get("http.accept", accept); + // backward-compatible default setting + this.useHttp11 = conf.getBoolean("http.useHttp11", false); + this.responseTime = conf.getBoolean("http.store.responsetime", true); + this.robots.setConf(conf); + + String[] protocols = conf.getStrings("http.tls.supported.protocols", + "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); + String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", + "TLS_RSA_WITH_AES_256_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA", + 
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_RSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", + "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDH_RSA_WITH_RC4_128_SHA", + "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", + "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", + "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", + "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", + "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", + "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", + "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", + "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", + "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", + "TLS_KRB5_WITH_DES_CBC_MD5"); - tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); - tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); + tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); + tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); - logConf(); + logConf(); } // Inherited Javadoc public Configuration getConf() { return this.conf; } - + public ProtocolOutput 
getProtocolOutput(Text url, CrawlDatum datum) { - + String urlString = url.toString(); try { URL u = new URL(urlString); - + long startTime = System.currentTimeMillis(); Response response = getResponse(u, datum, false); // make a request - - if(this.responseTime) { + + if (this.responseTime) { int elapsedTime = (int) (System.currentTimeMillis() - startTime); datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime)); } - + int code = response.getCode(); byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(), - (content == null ? EMPTY_CONTENT : content), - response.getHeader("Content-Type"), - response.getHeaders(), this.conf); - + (content == null ? EMPTY_CONTENT : content), + response.getHeader("Content-Type"), response.getHeaders(), this.conf); + if (code == 200) { // got a good response return new ProtocolOutput(c); // return it - + } else if (code >= 300 && code < 400) { // handle redirect String location = response.getHeader("Location"); // some broken servers, such as MS IIS, use lowercase header name... 
- if (location == null) location = response.getHeader("location"); - if (location == null) location = ""; + if (location == null) + location = response.getHeader("location"); + if (location == null) + location = ""; u = new URL(u, location); int protocolStatusCode; switch (code) { - case 300: // multiple choices, preferred value in Location - protocolStatusCode = ProtocolStatus.MOVED; - break; - case 301: // moved permanently - case 305: // use proxy (Location is URL of proxy) - protocolStatusCode = ProtocolStatus.MOVED; - break; - case 302: // found (temporarily moved) - case 303: // see other (redirect after POST) - case 307: // temporary redirect - protocolStatusCode = ProtocolStatus.TEMP_MOVED; - break; - case 304: // not modified - protocolStatusCode = ProtocolStatus.NOTMODIFIED; - break; - default: - protocolStatusCode = ProtocolStatus.MOVED; + case 300: // multiple choices, preferred value in Location + protocolStatusCode = ProtocolStatus.MOVED; + break; + case 301: // moved permanently + case 305: // use proxy (Location is URL of proxy) + protocolStatusCode = ProtocolStatus.MOVED; + break; + case 302: // found (temporarily moved) + case 303: // see other (redirect after POST) + case 307: // temporary redirect + protocolStatusCode = ProtocolStatus.TEMP_MOVED; + break; + case 304: // not modified + protocolStatusCode = ProtocolStatus.NOTMODIFIED; + break; + default: + protocolStatusCode = ProtocolStatus.MOVED; } // handle this in the higher layer. return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE - if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); } + if (logger.isTraceEnabled()) { + logger.trace("400 Bad request: " + u); + } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); - } else if (code == 401) { // requires authorization, but no valid auth provided. 
- if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); } - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + } else if (code == 401) { // requires authorization, but no valid auth + // provided. + if (logger.isTraceEnabled()) { + logger.trace("401 Authentication Required"); + } + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString)); } else if (code == 404) { - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u)); + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.NOTFOUND, u)); } else if (code == 410) { // permanently GONE - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u)); + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, + "Http: " + code + " url=" + u)); } else { - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" - + u)); + return new ProtocolOutput(c, new ProtocolStatus( + ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u)); } } catch (Throwable e) { logger.error("Failed to get protocol output", e); return new ProtocolOutput(null, new ProtocolStatus(e)); } } - - /* -------------------------- * - * </implementation:Protocol> * - * -------------------------- */ + /* + * -------------------------- * </implementation:Protocol> * + * -------------------------- + */ public String getProxyHost() { return proxyHost; @@ -274,70 +306,69 @@ public abstract class HttpBase implement public String getUserAgent() { return userAgent; } - - /** Value of "Accept-Language" request header sent by Nutch. + + /** + * Value of "Accept-Language" request header sent by Nutch. + * * @return The value of the header "Accept-Language" header. 
*/ public String getAcceptLanguage() { - return acceptLanguage; + return acceptLanguage; } public String getAccept() { - return accept; + return accept; } public boolean getUseHttp11() { return useHttp11; } - + public Set<String> getTlsPreferredCipherSuites() { return tlsPreferredCipherSuites; } - + public Set<String> getTlsPreferredProtocols() { return tlsPreferredProtocols; } - private static String getAgentString(String agentName, - String agentVersion, - String agentDesc, - String agentURL, - String agentEmail) { - - if ( (agentName == null) || (agentName.trim().length() == 0) ) { + private static String getAgentString(String agentName, String agentVersion, + String agentDesc, String agentURL, String agentEmail) { + + if ((agentName == null) || (agentName.trim().length() == 0)) { // TODO : NUTCH-258 if (LOGGER.isErrorEnabled()) { LOGGER.error("No User-Agent string set (http.agent.name)!"); } } - - StringBuffer buf= new StringBuffer(); - + + StringBuffer buf = new StringBuffer(); + buf.append(agentName); if (agentVersion != null) { buf.append("/"); buf.append(agentVersion); } - if ( ((agentDesc != null) && (agentDesc.length() != 0)) - || ((agentEmail != null) && (agentEmail.length() != 0)) - || ((agentURL != null) && (agentURL.length() != 0)) ) { + if (((agentDesc != null) && (agentDesc.length() != 0)) + || ((agentEmail != null) && (agentEmail.length() != 0)) + || ((agentURL != null) && (agentURL.length() != 0))) { buf.append(" ("); - + if ((agentDesc != null) && (agentDesc.length() != 0)) { buf.append(agentDesc); - if ( (agentURL != null) || (agentEmail != null) ) + if ((agentURL != null) || (agentEmail != null)) buf.append("; "); } - + if ((agentURL != null) && (agentURL.length() != 0)) { buf.append(agentURL); if (agentEmail != null) buf.append("; "); } - + if ((agentEmail != null) && (agentEmail.length() != 0)) buf.append(agentEmail); - + buf.append(")"); } return buf.toString(); @@ -354,52 +385,59 @@ public abstract class HttpBase implement 
logger.info("http.accept = " + accept); } } - - public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException { - if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); } + public byte[] processGzipEncoded(byte[] compressed, URL url) + throws IOException { + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("uncompressing...."); + } // content can be empty (i.e. redirection) in which case // there is nothing to unzip if (compressed.length == 0) return compressed; - + byte[] content; if (getMaxContent() >= 0) { - content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); + content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); } else { - content = GZIPUtils.unzipBestEffort(compressed); - } + content = GZIPUtils.unzipBestEffort(compressed); + } if (content == null) throw new IOException("unzipBestEffort returned null"); if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " - + content.length + " bytes) from " + url); + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); } return content; } - public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException { + public byte[] processDeflateEncoded(byte[] compressed, URL url) + throws IOException { // content can be empty (i.e. 
redirection) in which case // there is nothing to deflate if (compressed.length == 0) return compressed; - - if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); } - byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("inflating...."); + } + + byte[] content = DeflateUtils + .inflateBestEffort(compressed, getMaxContent()); if (content == null) throw new IOException("inflateBestEffort returned null"); if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " - + content.length + " bytes) from " + url); + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); } return content; } @@ -407,14 +445,14 @@ public abstract class HttpBase implement protected static void main(HttpBase http, String[] args) throws Exception { boolean verbose = false; String url = null; - + String usage = "Usage: Http [-verbose] [-timeout N] url"; - + if (args.length == 0) { System.err.println(usage); System.exit(-1); } - + for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-timeout")) { // found -timeout option http.timeout = Integer.parseInt(args[++i]) * 1000; @@ -423,35 +461,34 @@ public abstract class HttpBase implement } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); - } else // root is required parameter + } else + // root is required parameter url = args[i]; } - -// if (verbose) { -// LOGGER.setLevel(Level.FINE); -// } - - ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum()); + + // if (verbose) { + // LOGGER.setLevel(Level.FINE); + // } + + ProtocolOutput out = http + .getProtocolOutput(new Text(url), new CrawlDatum()); Content content = out.getContent(); - + System.out.println("Status: " + out.getStatus()); if (content != null) { System.out.println("Content Type: " + content.getContentType()); - 
System.out.println("Content Length: " + - content.getMetadata().get(Response.CONTENT_LENGTH)); + System.out.println("Content Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); System.out.println("Content:"); String text = new String(content.getContent()); System.out.println(text); - } + } } - - protected abstract Response getResponse(URL url, - CrawlDatum datum, - boolean followRedirects) - throws ProtocolException, IOException; + + protected abstract Response getResponse(URL url, CrawlDatum datum, + boolean followRedirects) throws ProtocolException, IOException; public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { return robots.getRobotRulesSet(this, url); } } -
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java Thu Jan 29 05:38:59 2015 @@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a // Nutch imports import org.apache.nutch.protocol.ProtocolException; - public class HttpException extends ProtocolException { public HttpException() { Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Thu Jan 29 05:38:59 2015 @@ -32,36 +32,41 @@ import crawlercommons.robots.BaseRobotRu import crawlercommons.robots.SimpleRobotRules; /** - * This class is used for parsing robots for urls belonging to HTTP protocol. - * It extends the generic {@link RobotRulesParser} class and contains - * Http protocol specific implementation for obtaining the robots file. + * This class is used for parsing robots for urls belonging to HTTP protocol. It + * extends the generic {@link RobotRulesParser} class and contains Http protocol + * specific implementation for obtaining the robots file. 
*/ public class HttpRobotRulesParser extends RobotRulesParser { - - public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class); + + public static final Logger LOG = LoggerFactory + .getLogger(HttpRobotRulesParser.class); protected boolean allowForbidden = false; - HttpRobotRulesParser() { } + HttpRobotRulesParser() { + } public HttpRobotRulesParser(Configuration conf) { - setConf(conf); + setConf(conf); } - + public void setConf(Configuration conf) { - super.setConf(conf); - allowForbidden = conf.getBoolean("http.robots.403.allow", true); + super.setConf(conf); + allowForbidden = conf.getBoolean("http.robots.403.allow", true); } /** Compose unique key to store and access robot rules in cache for given URL */ protected static String getCacheKey(URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower case - String host = url.getHost().toLowerCase(); // normalize to lower case + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case int port = url.getPort(); if (port == -1) { port = url.getDefaultPort(); } - /* Robot rules apply only to host, protocol, and port where robots.txt is - * hosted (cf. NUTCH-1752). Consequently */ + /* + * Robot rules apply only to host, protocol, and port where robots.txt is + * hosted (cf. NUTCH-1752). 
Consequently + */ String cacheKey = protocol + ":" + host + ":" + port; return cacheKey; } @@ -77,7 +82,7 @@ public class HttpRobotRulesParser extend * The {@link Protocol} object * @param url * URL robots.txt applies to - * + * * @return {@link BaseRobotRules} holding the rules from robots.txt */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { @@ -86,13 +91,15 @@ public class HttpRobotRulesParser extend BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey); boolean cacheRule = true; - - if (robotRules == null) { // cache miss + + if (robotRules == null) { // cache miss URL redir = null; - if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); } + if (LOG.isTraceEnabled()) { + LOG.trace("cache miss " + url); + } try { - Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"), - new CrawlDatum(), true); + Response response = ((HttpBase) http).getResponse(new URL(url, + "/robots.txt"), new CrawlDatum(), true); // try one level of redirection ? 
if (response.getCode() == 301 || response.getCode() == 302) { String redirection = response.getHeader("Location"); @@ -107,23 +114,23 @@ public class HttpRobotRulesParser extend } else { redir = new URL(redirection); } - - response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true); + + response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), + true); } } - if (response.getCode() == 200) // found rules: parse them - robotRules = parseRules(url.toString(), response.getContent(), - response.getHeader("Content-Type"), - agentNames); + if (response.getCode() == 200) // found rules: parse them + robotRules = parseRules(url.toString(), response.getContent(), + response.getHeader("Content-Type"), agentNames); - else if ( (response.getCode() == 403) && (!allowForbidden) ) - robotRules = FORBID_ALL_RULES; // use forbid all + else if ((response.getCode() == 403) && (!allowForbidden)) + robotRules = FORBID_ALL_RULES; // use forbid all else if (response.getCode() >= 500) { cacheRule = false; robotRules = EMPTY_RULES; - }else - robotRules = EMPTY_RULES; // use default rules + } else + robotRules = EMPTY_RULES; // use default rules } catch (Throwable t) { if (LOG.isInfoEnabled()) { LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); @@ -133,7 +140,7 @@ public class HttpRobotRulesParser extend } if (cacheRule) { - CACHE.put(cacheKey, robotRules); // cache rules for host + CACHE.put(cacheKey, robotRules); // cache rules for host if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { // cache also for the redirected host CACHE.put(getCacheKey(redir), robotRules); Modified: nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== 
--- nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original) +++ nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu Jan 29 05:38:59 2015 @@ -23,10 +23,10 @@ import org.junit.Test; import crawlercommons.robots.BaseRobotRules; /** - * JUnit test case which tests - * 1. that robots filtering is performed correctly as per the agent name - * 2. that crawl delay is extracted correctly from the robots file - * + * JUnit test case which tests 1. that robots filtering is performed correctly + * as per the agent name 2. that crawl delay is extracted correctly from the + * robots file + * */ public class TestRobotRulesParser { @@ -36,39 +36,32 @@ public class TestRobotRulesParser { private static final String UNKNOWN_AGENT = "AgentABC"; private static final String CR = "\r"; - private static final String ROBOTS_STRING = - "User-Agent: Agent1 #foo" + CR - + "Disallow: /a" + CR - + "Disallow: /b/a" + CR - + "#Disallow: /c" + CR - + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec - + "" + CR - + "" + CR - + "User-Agent: Agent2" + CR - + "Disallow: /a/bloh" + CR - + "Disallow: /c" + CR - + "Disallow: /foo" + CR - + "Crawl-delay: 20" + CR - + "" + CR - + "User-Agent: *" + CR - + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents + private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR + + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + + CR + + "Crawl-delay: 10" + + CR // set crawl delay for Agent1 as 10 sec + + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh" + + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20" + + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no + // crawl + // delay + // for + // other + // agents private static final String[] TEST_PATHS = new String[] { - "http://example.com/a", - "http://example.com/a/bloh/foo.html", - 
"http://example.com/b", - "http://example.com/c", - "http://example.com/b/a/index.html", - "http://example.com/foo/bar/baz.html" - }; - - private static final boolean[] RESULTS = new boolean[] { - false, // /a - false, // /a/bloh/foo.html - true, // /b - true, // /c - false, // /b/a/index.html - true // /foo/bar/baz.html + "http://example.com/a", "http://example.com/a/bloh/foo.html", + "http://example.com/b", "http://example.com/c", + "http://example.com/b/a/index.html", + "http://example.com/foo/bar/baz.html" }; + + private static final boolean[] RESULTS = new boolean[] { false, // /a + false, // /a/bloh/foo.html + true, // /b + true, // /c + false, // /b/a/index.html + true // /foo/bar/baz.html }; private HttpRobotRulesParser parser; @@ -79,41 +72,52 @@ public class TestRobotRulesParser { } /** - * Test that the robots rules are interpreted correctly by the robots rules parser. + * Test that the robots rules are interpreted correctly by the robots rules + * parser. */ @Test public void testRobotsAgent() { - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); - for(int counter = 0; counter < TEST_PATHS.length; counter++) { - Assert.assertTrue("testing on agent (" + SINGLE_AGENT + "), and " - + "path " + TEST_PATHS[counter] - + " got " + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + Assert.assertTrue( + "testing on agent (" + SINGLE_AGENT + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); } - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS); + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, 
MULTIPLE_AGENTS); - for(int counter = 0; counter < TEST_PATHS.length; counter++) { - Assert.assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " - + "path " + TEST_PATHS[counter] - + " got " + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + Assert.assertTrue( + "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); } } /** - * Test that the crawl delay is extracted from the robots file for respective agent. - * If its not specified for a given agent, default value must be returned. + * Test that the crawl delay is extracted from the robots file for respective + * agent. If its not specified for a given agent, default value must be + * returned. */ @Test public void testCrawlDelay() { - // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser - rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); - Assert.assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000)); + // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be + // returned by the parser + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); + Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", + (rules.getCrawlDelay() == 10000)); // for UNKNOWN_AGENT, the default crawl delay must be returned. 
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT); - Assert.assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE)); + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, UNKNOWN_AGENT); + Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", + (rules.getCrawlDelay() == Long.MIN_VALUE)); } } Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Thu Jan 29 05:38:59 2015 @@ -16,11 +16,9 @@ */ package org.apache.nutch.urlfilter.api; - - /** * A generic regular expression rule. - * + * * @author Jérôme Charron */ public abstract class RegexRule { @@ -29,13 +27,15 @@ public abstract class RegexRule { /** * Constructs a new regular expression rule. - * - * @param sign specifies if this rule must filter-in or filter-out. - * A <code>true</code> value means that any url matching this rule - * must be accepted, a <code>false</code> value means that any url - * matching this rule must be rejected. - * @param regex is the regular expression used for matching (see - * {@link #match(String)} method). + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * <code>true</code> value means that any url matching this rule must + * be accepted, a <code>false</code> value means that any url + * matching this rule must be rejected. 
+ * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). */ protected RegexRule(boolean sign, String regex) { this.sign = sign; @@ -43,19 +43,22 @@ public abstract class RegexRule { /** * Return if this rule is used for filtering-in or out. - * + * * @return <code>true</code> if any url matching this rule must be accepted, * otherwise <code>false</code>. */ - protected boolean accept() { return sign; } - + protected boolean accept() { + return sign; + } + /** * Checks if a url matches this rule. - * @param url is the url to check. - * @return <code>true</code> if the specified url matches this rule, - * otherwise <code>false</code>. + * + * @param url + * is the url to check. + * @return <code>true</code> if the specified url matches this rule, otherwise + * <code>false</code>. */ protected abstract boolean match(String url); } - Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Thu Jan 29 05:38:59 2015 @@ -37,28 +37,32 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.net.*; - /** - * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on - * regular expressions. - * - * <p>The regular expressions rules are expressed in a file. 
The file of rules - * is determined for each implementation using the - * {@link #getRulesReader(Configuration conf)} method.</p> + * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular + * expressions. + * + * <p> + * The regular expressions rules are expressed in a file. The file of rules is + * determined for each implementation using the + * {@link #getRulesReader(Configuration conf)} method. + * </p> * - * <p>The format of this file is made of many rules (one per line):<br/> + * <p> + * The format of this file is made of many rules (one per line):<br/> * <code> * [+-]<regex> * </code><br/> - * where plus (<code>+</code>)means go ahead and index it and minus - * (<code>-</code>)means no.</p> - * + * where plus (<code>+</code>)means go ahead and index it and minus ( + * <code>-</code>)means no. + * </p> + * * @author Jérôme Charron */ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ - private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class); + private final static Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBase.class); /** An array of applicable rules */ private List<RegexRule> rules; @@ -66,24 +70,28 @@ public abstract class RegexURLFilterBase /** The current configuration */ private Configuration conf; - /** * Constructs a new empty RegexURLFilterBase */ - public RegexURLFilterBase() { } + public RegexURLFilterBase() { + } /** * Constructs a new RegexURLFilter and init it with a file of rules. - * @param filename is the name of rules file. + * + * @param filename + * is the name of rules file. */ - public RegexURLFilterBase(File filename) - throws IOException, IllegalArgumentException { + public RegexURLFilterBase(File filename) throws IOException, + IllegalArgumentException { this(new FileReader(filename)); } - + /** * Constructs a new RegexURLFilter and inits it with a list of rules. 
- * @param rules string with a list of rules, one rule per line + * + * @param rules + * string with a list of rules, one rule per line * @throws IOException * @throws IllegalArgumentException */ @@ -94,68 +102,82 @@ public abstract class RegexURLFilterBase /** * Constructs a new RegexURLFilter and init it with a Reader of rules. - * @param reader is a reader of rules. + * + * @param reader + * is a reader of rules. */ - protected RegexURLFilterBase(Reader reader) - throws IOException, IllegalArgumentException { + protected RegexURLFilterBase(Reader reader) throws IOException, + IllegalArgumentException { rules = readRules(reader); } - + /** * Creates a new {@link RegexRule}. - * @param sign of the regular expression. - * A <code>true</code> value means that any URL matching this rule - * must be included, whereas a <code>false</code> - * value means that any URL matching this rule must be excluded. - * @param regex is the regular expression associated to this rule. + * + * @param sign + * of the regular expression. A <code>true</code> value means that + * any URL matching this rule must be included, whereas a + * <code>false</code> value means that any URL matching this rule + * must be excluded. + * @param regex + * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); - + /** - * Returns the name of the file of rules to use for - * a particular implementation. - * @param conf is the current configuration. + * Returns the name of the file of rules to use for a particular + * implementation. + * + * @param conf + * is the current configuration. * @return the name of the resource containing the rules to use. 
*/ - protected abstract Reader getRulesReader(Configuration conf) throws IOException; - - - /* -------------------------- * - * <implementation:URLFilter> * - * -------------------------- */ - + protected abstract Reader getRulesReader(Configuration conf) + throws IOException; + + /* + * -------------------------- * <implementation:URLFilter> * + * -------------------------- + */ + // Inherited Javadoc public String filter(String url) { for (RegexRule rule : rules) { if (rule.match(url)) { return rule.accept() ? url : null; } - }; + } + ; return null; } - /* --------------------------- * - * </implementation:URLFilter> * - * --------------------------- */ - - - /* ----------------------------- * - * <implementation:Configurable> * - * ----------------------------- */ - + /* + * --------------------------- * </implementation:URLFilter> * + * --------------------------- + */ + + /* + * ----------------------------- * <implementation:Configurable> * + * ----------------------------- + */ + public void setConf(Configuration conf) { this.conf = conf; Reader reader = null; try { reader = getRulesReader(conf); } catch (Exception e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } - throw new RuntimeException(e.getMessage(), e); + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + throw new RuntimeException(e.getMessage(), e); } try { rules = readRules(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } throw new RuntimeException(e.getMessage(), e); } } @@ -163,45 +185,51 @@ public abstract class RegexURLFilterBase public Configuration getConf() { return this.conf; } - - /* ------------------------------ * - * </implementation:Configurable> * - * ------------------------------ */ - + + /* + * ------------------------------ * </implementation:Configurable> * + * ------------------------------ + */ /** * Read the specified file of rules. 
- * @param reader is a reader of regular expressions rules. + * + * @param reader + * is a reader of regular expressions rules. * @return the corresponding {@RegexRule rules}. */ - private List<RegexRule> readRules(Reader reader) - throws IOException, IllegalArgumentException { + private List<RegexRule> readRules(Reader reader) throws IOException, + IllegalArgumentException { BufferedReader in = new BufferedReader(reader); List<RegexRule> rules = new ArrayList<RegexRule>(); String line; - - while((line=in.readLine())!=null) { + + while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; } - char first=line.charAt(0); - boolean sign=false; + char first = line.charAt(0); + boolean sign = false; switch (first) { - case '+' : - sign=true; + case '+': + sign = true; break; - case '-' : - sign=false; + case '-': + sign = false; break; - case ' ' : case '\n' : case '#' : // skip blank & comment lines + case ' ': + case '\n': + case '#': // skip blank & comment lines continue; - default : - throw new IOException("Invalid first character: "+line); + default: + throw new IOException("Invalid first character: " + line); } String regex = line.substring(1); - if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); } + if (LOG.isTraceEnabled()) { + LOG.trace("Adding rule [" + regex + "]"); + } RegexRule rule = createRule(sign, regex); rules.add(rule); } @@ -210,18 +238,20 @@ public abstract class RegexURLFilterBase /** * Filter the standard input using a RegexURLFilterBase. - * @param filter is the RegexURLFilterBase to use for filtering the - * standard input. - * @param args some optional parameters (not used). + * + * @param filter + * is the RegexURLFilterBase to use for filtering the standard input. + * @param args + * some optional parameters (not used). 
*/ public static void main(RegexURLFilterBase filter, String args[]) - throws IOException, IllegalArgumentException { + throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { + while ((line = in.readLine()) != null) { String out = filter.filter(line); - if (out!=null) { + if (out != null) { System.out.print("+"); System.out.println(out); } else { Modified: nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Thu Jan 29 05:38:59 2015 @@ -31,26 +31,25 @@ import org.slf4j.LoggerFactory; // Nutch imports import org.apache.nutch.net.URLFilter; - /** * JUnit based test of class <code>RegexURLFilterBase</code>. 
- * + * * @author Jérôme Charron */ public abstract class RegexURLFilterBaseTest { /** My logger */ - protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class); + protected static final Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBaseTest.class); - private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SEPARATOR = System.getProperty("file.separator"); private final static String SAMPLES = System.getProperty("test.data", "."); protected abstract URLFilter getURLFilter(Reader rules); protected void bench(int loops, String file) { try { - bench(loops, - new FileReader(SAMPLES + SEPARATOR + file + ".rules"), + bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), new FileReader(SAMPLES + SEPARATOR + file + ".urls")); } catch (Exception e) { Assert.fail(e.toString()); @@ -62,14 +61,14 @@ public abstract class RegexURLFilterBase try { URLFilter filter = getURLFilter(rules); FilteredURL[] expected = readURLFile(urls); - for (int i=0; i<loops; i++) { + for (int i = 0; i < loops; i++) { test(filter, expected); } } catch (Exception e) { Assert.fail(e.toString()); } - LOG.info("bench time (" + loops + ") " + - (System.currentTimeMillis()-start) + "ms"); + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); } protected void test(String file) { @@ -90,7 +89,7 @@ public abstract class RegexURLFilterBase } protected void test(URLFilter filter, FilteredURL[] expected) { - for (int i=0; i<expected.length; i++) { + for (int i = 0; i < expected.length; i++) { String result = filter.filter(expected[i].url); if (result != null) { Assert.assertTrue(expected[i].url, expected[i].sign); @@ -104,7 +103,7 @@ public abstract class RegexURLFilterBase BufferedReader in = new BufferedReader(reader); List<FilteredURL> list = new ArrayList<FilteredURL>(); String line; - while((line=in.readLine()) != null) { + while ((line = in.readLine()) != null) { if 
(line.length() != 0) { list.add(new FilteredURL(line)); } @@ -119,13 +118,13 @@ public abstract class RegexURLFilterBase FilteredURL(String line) { switch (line.charAt(0)) { - case '+' : + case '+': sign = true; break; - case '-' : + case '-': sign = false; break; - default : + default: // Simply ignore... } url = line.substring(1); Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original) +++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -16,7 +16,6 @@ */ package org.apache.nutch.microformats.reltag; - // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; @@ -29,29 +28,27 @@ import org.apache.nutch.parse.Parse; // Hadoop imports import org.apache.hadoop.conf.Configuration; - /** - * An {@link org.apache.nutch.indexer.IndexingFilter} that - * add <code>tag</code> field(s) to the document. - * + * An {@link org.apache.nutch.indexer.IndexingFilter} that add <code>tag</code> + * field(s) to the document. 
+ * * @see <a href="http://www.microformats.org/wiki/rel-tag"> * http://www.microformats.org/wiki/rel-tag</a> * @author Jérôme Charron */ public class RelTagIndexingFilter implements IndexingFilter { - private Configuration conf; - // Inherited JavaDoc - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) - throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { // Check if some Rel-Tags found, possibly put there by RelTagParser - String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG); + String[] tags = parse.getData().getParseMeta() + .getValues(RelTagParser.REL_TAG); if (tags != null) { - for (int i=0; i<tags.length; i++) { + for (int i = 0; i < tags.length; i++) { doc.add("tag", tags[i]); } } @@ -59,10 +56,11 @@ public class RelTagIndexingFilter implem return doc; } - /* ----------------------------- * - * <implementation:Configurable> * - * ----------------------------- */ - + /* + * ----------------------------- * <implementation:Configurable> * + * ----------------------------- + */ + public void setConf(Configuration conf) { this.conf = conf; } @@ -70,9 +68,10 @@ public class RelTagIndexingFilter implem public Configuration getConf() { return this.conf; } - - /* ------------------------------ * - * </implementation:Configurable> * - * ------------------------------ */ - + + /* + * ------------------------------ * </implementation:Configurable> * + * ------------------------------ + */ + } Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original) +++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Thu Jan 29 05:38:59 2015 @@ -45,24 +45,24 @@ import org.apache.hadoop.conf.Configurat /** * Adds microformat rel-tags of document if found. - * + * * @see <a href="http://www.microformats.org/wiki/rel-tag"> * http://www.microformats.org/wiki/rel-tag</a> */ public class RelTagParser implements HtmlParseFilter { - + public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class); public final static String REL_TAG = "Rel-Tag"; - + private Configuration conf = null; - + /** * Scan the HTML document looking at possible rel-tags */ public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - + HTMLMetaTags metaTags, DocumentFragment doc) { + // get parse obj Parse parse = parseResult.get(content.getUrl()); // Trying to find the document's rel-tags @@ -79,16 +79,16 @@ public class RelTagParser implements Htm private static class Parser { Set<String> tags = null; - + Parser(Node node) { tags = new TreeSet<String>(); parse(node); } - + Set<String> getRelTags() { return tags; } - + void parse(Node node) { if (node.getNodeType() == Node.ELEMENT_NODE) { @@ -105,7 +105,7 @@ public class RelTagParser implements Htm if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { String tag = parseTag(hrefNode.getNodeValue()); if (!StringUtil.isEmpty(tag)) { - if(!tags.contains(tag)){ + if (!tags.contains(tag)) { tags.add(tag); LOG.debug("Adding tag: " + tag + " to tag set."); } @@ -115,26 +115,27 @@ public class RelTagParser implements Htm } } } - + // Recurse NodeList children = node.getChildNodes(); - for (int i=0; children != null && i<children.getLength(); i++) + for (int i = 0; children != null && i < children.getLength(); i++) parse(children.item(i)); } - + private final static String 
parseTag(String url) { String tag = null; try { URL u = new URL(url); String path = u.getPath(); - tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8"); + tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), + "UTF-8"); } catch (Exception e) { // Malformed tag... tag = null; } return tag; } - + } public void setConf(Configuration conf) { Modified: nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original) +++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Jan 29 05:38:59 2015 @@ -50,18 +50,21 @@ import java.nio.charset.Charset; public class ExtParser implements Parser { - public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.ext"); + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.ext"); static final int BUFFER_SIZE = 4096; static final int TIMEOUT_DEFAULT = 30; // in seconds - // handy map from String contentType to String[] {command, timeoutString, encoding} + // handy map from String contentType to String[] {command, timeoutString, + // encoding} Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>(); - private Configuration conf; + private Configuration conf; - public ExtParser () { } + public ExtParser() { + } public ParseResult getParse(Content content) { @@ -70,14 +73,15 @@ public class ExtParser implements Parser String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType); if (params == null) return new ParseStatus(ParseStatus.FAILED, - "No external command defined for contentType: " + 
contentType).getEmptyParseResult(content.getUrl(), getConf()); + "No external command defined for contentType: " + contentType) + .getEmptyParseResult(content.getUrl(), getConf()); String command = params[0]; int timeout = Integer.parseInt(params[1]); String encoding = params[2]; if (LOG.isTraceEnabled()) { - LOG.trace("Use "+command+ " with timeout="+timeout+"secs"); + LOG.trace("Use " + command + " with timeout=" + timeout + "secs"); } String text = null; @@ -89,19 +93,19 @@ public class ExtParser implements Parser String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); if (contentLength != null - && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, - "Content truncated at " + raw.length - +" bytes. Parser can't handle incomplete " - + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf()); + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + + " bytes. 
Parser can't handle incomplete " + contentType + + " file.").getEmptyParseResult(content.getUrl(), getConf()); } ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); - ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4); + ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4); CommandRunner cr = new CommandRunner(); - cr.setCommand(command+ " " +contentType); + cr.setCommand(command + " " + contentType); cr.setInputStream(new ByteArrayInputStream(raw)); cr.setStdOutputStream(os); cr.setStdErrorStream(es); @@ -111,14 +115,15 @@ public class ExtParser implements Parser cr.evaluate(); if (cr.getExitValue() != 0) - return new ParseStatus(ParseStatus.FAILED, - "External command " + command - + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf()); + return new ParseStatus(ParseStatus.FAILED, "External command " + + command + " failed with error: " + es.toString()) + .getEmptyParseResult(content.getUrl(), getConf()); text = os.toString(encoding); } catch (Exception e) { // run time exception - return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); } if (text == null) @@ -131,15 +136,15 @@ public class ExtParser implements Parser Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, content.getMetadata()); - return ParseResult.createParseResult(content.getUrl(), - new ParseImpl(text, parseData)); + outlinks, content.getMetadata()); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, + parseData)); } - + public void setConf(Configuration conf) { this.conf = conf; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - "org.apache.nutch.parse.Parser").getExtensions(); + Extension[] extensions = PluginRepository.get(conf) + 
.getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions(); String contentType, command, timeoutString, encoding; @@ -161,13 +166,14 @@ public class ExtParser implements Parser // null encoding means default encoding = extension.getAttribute("encoding"); if (encoding == null) - encoding = Charset.defaultCharset().name(); + encoding = Charset.defaultCharset().name(); timeoutString = extension.getAttribute("timeout"); if (timeoutString == null || timeoutString.equals("")) timeoutString = "" + TIMEOUT_DEFAULT; - TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding }); + TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, + encoding }); } } Modified: nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java (original) +++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Parse wrapper to run external command to do the parsing. 
*/ package org.apache.nutch.parse.ext; + Modified: nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original) +++ nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Thu Jan 29 05:38:59 2015 @@ -37,15 +37,14 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -/** - * Unit tests for ExtParser. - * First creates a temp file with fixed content, then fetch - * and parse it using external command 'cat' and 'md5sum' alternately - * for 10 times. Doing so also does a light stress test for class - * CommandRunner.java (as used in ExtParser.java). - * +/** + * Unit tests for ExtParser. First creates a temp file with fixed content, then + * fetch and parse it using external command 'cat' and 'md5sum' alternately for + * 10 times. Doing so also does a light stress test for class CommandRunner.java + * (as used in ExtParser.java). + * * Warning: currently only do test on linux platform. 
- * + * * @author John Xing */ public class TestExtParser { @@ -67,10 +66,11 @@ public class TestExtParser { File tempDir = new File(path); if (!tempDir.exists()) tempDir.mkdir(); - tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir); + tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", + tempDir); } else { // otherwise in java.io.tmpdir - tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt"); + tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt"); } urlString = tempFile.toURI().toURL().toString(); @@ -79,8 +79,10 @@ public class TestExtParser { fos.close(); // get nutch content - Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + Protocol protocol = new ProtocolFactory(NutchConfiguration.create()) + .getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) + .getContent(); protocol = null; } @@ -90,8 +92,8 @@ public class TestExtParser { content = null; // clean temp file - //if (tempFile != null && tempFile.exists()) - // tempFile.delete(); + // if (tempFile != null && tempFile.exists()) + // tempFile.delete(); } @Test @@ -100,24 +102,27 @@ public class TestExtParser { // now test only on linux platform if (!System.getProperty("os.name").equalsIgnoreCase("linux")) { - System.err.println("Current OS is "+System.getProperty("os.name")+"."); + System.err + .println("Current OS is " + System.getProperty("os.name") + "."); System.err.println("No test is run on OS other than linux."); return; } Configuration conf = NutchConfiguration.create(); // loop alternately, total 10*2 times of invoking external command - for (int i=0; i<10; i++) { + for (int i = 0; i < 10; i++) { // check external parser that does 'cat' contentType = "application/vnd.nutch.example.cat"; content.setContentType(contentType); - parse = new 
ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); - Assert.assertEquals(expectedText,parse.getText()); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get( + content.getUrl()); + Assert.assertEquals(expectedText, parse.getText()); // check external parser that does 'md5sum' contentType = "application/vnd.nutch.example.md5sum"; content.setContentType(contentType); - parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get( + content.getUrl()); Assert.assertTrue(parse.getText().startsWith(expectedMD5sum)); } }
