Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Jan 29 05:38:59 2015 @@ -45,23 +45,23 @@ import org.apache.nutch.protocol.http.ap /** An HTTP response. */ public class HttpResponse implements Response { - + private Configuration conf; - private HttpBase http; + private HttpBase http; private URL url; private String orig; private String base; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); - + protected enum Scheme { - HTTP, - HTTPS, + HTTP, HTTPS, } /** * Default public constructor. + * * @param http * @param url * @param datum @@ -69,15 +69,15 @@ public class HttpResponse implements Res * @throws IOException */ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) - throws ProtocolException, IOException { + throws ProtocolException, IOException { this.http = http; this.url = url; this.orig = url.toString(); this.base = url.toString(); - + Scheme scheme = null; - + if ("http".equals(url.getProtocol())) { scheme = Scheme.HTTP; } else if ("https".equals(url.getProtocol())) { @@ -105,44 +105,49 @@ public class HttpResponse implements Res } else { port = 443; } - portString= ""; + portString = ""; } else { - port= url.getPort(); - portString= ":" + port; + port = url.getPort(); + portString = ":" + port; } Socket socket = null; try { - socket = new Socket(); // create the socket + socket = new Socket(); // create the socket socket.setSoTimeout(http.getTimeout()); - // connect String sockHost = http.useProxy() ? http.getProxyHost() : host; int sockPort = http.useProxy() ? http.getProxyPort() : port; - InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort); + InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); - + if (scheme == Scheme.HTTPS) { - SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault(); - SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true); + SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory + .getDefault(); + SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, + sockHost, sockPort, true); sslsocket.setUseClientMode(true); - - // Get the protocols and ciphers supported by this JVM - Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols())); - Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites())); - + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket + .getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket + .getSupportedCipherSuites())); + // Intersect with preferred protocols and ciphers protocols.retainAll(http.getTlsPreferredProtocols()); ciphers.retainAll(http.getTlsPreferredCipherSuites()); - - sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()])); - sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()])); - + + sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols + .size()])); + sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers + .size()])); + sslsocket.startHandshake(); socket = sslsocket; } - + this.conf = http.getConf(); if (sockAddr != null && conf.getBoolean("store.ip.address", false) == true) { @@ -154,9 +159,9 @@ public class HttpResponse implements Res StringBuffer reqStr = new StringBuffer("GET "); if (http.useProxy()) { - reqStr.append(url.getProtocol()+"://"+host+portString+path); + reqStr.append(url.getProtocol() + "://" + host + portString + path); } else { - reqStr.append(path); + reqStr.append(path); } reqStr.append(" HTTP/1.0\r\n"); @@ -170,7 +175,9 @@ public class HttpResponse implements Res String userAgent = http.getUserAgent(); if ((userAgent == null) || (userAgent.length() == 0)) { - if (Http.LOG.isErrorEnabled()) { Http.LOG.error("User-agent is not set!"); } + if (Http.LOG.isErrorEnabled()) { + Http.LOG.error("User-agent is not set!"); + } } else { reqStr.append("User-Agent: "); reqStr.append(userAgent); @@ -186,30 +193,30 @@ public class HttpResponse implements Res reqStr.append("\r\n"); if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); + reqStr.append("If-Modified-Since: " + + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append("\r\n"); } reqStr.append("\r\n"); - - byte[] reqBytes= reqStr.toString().getBytes(); + + byte[] reqBytes = reqStr.toString().getBytes(); req.write(reqBytes); req.flush(); - - PushbackInputStream in = // process response - new PushbackInputStream( - new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), - Http.BUFFER_SIZE) ; + + PushbackInputStream in = // process response + new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), + Http.BUFFER_SIZE), Http.BUFFER_SIZE); StringBuffer line = new StringBuffer(); - boolean haveSeenNonContinueStatus= false; + boolean haveSeenNonContinueStatus = false; while (!haveSeenNonContinueStatus) { // parse status code line - this.code = parseStatusLine(in, line); + this.code = parseStatusLine(in, line); // parse headers parseHeaders(in, line); - haveSeenNonContinueStatus= code != 100; // 100 is "Continue" + haveSeenNonContinueStatus = code != 100; // 100 is "Continue" } String transferEncoding = getHeader(Response.TRANSFER_ENCODING); if (transferEncoding != null @@ -223,7 +230,7 @@ public class HttpResponse implements Res if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { content = http.processGzipEncoded(content, url); } else if ("deflate".equals(contentEncoding)) { - content = http.processDeflateEncoded(content, url); + content = http.processDeflateEncoded(content, url); } else { if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetched " + content.length + " bytes from " + url); @@ -237,15 +244,15 @@ public class HttpResponse implements Res } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + public URL getUrl() { return url; } - + public int getCode() { return code; } @@ -253,7 +260,7 @@ public class HttpResponse implements Res public String getHeader(String name) { return headers.get(name); } - + public Metadata getHeaders() { return headers; } @@ -262,39 +269,40 @@ public class HttpResponse implements Res return content; } - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ - private void readPlainContent(InputStream in) - throws HttpException, IOException { + private void readPlainContent(InputStream in) throws HttpException, + IOException { - int contentLength = Integer.MAX_VALUE; // get content length + int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { - if (!contentLengthString.isEmpty()) + if (!contentLengthString.isEmpty()) contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { - throw new HttpException("bad content length: "+contentLengthString); + throw new HttpException("bad content length: " + contentLengthString); } } - if (http.getMaxContent() >= 0 - && contentLength > http.getMaxContent()) // limit download size - contentLength = http.getMaxContent(); + if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit + // download + // size + contentLength = http.getMaxContent(); ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; - + // do not try to read if the contentLength is 0 - if (contentLength == 0){ + if (contentLength == 0) { content = new byte[0]; return; } - + // read content int i = in.read(bytes); while (i != -1) { @@ -322,11 +330,10 @@ public class HttpResponse implements Res * @throws IOException */ @SuppressWarnings("unused") - private void readChunkedContent(PushbackInputStream in, - StringBuffer line) - throws HttpException, IOException { - boolean doneChunks= false; - int contentBytesRead= 0; + private void readChunkedContent(PushbackInputStream in, StringBuffer line) + throws HttpException, IOException { + boolean doneChunks = false; + int contentBytesRead = 0; byte[] bytes = new byte[Http.BUFFER_SIZE]; ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); @@ -338,51 +345,54 @@ public class HttpResponse implements Res readLine(in, line, false); String chunkLenStr; - // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); } + // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); + // } - int pos= line.indexOf(";"); + int pos = line.indexOf(";"); if (pos < 0) { - chunkLenStr= line.toString(); + chunkLenStr = line.toString(); } else { - chunkLenStr= line.substring(0, pos); - // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + line.substring(pos+1)); } + chunkLenStr = line.substring(0, pos); + // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + + // line.substring(pos+1)); } } - chunkLenStr= chunkLenStr.trim(); + chunkLenStr = chunkLenStr.trim(); int chunkLen; try { - chunkLen= Integer.parseInt(chunkLenStr, 16); - } catch (NumberFormatException e){ - throw new HttpException("bad chunk length: "+line.toString()); + chunkLen = Integer.parseInt(chunkLenStr, 16); + } catch (NumberFormatException e) { + throw new HttpException("bad chunk length: " + line.toString()); } if (chunkLen == 0) { - doneChunks= true; + doneChunks = true; break; } - if ( http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http.getMaxContent() ) - chunkLen= http.getMaxContent() - contentBytesRead; + if (http.getMaxContent() >= 0 + && (contentBytesRead + chunkLen) > http.getMaxContent()) + chunkLen = http.getMaxContent() - contentBytesRead; // read one chunk - int chunkBytesRead= 0; + int chunkBytesRead = 0; while (chunkBytesRead < chunkLen) { - int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? - (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE; - int len= in.read(bytes, 0, toRead); + int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? (chunkLen - chunkBytesRead) + : Http.BUFFER_SIZE; + int len = in.read(bytes, 0, toRead); - if (len == -1) + if (len == -1) throw new HttpException("chunk eof after " + contentBytesRead - + " bytes in successful chunks" - + " and " + chunkBytesRead - + " in current chunk"); + + " bytes in successful chunks" + " and " + chunkBytesRead + + " in current chunk"); // DANGER!!! Will printed GZIPed stuff right to your // terminal! - // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, len)); } + // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, + // len)); } out.write(bytes, 0, len); - chunkBytesRead+= len; + chunkBytesRead += len; } readLine(in, line, false); @@ -390,7 +400,7 @@ public class HttpResponse implements Res } if (!doneChunks) { - if (contentBytesRead != http.getMaxContent()) + if (contentBytesRead != http.getMaxContent()) throw new HttpException("chunk eof: !doneChunk && didn't max out"); return; } @@ -401,36 +411,35 @@ public class HttpResponse implements Res } private int parseStatusLine(PushbackInputStream in, StringBuffer line) - throws IOException, HttpException { + throws IOException, HttpException { readLine(in, line, false); int codeStart = line.indexOf(" "); - int codeEnd = line.indexOf(" ", codeStart+1); + int codeEnd = line.indexOf(" ", codeStart + 1); // handle lines with no plaintext result code, ie: // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" - if (codeEnd == -1) - codeEnd= line.length(); + if (codeEnd == -1) + codeEnd = line.length(); int code; try { - code= Integer.parseInt(line.substring(codeStart+1, codeEnd)); + code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); } catch (NumberFormatException e) { - throw new HttpException("bad status line '" + line - + "': " + e.getMessage(), e); + throw new HttpException("bad status line '" + line + "': " + + e.getMessage(), e); } return code; } + private void processHeaderLine(StringBuffer line) throws IOException, + HttpException { - private void processHeaderLine(StringBuffer line) - throws IOException, HttpException { - - int colonIndex = line.indexOf(":"); // key is up to colon + int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { int i; - for (i= 0; i < line.length(); i++) + for (i = 0; i < line.length(); i++) if (!Character.isWhitespace(line.charAt(i))) break; if (i == line.length()) @@ -439,7 +448,7 @@ public class HttpResponse implements Res } String key = line.substring(0, colonIndex); - int valueStart = colonIndex+1; // skip whitespace + int valueStart = colonIndex + 1; // skip whitespace while (valueStart < line.length()) { int c = line.charAt(valueStart); if (c != ' ' && c != '\t') @@ -450,28 +459,27 @@ public class HttpResponse implements Res headers.set(key, value); } - // Adds headers to our headers Metadata private void parseHeaders(PushbackInputStream in, StringBuffer line) - throws IOException, HttpException { + throws IOException, HttpException { while (readLine(in, line, true) != 0) { // handle HTTP responses with missing blank line after headers int pos; - if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) - || ((pos= line.indexOf("<HTML")) != -1) - || ((pos= line.indexOf("<html")) != -1) ) { + if (((pos = line.indexOf("<!DOCTYPE")) != -1) + || ((pos = line.indexOf("<HTML")) != -1) + || ((pos = line.indexOf("<html")) != -1)) { in.unread(line.substring(pos).getBytes("UTF-8")); line.setLength(pos); try { - //TODO: (CM) We don't know the header names here - //since we're just handling them generically. It would - //be nice to provide some sort of mapping function here - //for the returned header names to the standard metadata - //names in the ParseData class + // TODO: (CM) We don't know the header names here + // since we're just handling them generically. It would + // be nice to provide some sort of mapping function here + // for the returned header names to the standard metadata + // names in the ParseData class processHeaderLine(line); } catch (Exception e) { // fixme: @@ -485,29 +493,29 @@ public class HttpResponse implements Res } private static int readLine(PushbackInputStream in, StringBuffer line, - boolean allowContinuedLine) - throws IOException { + boolean allowContinuedLine) throws IOException { line.setLength(0); for (int c = in.read(); c != -1; c = in.read()) { switch (c) { - case '\r': - if (peek(in) == '\n') { - in.read(); - } - case '\n': - if (line.length() > 0) { - // at EOL -- check for continued line if the current - // (possibly continued) line wasn't blank - if (allowContinuedLine) - switch (peek(in)) { - case ' ' : case '\t': // line is continued - in.read(); - continue; - } - } - return line.length(); // else complete - default : - line.append((char)c); + case '\r': + if (peek(in) == '\n') { + in.read(); + } + case '\n': + if (line.length() > 0) { + // at EOL -- check for continued line if the current + // (possibly continued) line wasn't blank + if (allowContinuedLine) + switch (peek(in)) { + case ' ': + case '\t': // line is continued + in.read(); + continue; + } + } + return line.length(); // else complete + default: + line.append((char) c); } } throw new EOFException();
Modified: nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java Thu Jan 29 05:38:59 2015 @@ -93,7 +93,7 @@ public class TestProtocolHttp { /** * Starts the Jetty server at a specified port and redirection parameter. - * + * * @param portno * Port number. * @param redirection @@ -114,7 +114,7 @@ public class TestProtocolHttp { * Fetches the specified <code>page</code> from the local Jetty server and * checks whether the HTTP response status code matches with the expected * code. Also use jsp pages for redirection. - * + * * @param page * Page to be fetched. * @param expectedCode @@ -138,4 +138,3 @@ public class TestProtocolHttp { } } } - Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Thu Jan 29 05:38:59 2015 @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ /* * Based on EasySSLProtocolSocketFactory from commons-httpclient: * @@ -41,10 +41,12 @@ import org.slf4j.LoggerFactory; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; -public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory { +public class DummySSLProtocolSocketFactory implements + SecureProtocolSocketFactory { /** Logger object for this class. */ - private static final Logger LOG = LoggerFactory.getLogger(DummySSLProtocolSocketFactory.class); + private static final Logger LOG = LoggerFactory + .getLogger(DummySSLProtocolSocketFactory.class); private SSLContext sslcontext = null; @@ -58,10 +60,13 @@ public class DummySSLProtocolSocketFacto private static SSLContext createEasySSLContext() { try { SSLContext context = SSLContext.getInstance("SSL"); - context.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null); + context.init(null, + new TrustManager[] { new DummyX509TrustManager(null) }, null); return context; } catch (Exception e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage(), e); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage(), e); + } throw new HttpClientError(e.toString()); } } @@ -76,10 +81,11 @@ public class DummySSLProtocolSocketFacto /** * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int) */ - public Socket createSocket(String host, int port, InetAddress clientHost, int clientPort) throws IOException, - UnknownHostException { + public Socket createSocket(String host, int port, InetAddress clientHost, + int clientPort) throws IOException, UnknownHostException { - return getSSLContext().getSocketFactory().createSocket(host, port, clientHost, clientPort); + return getSSLContext().getSocketFactory().createSocket(host, port, + clientHost, clientPort); } /** @@ -93,20 +99,28 @@ public class DummySSLProtocolSocketFacto * throws an {@link ConnectTimeoutException} * </p> * - * @param host the host name/IP - * @param port the port on the host - * @param localAddress the local host name/IP to bind the socket to - * @param localPort the port on the local machine - * @param params {@link HttpConnectionParams Http connection parameters} + * @param host + * the host name/IP + * @param port + * the port on the host + * @param localAddress + * the local host name/IP to bind the socket to + * @param localPort + * the port on the local machine + * @param params + * {@link HttpConnectionParams Http connection parameters} * * @return Socket a new socket * - * @throws IOException if an I/O error occurs while creating the socket - * @throws UnknownHostException if the IP address of the host cannot be - * determined + * @throws IOException + * if an I/O error occurs while creating the socket + * @throws UnknownHostException + * if the IP address of the host cannot be determined */ - public Socket createSocket(final String host, final int port, final InetAddress localAddress, final int localPort, - final HttpConnectionParams params) throws IOException, UnknownHostException, ConnectTimeoutException { + public Socket createSocket(final String host, final int port, + final InetAddress localAddress, final int localPort, + final HttpConnectionParams params) throws IOException, + UnknownHostException, ConnectTimeoutException { if (params == null) { throw new IllegalArgumentException("Parameters may not be null"); } @@ -115,27 +129,31 @@ public class DummySSLProtocolSocketFacto return createSocket(host, port, localAddress, localPort); } else { // To be eventually deprecated when migrated to Java 1.4 or above - return ControllerThreadSocketFactory.createSocket(this, host, port, localAddress, localPort, timeout); + return ControllerThreadSocketFactory.createSocket(this, host, port, + localAddress, localPort, timeout); } } /** * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int) */ - public Socket createSocket(String host, int port) throws IOException, UnknownHostException { + public Socket createSocket(String host, int port) throws IOException, + UnknownHostException { return getSSLContext().getSocketFactory().createSocket(host, port); } /** * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean) */ - public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException, - UnknownHostException { - return getSSLContext().getSocketFactory().createSocket(socket, host, port, autoClose); + public Socket createSocket(Socket socket, String host, int port, + boolean autoClose) throws IOException, UnknownHostException { + return getSSLContext().getSocketFactory().createSocket(socket, host, port, + autoClose); } public boolean equals(Object obj) { - return ((obj != null) && obj.getClass().equals(DummySSLProtocolSocketFactory.class)); + return ((obj != null) && obj.getClass().equals( + DummySSLProtocolSocketFactory.class)); } public int hashCode() { Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java Thu Jan 29 05:38:59 2015 @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ /* * Based on EasyX509TrustManager from commons-httpclient. */ @@ -29,59 +29,64 @@ import java.security.cert.X509Certificat import javax.net.ssl.TrustManagerFactory; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; -import org.slf4j.Logger; +import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class DummyX509TrustManager implements X509TrustManager -{ - private X509TrustManager standardTrustManager = null; - - /** Logger object for this class. */ - private static final Logger LOG = LoggerFactory.getLogger(DummyX509TrustManager.class); - - /** - * Constructor for DummyX509TrustManager. - */ - public DummyX509TrustManager(KeyStore keystore) throws NoSuchAlgorithmException, KeyStoreException { - super(); - String algo = TrustManagerFactory.getDefaultAlgorithm(); - TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); - factory.init(keystore); - TrustManager[] trustmanagers = factory.getTrustManagers(); - if (trustmanagers.length == 0) { - throw new NoSuchAlgorithmException(algo + " trust manager not supported"); - } - this.standardTrustManager = (X509TrustManager)trustmanagers[0]; - } - - /** - * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], String) - */ - public boolean isClientTrusted(X509Certificate[] certificates) { - return true; - } - - /** - * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], String) - */ - public boolean isServerTrusted(X509Certificate[] certificates) { - return true; - } +public class DummyX509TrustManager implements X509TrustManager { + private X509TrustManager standardTrustManager = null; - /** - * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() - */ - public X509Certificate[] getAcceptedIssuers() { - return this.standardTrustManager.getAcceptedIssuers(); - } + /** Logger object for this class. */ + private static final Logger LOG = LoggerFactory + .getLogger(DummyX509TrustManager.class); + + /** + * Constructor for DummyX509TrustManager. + */ + public DummyX509TrustManager(KeyStore keystore) + throws NoSuchAlgorithmException, KeyStoreException { + super(); + String algo = TrustManagerFactory.getDefaultAlgorithm(); + TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); + factory.init(keystore); + TrustManager[] trustmanagers = factory.getTrustManagers(); + if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(algo + " trust manager not supported"); + } + this.standardTrustManager = (X509TrustManager) trustmanagers[0]; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], + * String) + */ + public boolean isClientTrusted(X509Certificate[] certificates) { + return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], + * String) + */ + public boolean isServerTrusted(X509Certificate[] certificates) { + return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() + */ + public X509Certificate[] getAcceptedIssuers() { + return this.standardTrustManager.getAcceptedIssuers(); + } + + public void checkClientTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { + // do nothing + + } + + public void checkServerTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { + // do nothing - public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { - // do nothing - - } - - public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { - // do nothing - - } + } } Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Jan 29 05:38:59 2015 @@ -54,19 +54,18 @@ import org.apache.hadoop.conf.Configurat import org.apache.nutch.util.NutchConfiguration; /** - * This class is a protocol plugin that configures an HTTP client for - * Basic, Digest and NTLM authentication schemes for web server as well - * as proxy server. It takes care of HTTPS protocol as well as cookies - * in a single fetch session. - * + * This class is a protocol plugin that configures an HTTP client for Basic, + * Digest and NTLM authentication schemes for web server as well as proxy + * server. It takes care of HTTPS protocol as well as cookies in a single fetch + * session. + * * @author Susam Pal */ public class Http extends HttpBase { public static final Logger LOG = LoggerFactory.getLogger(Http.class); - private static MultiThreadedHttpConnectionManager connectionManager = - new MultiThreadedHttpConnectionManager(); + private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager(); // Since the Configuration has not yet been set, // then an unconfigured client is returned. @@ -86,10 +85,9 @@ public class Http extends HttpBase { private String proxyPassword; private String proxyRealm; - /** * Returns the configured HTTP client. - * + * * @return HTTP client */ static synchronized HttpClient getClient() { @@ -104,10 +102,11 @@ public class Http extends HttpBase { } /** - * Reads the configuration from the Nutch configuration files and sets - * the configuration. - * - * @param conf Configuration + * Reads the configuration from the Nutch configuration files and sets the + * configuration. + * + * @param conf + * Configuration */ public void setConf(Configuration conf) { super.setConf(conf); @@ -130,8 +129,9 @@ public class Http extends HttpBase { /** * Main method. - * - * @param args Command line arguments + * + * @param args + * Command line arguments */ public static void main(String[] args) throws Exception { Http http = new Http(); @@ -140,16 +140,19 @@ public class Http extends HttpBase { } /** - * Fetches the <code>url</code> with a configured HTTP client and - * gets the response. - * - * @param url URL to be fetched - * @param datum Crawl data - * @param redirect Follow redirects if and only if true - * @return HTTP response + * Fetches the <code>url</code> with a configured HTTP client and gets the + * response. + * + * @param url + * URL to be fetched + * @param datum + * Crawl data + * @param redirect + * Follow redirects if and only if true + * @return HTTP response */ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { + throws ProtocolException, IOException { resolveCredentials(url); return new HttpResponse(this, url, datum, redirect); } @@ -170,12 +173,14 @@ public class Http extends HttpBase { params.setSendBufferSize(BUFFER_SIZE); params.setReceiveBufferSize(BUFFER_SIZE); params.setMaxTotalConnections(maxThreadsTotal); - - //Also set max connections per host to maxThreadsTotal since all threads - //might be used to fetch from the same host - otherwise timeout errors can occur + + // Also set max connections per host to maxThreadsTotal since all threads + // might be used to fetch from the same host - otherwise timeout errors can + // occur params.setDefaultMaxConnectionsPerHost(maxThreadsTotal); - // executeMethod(HttpMethod) seems to ignore the connection timeout on the connection manager. + // executeMethod(HttpMethod) seems to ignore the connection timeout on the + // connection manager. // set it explicitly on the HttpClient. client.getParams().setConnectionManagerTimeout(timeout); @@ -188,7 +193,9 @@ public class Http extends HttpBase { // prefer UTF-8 headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); // prefer understandable formats - headers.add(new Header("Accept", + headers + .add(new Header( + "Accept", "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); // accept gzipped content headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate")); @@ -200,43 +207,42 @@ public class Http extends HttpBase { if (proxyUsername.length() > 0) { - AuthScope proxyAuthScope = getAuthScope( - this.proxyHost, this.proxyPort, this.proxyRealm); + AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort, + this.proxyRealm); - NTCredentials proxyCredentials = new NTCredentials( - this.proxyUsername, this.proxyPassword, - Http.agentHost, this.proxyRealm); + NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername, + this.proxyPassword, Http.agentHost, this.proxyRealm); - client.getState().setProxyCredentials( - proxyAuthScope, proxyCredentials); + client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials); } } } /** - * Reads authentication configuration file (defined as - * 'http.auth.file' in Nutch configuration file) and sets the - * credentials for the configured authentication scopes in the HTTP - * client object. - * - * @throws ParserConfigurationException If a document builder can not - * be created. - * @throws SAXException If any parsing error occurs. - * @throws IOException If any I/O error occurs. + * Reads authentication configuration file (defined as 'http.auth.file' in + * Nutch configuration file) and sets the credentials for the configured + * authentication scopes in the HTTP client object. + * + * @throws ParserConfigurationException + * If a document builder can not be created. + * @throws SAXException + * If any parsing error occurs. + * @throws IOException + * If any I/O error occurs. */ - private static synchronized void setCredentials() throws - ParserConfigurationException, SAXException, IOException { + private static synchronized void setCredentials() + throws ParserConfigurationException, SAXException, IOException { if (authRulesRead) return; authRulesRead = true; // Avoid re-attempting to read - InputStream is = conf.getConfResourceAsInputStream(authFile); + InputStream is = conf.getConfResourceAsInputStream(authFile); if (is != null) { - Document doc = DocumentBuilderFactory.newInstance() - .newDocumentBuilder().parse(is); + Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(is); Element rootElement = doc.getDocumentElement(); if (!"auth-configuration".equals(rootElement.getTagName())) { @@ -251,14 +257,14 @@ public class Http extends HttpBase { for (int i = 0; i < credList.getLength(); i++) { Node credNode = credList.item(i); if (!(credNode instanceof Element)) - continue; + continue; Element credElement = (Element) credNode; if (!"credentials".equals(credElement.getTagName())) { if (LOG.isWarnEnabled()) - LOG.warn("Bad auth conf file: Element <" - + credElement.getTagName() + "> not recognized in " - + authFile + " - expected <credentials>"); + LOG.warn("Bad auth conf file: Element <" + credElement.getTagName() + + "> not recognized in " + authFile + + " - expected <credentials>"); continue; } @@ -271,7 +277,7 @@ public class Http extends HttpBase { Node scopeNode = scopeList.item(j); if (!(scopeNode instanceof Element)) continue; - + Element scopeElement = (Element) scopeNode; if ("default".equals(scopeElement.getTagName())) { @@ -287,9 +293,9 @@ public class Http extends HttpBase { defaultScheme = scheme; if (LOG.isTraceEnabled()) { - LOG.trace("Credentials - username: " + username - + "; set as default" - + " for realm: " + realm + "; scheme: " + scheme); + LOG.trace("Credentials - username: " + username + + "; set as default" + " for realm: " + realm + "; scheme: " + + scheme); } } else if ("authscope".equals(scopeElement.getTagName())) { @@ -298,8 +304,7 @@ public class Http extends HttpBase { String host = scopeElement.getAttribute("host"); int port = -1; // For setting port to AuthScope.ANY_PORT try { - port = Integer.parseInt( - scopeElement.getAttribute("port")); + port = Integer.parseInt(scopeElement.getAttribute("port")); } catch (Exception ex) { // do nothing, port is already set to any port } @@ -308,16 +313,15 @@ public class Http extends HttpBase { // Set credentials for the determined scope AuthScope authScope = getAuthScope(host, port, realm, scheme); - NTCredentials credentials = new NTCredentials( - username, password, agentHost, realm); + NTCredentials credentials = new NTCredentials(username, password, + agentHost, realm); client.getState().setCredentials(authScope, credentials); if (LOG.isTraceEnabled()) { LOG.trace("Credentials - username: " + username - + "; set for AuthScope - " + "host: " + host - + "; port: " + port + "; realm: " + realm - + "; scheme: " + scheme); + + "; set for AuthScope - " + "host: " + host + "; port: " + + port + "; realm: " + realm + "; scheme: " + scheme); } } else { @@ -333,14 +337,14 @@ public class Http extends HttpBase { } /** - * If credentials for the authentication scope determined from the - * specified <code>url</code> is not already set in the HTTP client, - * then this method sets the default credentials to fetch the - * specified <code>url</code>. If credentials are found for the - * authentication scope, the method returns without altering the - * client. - * - * @param url URL to be fetched + * If credentials for the authentication scope determined from the specified + * <code>url</code> is not already set in the HTTP client, then this method + * sets the default credentials to fetch the specified <code>url</code>. If + * credentials are found for the authentication scope, the method returns + * without altering the client. + * + * @param url + * URL to be fetched */ private void resolveCredentials(URL url) { @@ -359,43 +363,42 @@ public class Http extends HttpBase { if (client.getState().getCredentials(scope) != null) { if (LOG.isTraceEnabled()) LOG.trace("Pre-configured credentials with scope - host: " - + url.getHost() + "; port: " + port - + "; found for url: " + url); + + url.getHost() + "; port: " + port + "; found for url: " + url); // Credentials are already configured, so do nothing and return return; } if (LOG.isTraceEnabled()) - LOG.trace("Pre-configured credentials with scope - host: " - + url.getHost() + "; port: " + port - + "; not found for url: " + url); - - AuthScope serverAuthScope = getAuthScope( - url.getHost(), port, defaultRealm, defaultScheme); - - NTCredentials serverCredentials = new NTCredentials( - defaultUsername, defaultPassword, - agentHost, defaultRealm); + LOG.trace("Pre-configured credentials with scope - host: " + + url.getHost() + "; port: " + port + "; not found for url: " + url); - client.getState().setCredentials( - serverAuthScope, serverCredentials); + AuthScope serverAuthScope = getAuthScope(url.getHost(), port, + defaultRealm, defaultScheme); + + NTCredentials serverCredentials = new NTCredentials(defaultUsername, + defaultPassword, agentHost, defaultRealm); + + client.getState().setCredentials(serverAuthScope, serverCredentials); } } /** - * Returns an authentication scope for the specified - * <code>host</code>, <code>port</code>, <code>realm</code> and - * <code>scheme</code>. - * - * @param host Host name or address. - * @param port Port number. - * @param realm Authentication realm. - * @param scheme Authentication scheme. + * Returns an authentication scope for the specified <code>host</code>, + * <code>port</code>, <code>realm</code> and <code>scheme</code>. + * + * @param host + * Host name or address. + * @param port + * Port number. + * @param realm + * Authentication realm. + * @param scheme + * Authentication scheme. */ - private static AuthScope getAuthScope(String host, int port, - String realm, String scheme) { - + private static AuthScope getAuthScope(String host, int port, String realm, + String scheme) { + if (host.length() == 0) host = null; @@ -412,17 +415,18 @@ public class Http extends HttpBase { } /** - * Returns an authentication scope for the specified - * <code>host</code>, <code>port</code> and <code>realm</code>. - * - * @param host Host name or address. - * @param port Port number. - * @param realm Authentication realm. + * Returns an authentication scope for the specified <code>host</code>, + * <code>port</code> and <code>realm</code>. + * + * @param host + * Host name or address. + * @param port + * Port number. + * @param realm + * Authentication realm. */ - private static AuthScope getAuthScope(String host, int port, - String realm) { + private static AuthScope getAuthScope(String host, int port, String realm) { - return getAuthScope(host, port, realm, ""); + return getAuthScope(host, port, realm, ""); } } - Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java Thu Jan 29 05:38:59 2015 @@ -15,32 +15,31 @@ * limitations under the License. */ package org.apache.nutch.protocol.httpclient; - + import java.util.List; /** - * The base level of services required for Http Authentication - * + * The base level of services required for Http Authentication + * * @see HttpAuthenticationFactory * - * @author Matt Tencati + * @author Matt Tencati */ public interface HttpAuthentication { - /** - * Gets the credentials generated by the HttpAuthentication - * object. May return null. - * - * @return The credentials value - */ - public List<String> getCredentials(); + /** + * Gets the credentials generated by the HttpAuthentication object. May return + * null. + * + * @return The credentials value + */ + public List<String> getCredentials(); - /** - * Gets the realm used by the HttpAuthentication object during creation. - * - * @return The realm value - */ - public String getRealm(); + /** + * Gets the realm used by the HttpAuthentication object during creation. + * + * @return The realm value + */ + public String getRealm(); } - Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java Thu Jan 29 05:38:59 2015 @@ -26,40 +26,46 @@ package org.apache.nutch.protocol.httpcl */ public class HttpAuthenticationException extends Exception { - /** - * Constructs a new exception with null as its detail message. - */ - public HttpAuthenticationException() { - super(); - } + /** + * Constructs a new exception with null as its detail message. + */ + public HttpAuthenticationException() { + super(); + } - /** - * Constructs a new exception with the specified detail message. - * - * @param message the detail message. The detail message is saved for later retrieval by the {@link Throwable#getMessage()} method. - */ - public HttpAuthenticationException(String message) { - super(message); - } + /** + * Constructs a new exception with the specified detail message. + * + * @param message + * the detail message. The detail message is saved for later + * retrieval by the {@link Throwable#getMessage()} method. + */ + public HttpAuthenticationException(String message) { + super(message); + } - /** - * Constructs a new exception with the specified message and cause. - * - * @param message the detail message. The detail message is saved for later retrieval by the {@link Throwable#getMessage()} method. - * @param cause the cause (use {@link #getCause()} to retrieve the cause) - */ - public HttpAuthenticationException(String message, Throwable cause) { - super(message, cause); - } + /** + * Constructs a new exception with the specified message and cause. + * + * @param message + * the detail message. The detail message is saved for later + * retrieval by the {@link Throwable#getMessage()} method. + * @param cause + * the cause (use {@link #getCause()} to retrieve the cause) + */ + public HttpAuthenticationException(String message, Throwable cause) { + super(message, cause); + } - /** - * Constructs a new exception with the specified cause and detail message from - * given clause if it is not null. - * - * @param cause the cause (use {@link #getCause()} to retrieve the cause) - */ - public HttpAuthenticationException(Throwable cause) { - super(cause); - } + /** + * Constructs a new exception with the specified cause and detail message from + * given clause if it is not null. + * + * @param cause + * the cause (use {@link #getCause()} to retrieve the cause) + */ + public HttpAuthenticationException(Throwable cause) { + super(cause); + } } Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Thu Jan 29 05:38:59 2015 @@ -31,28 +31,27 @@ import org.apache.hadoop.conf.Configurab // Nutch imports import org.apache.nutch.metadata.Metadata; - /** - * Provides the Http protocol implementation - * with the ability to authenticate when prompted. The goal is to provide - * multiple authentication types but for now just the {@link HttpBasicAuthentication} authentication - * type is provided. - * + * Provides the Http protocol implementation with the ability to authenticate + * when prompted. The goal is to provide multiple authentication types but for + * now just the {@link HttpBasicAuthentication} authentication type is provided. + * * @see HttpBasicAuthentication * @see Http * @see HttpResponse - * + * * @author Matt Tencati */ public class HttpAuthenticationFactory implements Configurable { /** - * The HTTP Authentication (WWW-Authenticate) header which is returned - * by a webserver requiring authentication. + * The HTTP Authentication (WWW-Authenticate) header which is returned by a + * webserver requiring authentication. */ public static final String WWW_AUTHENTICATE = "WWW-Authenticate"; - public static final Logger LOG = LoggerFactory.getLogger(HttpAuthenticationFactory.class); + public static final Logger LOG = LoggerFactory + .getLogger(HttpAuthenticationFactory.class); private Configuration conf = null; @@ -63,29 +62,33 @@ public class HttpAuthenticationFactory i public void setConf(Configuration conf) { this.conf = conf; } + public Configuration getConf() { return conf; } public HttpAuthentication findAuthentication(Metadata header) { - if (header == null) return null; + if (header == null) + return null; try { Collection<String> challenge = new ArrayList<String>(); challenge.add(header.get(WWW_AUTHENTICATE)); - for(String challengeString: challenge) { + for (String challengeString : challenge) { if (challengeString.equals("NTLM")) - challengeString="Basic realm=techweb"; + challengeString = "Basic realm=techweb"; if (LOG.isTraceEnabled()) LOG.trace("Checking challengeString=" + challengeString); - HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(challengeString, conf); - if (auth != null) return auth; + HttpAuthentication auth = HttpBasicAuthentication.getAuthentication( + challengeString, conf); + if (auth != null) + return auth; - //TODO Add additional Authentication lookups here + // TODO Add additional Authentication lookups here } } catch (Exception e) { LOG.error("Error: ", e); Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java Thu Jan 29 05:38:59 2015 @@ -35,156 +35,165 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configurable; - /** - * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are stored - * in standard Nutch configuration files using the following properties: - * http.auth.basic.<realm>.user - * http.auth.basic.<realm>.pass - * - * @author Matt Tencati + * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are + * stored in standard Nutch configuration files using the following properties: + * http.auth.basic.<realm>.user http.auth.basic.<realm>.pass + * + * @author Matt Tencati */ -public class HttpBasicAuthentication implements HttpAuthentication, Configurable { +public class HttpBasicAuthentication implements HttpAuthentication, + Configurable { - public static final Logger LOG = LoggerFactory.getLogger(HttpBasicAuthentication.class); + public static final Logger LOG = LoggerFactory + .getLogger(HttpBasicAuthentication.class); - private static Pattern basic = Pattern.compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\""); - - private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>(); - - private Configuration conf = null; - private String challenge = null; - private ArrayList<String> credentials = null; - private String realm = null; - - - /** - * Construct an HttpBasicAuthentication for the given challenge - * parameters. The challenge parameters are returned by the web - * server using a WWW-Authenticate header. This will typically be - * represented by single line of the form <code>WWW-Authenticate: Basic realm="myrealm"</code> - * - * @param challenge WWW-Authenticate header from web server - */ - protected HttpBasicAuthentication(String challenge, Configuration conf) throws HttpAuthenticationException { - - setConf(conf); - this.challenge = challenge; - credentials = new ArrayList<String>(); - - String username = this.conf.get("http.auth.basic." + challenge + ".user"); - String password = this.conf.get("http.auth.basic." + challenge + ".password"); - - if (LOG.isTraceEnabled()) { - LOG.trace("BasicAuthentication challenge is " + challenge); - LOG.trace("BasicAuthentication username=" + username); - LOG.trace("BasicAuthentication password=" + password); - } - - if (username == null) { - throw new HttpAuthenticationException("Username for " + challenge + " is null"); - } + private static Pattern basic = Pattern + .compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\""); - if (password == null) { - throw new HttpAuthenticationException("Password for " + challenge + " is null"); + private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>(); + + private Configuration conf = null; + private String challenge = null; + private ArrayList<String> credentials = null; + private String realm = null; + + /** + * Construct an HttpBasicAuthentication for the given challenge parameters. + * The challenge parameters are returned by the web server using a + * WWW-Authenticate header. This will typically be represented by single line + * of the form <code>WWW-Authenticate: Basic realm="myrealm"</code> + * + * @param challenge + * WWW-Authenticate header from web server + */ + protected HttpBasicAuthentication(String challenge, Configuration conf) + throws HttpAuthenticationException { + + setConf(conf); + this.challenge = challenge; + credentials = new ArrayList<String>(); + + String username = this.conf.get("http.auth.basic." + challenge + ".user"); + String password = this.conf.get("http.auth.basic." + challenge + + ".password"); + + if (LOG.isTraceEnabled()) { + LOG.trace("BasicAuthentication challenge is " + challenge); + LOG.trace("BasicAuthentication username=" + username); + LOG.trace("BasicAuthentication password=" + password); + } + + if (username == null) { + throw new HttpAuthenticationException("Username for " + challenge + + " is null"); + } + + if (password == null) { + throw new HttpAuthenticationException("Password for " + challenge + + " is null"); + } + + byte[] credBytes = (username + ":" + password).getBytes(); + credentials.add("Authorization: Basic " + + new String(Base64.encodeBase64(credBytes))); + if (LOG.isTraceEnabled()) { + LOG.trace("Basic credentials: " + credentials); + } + } + + /* + * ---------------------------------- * <implementation:Configurable> * + * ---------------------------------- + */ + + public void setConf(Configuration conf) { + this.conf = conf; + // if (conf.getBoolean("http.auth.verbose", false)) { + // LOG.setLevel(Level.FINE); + // } else { + // LOG.setLevel(Level.WARNING); + // } + } + + public Configuration getConf() { + return this.conf; + } + + /* + * ---------------------------------- * <implementation:Configurable> * + * ---------------------------------- + */ + + /** + * Gets the Basic credentials generated by this HttpBasicAuthentication object + * + * @return Credentials in the form of + * <code>Authorization: Basic <Base64 encoded userid:password> + * + */ + public List<String> getCredentials() { + return credentials; + } + + /** + * Gets the realm attribute of the HttpBasicAuthentication object. This should + * have been supplied to the {@link #getAuthentication(String, Configuration)} + * static method + * + * @return The realm + */ + public String getRealm() { + return realm; + } + + /** + * This method is responsible for providing Basic authentication information. + * The method caches authentication information for each realm so that the + * required authentication information does not need to be regenerated for + * every request. + * + * @param challenge + * The challenge string provided by the webserver. This is the text + * which follows the WWW-Authenticate header, including the Basic + * tag. + * @return An HttpBasicAuthentication object or null if unable to generate + * appropriate credentials. + */ + public static HttpBasicAuthentication getAuthentication(String challenge, + Configuration conf) { + if (challenge == null) + return null; + Matcher basicMatcher = basic.matcher(challenge); + if (basicMatcher.matches()) { + String realm = basicMatcher.group(1); + Object auth = authMap.get(realm); + if (auth == null) { + HttpBasicAuthentication newAuth = null; + try { + newAuth = new HttpBasicAuthentication(realm, conf); + } catch (HttpAuthenticationException hae) { + if (LOG.isTraceEnabled()) { + LOG.trace("HttpBasicAuthentication failed for " + challenge); + } } - - byte[] credBytes = (username + ":" + password).getBytes(); - credentials.add("Authorization: Basic " + new String(Base64.encodeBase64(credBytes))); - if (LOG.isTraceEnabled()) { - LOG.trace("Basic credentials: " + credentials); - } - } - - - /* ---------------------------------- * - * <implementation:Configurable> * - * ---------------------------------- */ - - public void setConf(Configuration conf) { - this.conf = conf; - //if (conf.getBoolean("http.auth.verbose", false)) { - // LOG.setLevel(Level.FINE); - //} else { - // LOG.setLevel(Level.WARNING); - //} - } - - public Configuration getConf() { - return this.conf; - } - - /* ---------------------------------- * - * <implementation:Configurable> * - * ---------------------------------- */ - - - /** - * Gets the Basic credentials generated by this - * HttpBasicAuthentication object - * - * @return Credentials in the form of <code>Authorization: Basic <Base64 encoded userid:password> - * - */ - public List<String> getCredentials() { - return credentials; - } - - - /** - * Gets the realm attribute of the HttpBasicAuthentication object. - * This should have been supplied to the {@link #getAuthentication(String, Configuration)} - * static method - * - * @return The realm - */ - public String getRealm() { - return realm; - } - - /** - * This method is responsible for providing Basic authentication information. The - * method caches authentication information for each realm so that the required - * authentication information does not need to be regenerated for every request. - * - * @param challenge The challenge string provided by the webserver. This is the - * text which follows the WWW-Authenticate header, including the Basic tag. - * @return An HttpBasicAuthentication object or null - * if unable to generate appropriate credentials. - */ - public static HttpBasicAuthentication getAuthentication(String challenge, Configuration conf) { - if (challenge == null) return null; - Matcher basicMatcher = basic.matcher(challenge); - if (basicMatcher.matches()) { - String realm = basicMatcher.group(1); - Object auth = authMap.get(realm); - if (auth == null) { - HttpBasicAuthentication newAuth = null; - try { - newAuth = new HttpBasicAuthentication(realm, conf); - } catch (HttpAuthenticationException hae) { - if (LOG.isTraceEnabled()) { - LOG.trace("HttpBasicAuthentication failed for " + challenge); - } - } - authMap.put(realm, newAuth); - return newAuth; - } else { - return (HttpBasicAuthentication) auth; - } - } - return null; - } - - /** - * Provides a pattern which can be used by an outside resource to determine if - * this class can provide credentials based on simple header information. It does - * not calculate any information regarding realms or challenges. - * - * @return Returns a Pattern which will match a Basic WWW-Authenticate header. - */ - public static final Pattern getBasicPattern() { - return basic; - } + authMap.put(realm, newAuth); + return newAuth; + } else { + return (HttpBasicAuthentication) auth; + } + } + return null; + } + + /** + * Provides a pattern which can be used by an outside resource to determine if + * this class can provide credentials based on simple header information. It + * does not calculate any information regarding realms or challenges. + * + * @return Returns a Pattern which will match a Basic WWW-Authenticate header. + */ + public static final Pattern getBasicPattern() { + return basic; + } } - Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Jan 29 05:38:59 2015 @@ -40,7 +40,7 @@ import org.apache.nutch.protocol.http.ap /** * An HTTP response. - * + * * @author Susam Pal */ public class HttpResponse implements Response { @@ -52,18 +52,22 @@ public class HttpResponse implements Res /** * Fetches the given <code>url</code> and prepares HTTP response. - * - * @param http An instance of the implementation class - * of this plugin - * @param url URL to be fetched - * @param datum Crawl data - * @param followRedirects Whether to follow redirects; follows - * redirect if and only if this is true - * @return HTTP response - * @throws IOException When an error occurs + * + * @param http + * An instance of the implementation class of this plugin + * @param url + * URL to be fetched + * @param datum + * Crawl data + * @param followRedirects + * Whether to follow redirects; follows redirect if and only if this + * is true + * @return HTTP response + * @throws IOException + * When an error occurs */ - HttpResponse(Http http, URL url, CrawlDatum datum, - boolean followRedirects) throws IOException { + HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects) + throws IOException { // Prepare GET method for HTTP request this.url = url; @@ -98,7 +102,7 @@ public class HttpResponse implements Res for (int i = 0; i < heads.length; i++) { headers.set(heads[i].getName(), heads[i].getValue()); } - + // Limit download size int contentLength = Integer.MAX_VALUE; String contentLengthString = headers.get(Response.CONTENT_LENGTH); @@ -106,12 +110,10 @@ public class HttpResponse implements Res try { contentLength = Integer.parseInt(contentLengthString.trim()); } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + - contentLengthString); + throw new HttpException("bad content length: " + contentLengthString); } } - if (http.getMaxContent() >= 0 && - contentLength > http.getMaxContent()) { + if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { contentLength = http.getMaxContent(); } @@ -131,7 +133,8 @@ public class HttpResponse implements Res content = out.toByteArray(); } catch (Exception e) { - if (code == 200) throw new IOException(e.toString()); + if (code == 200) + throw new IOException(e.toString()); // for codes other than 200 OK, we are fine with empty content } finally { if (in != null) { @@ -139,16 +142,15 @@ public class HttpResponse implements Res } get.abort(); } - + StringBuilder fetchTrace = null; if (Http.LOG.isTraceEnabled()) { // Trace message - fetchTrace = new StringBuilder("url: " + url + - "; status code: " + code + - "; bytes received: " + content.length); + fetchTrace = new StringBuilder("url: " + url + "; status code: " + code + + "; bytes received: " + content.length); if (getHeader(Response.CONTENT_LENGTH) != null) - fetchTrace.append("; Content-Length: " + - getHeader(Response.CONTENT_LENGTH)); + fetchTrace.append("; Content-Length: " + + getHeader(Response.CONTENT_LENGTH)); if (getHeader(Response.LOCATION) != null) fetchTrace.append("; Location: " + getHeader(Response.LOCATION)); } @@ -158,8 +160,7 @@ public class HttpResponse implements Res String contentEncoding = headers.get(Response.CONTENT_ENCODING); if (contentEncoding != null && Http.LOG.isTraceEnabled()) fetchTrace.append("; Content-Encoding: " + contentEncoding); - if ("gzip".equals(contentEncoding) || - "x-gzip".equals(contentEncoding)) { + if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { content = http.processGzipEncoded(content, url); if (Http.LOG.isTraceEnabled()) fetchTrace.append("; extracted to " + content.length + " bytes"); @@ -179,15 +180,15 @@ public class HttpResponse implements Res } } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + public URL getUrl() { return url; } - + public int getCode() { return code; } @@ -195,7 +196,7 @@ public class HttpResponse implements Res public String getHeader(String name) { return headers.get(name); } - + public Metadata getHeaders() { return headers; } @@ -204,8 +205,8 @@ public class HttpResponse implements Res return content; } - /* -------------------------- * - * </implementation:Response> * - * -------------------------- */ + /* + * -------------------------- * </implementation:Response> * + * -------------------------- + */ } - Modified: nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java Thu Jan 29 05:38:59 2015 @@ -60,7 +60,7 @@ public class TestProtocolHttpClient { conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site-test.xml"); - + http = new Http(); http.setConf(conf); } @@ -72,8 +72,9 @@ public class TestProtocolHttpClient { /** * Tests whether the client can remember cookies. - * - * @throws Exception If an error occurs or the test case fails. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testCookies() throws Exception { @@ -83,10 +84,10 @@ public class TestProtocolHttpClient { } /** - * Tests that no pre-emptive authorization headers are sent by the - * client. - * - * @throws Exception If an error occurs or the test case fails. + * Tests that no pre-emptive authorization headers are sent by the client. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testNoPreemptiveAuth() throws Exception { @@ -96,8 +97,9 @@ public class TestProtocolHttpClient { /** * Tests default credentials. - * - * @throws Exception If an error occurs or the test case fails. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testDefaultCredentials() throws Exception { @@ -108,7 +110,8 @@ public class TestProtocolHttpClient { /** * Tests basic authentication scheme for various realms. * - * @throws Exception If an error occurs or the test case fails. + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testBasicAuth() throws Exception { @@ -120,11 +123,12 @@ public class TestProtocolHttpClient { } /** - * Tests that authentication happens for a defined realm and not for - * other realms for a host:port when an extra <code>authscope</code> - * tag is not defined to match all other realms. - * - * @throws Exception If an error occurs or the test case fails. + * Tests that authentication happens for a defined realm and not for other + * realms for a host:port when an extra <code>authscope</code> tag is not + * defined to match all other realms. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testOtherRealmsNoAuth() throws Exception { @@ -136,8 +140,9 @@ public class TestProtocolHttpClient { /** * Tests Digest authentication scheme. - * - * @throws Exception If an error occurs or the test case fails. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testDigestAuth() throws Exception { @@ -147,8 +152,9 @@ public class TestProtocolHttpClient { /** * Tests NTLM authentication scheme. - * - * @throws Exception If an error occurs or the test case fails. + * + * @throws Exception + * If an error occurs or the test case fails. */ @Test public void testNtlmAuth() throws Exception { @@ -158,9 +164,11 @@ public class TestProtocolHttpClient { /** * Starts the Jetty server at a specified port. - * - * @param portno Port number. - * @throws Exception When an error occurs. + * + * @param portno + * Port number. + * @throws Exception + * When an error occurs. */ private void startServer(int portno) throws Exception { port = portno; @@ -172,17 +180,18 @@ public class TestProtocolHttpClient { } /** - * Fetches the specified <code>page</code> from the local Jetty server - * and checks whether the HTTP response status code matches with the - * expected code. - * - * @param page Page to be fetched. - * @param expectedCode HTTP response status code expected while - * fetching the page. - * @throws Exception When an error occurs or test case fails. + * Fetches the specified <code>page</code> from the local Jetty server and + * checks whether the HTTP response status code matches with the expected + * code. + * + * @param page + * Page to be fetched. + * @param expectedCode + * HTTP response status code expected while fetching the page. + * @throws Exception + * When an error occurs or test case fails. */ - private void fetchPage(String page, int expectedCode) - throws Exception { + private void fetchPage(String page, int expectedCode) throws Exception { URL url = new URL("http", "127.0.0.1", port, page); Response response = null; response = http.getResponse(url, new CrawlDatum(), true); Modified: nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java (original) +++ nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -28,33 +28,34 @@ import org.apache.nutch.scoring.ScoringF */ public class DepthScoringFilter extends Configured implements ScoringFilter { private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class); - + public static final String DEPTH_KEY = "_depth_"; public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY); public static final String MAX_DEPTH_KEY = "_maxdepth_"; public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY); - + // maximum value that we are never likely to reach // because the depth of the Web graph is that high only // for spam cliques. public static final int DEFAULT_MAX_DEPTH = 1000; - + private int defaultMaxDepth; - + @Override public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH); if (defaultMaxDepth <= 0) { defaultMaxDepth = DEFAULT_MAX_DEPTH; } } - + @Override public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { String depthString = parseData.getMeta(DEPTH_KEY); if (depthString == null) { LOG.warn("Missing depth, removing all outlinks from url " + fromUrl); @@ -72,15 +73,17 @@ public class DepthScoringFilter extends } if (curDepth >= curMaxDepth) { // depth exceeded - throw away - LOG.info("Depth limit (" + curMaxDepth + ") reached, ignoring outlinks for " + fromUrl); + LOG.info("Depth limit (" + curMaxDepth + + ") reached, ignoring outlinks for " + fromUrl); targets.clear(); return adjust; } - Iterator<Entry<Text,CrawlDatum>> it = targets.iterator(); + Iterator<Entry<Text, CrawlDatum>> it = targets.iterator(); while (it.hasNext()) { - Entry<Text,CrawlDatum> e = it.next(); + Entry<Text, CrawlDatum> e = it.next(); // record increased depth - e.getValue().getMetaData().put(DEPTH_KEY_W, new IntWritable(curDepth + 1)); + e.getValue().getMetaData() + .put(DEPTH_KEY_W, new IntWritable(curDepth + 1)); // record maxDepth if any if (customMaxDepth != null) { e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth); @@ -92,16 +95,17 @@ public class DepthScoringFilter extends // prioritize by smaller values of depth @Override public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { + throws ScoringFilterException { // boost up by current depth int curDepth, curMaxDepth; - IntWritable maxDepth = (IntWritable)datum.getMetaData().get(MAX_DEPTH_KEY_W); + IntWritable maxDepth = (IntWritable) datum.getMetaData().get( + MAX_DEPTH_KEY_W); if (maxDepth != null) { curMaxDepth = maxDepth.get(); } else { curMaxDepth = defaultMaxDepth; } - IntWritable depth = (IntWritable)datum.getMetaData().get(DEPTH_KEY_W); + IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W); if (depth == null) { // penalize curDepth = curMaxDepth; @@ -113,27 +117,28 @@ public class DepthScoringFilter extends } public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { return initScore; } @Override public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { + throws ScoringFilterException { // the datum might already have some values set // e.g. obtained from redirection // in which case we don't want to override them - if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null) datum.getMetaData() - .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); + if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null) + datum.getMetaData() + .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); // initial depth is 1 - if (datum.getMetaData().get(DEPTH_KEY_W) == null) datum.getMetaData().put( - DEPTH_KEY_W, new IntWritable(1)); + if (datum.getMetaData().get(DEPTH_KEY_W) == null) + datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); } @Override public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { + throws ScoringFilterException { // check for the presence of the depth limit key if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) { @@ -143,7 +148,8 @@ public class DepthScoringFilter extends int depth = Integer.parseInt(depthString); datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth)); } else { // put the default - datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); + datum.getMetaData() + .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth)); } // initial depth is 1 datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1)); @@ -151,7 +157,7 @@ public class DepthScoringFilter extends @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { + throws ScoringFilterException { String depth = content.getMetadata().get(DEPTH_KEY); if (depth != null) { parse.getData().getParseMeta().set(DEPTH_KEY, depth); @@ -164,12 +170,13 @@ public class DepthScoringFilter extends @Override public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) - throws ScoringFilterException { - IntWritable depth = (IntWritable)datum.getMetaData().get(DEPTH_KEY_W); + throws ScoringFilterException { + IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W); if (depth != null) { content.getMetadata().set(DEPTH_KEY, depth.toString()); } - IntWritable maxdepth = (IntWritable)datum.getMetaData().get(MAX_DEPTH_KEY_W); + IntWritable maxdepth = (IntWritable) datum.getMetaData().get( + MAX_DEPTH_KEY_W); if (maxdepth != null) { content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString()); } @@ -177,7 +184,7 @@ public class DepthScoringFilter extends @Override public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { + List<CrawlDatum> inlinked) throws ScoringFilterException { // find a minimum of all depths int newDepth = DEFAULT_MAX_DEPTH; if (old != null) { @@ -190,7 +197,7 @@ public class DepthScoringFilter extends } } for (CrawlDatum lnk : inlinked) { - IntWritable depth = (IntWritable)lnk.getMetaData().get(DEPTH_KEY_W); + IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W); if (depth != null && depth.get() < newDepth) { newDepth = depth.get(); }
