Repository: nutch Updated Branches: refs/heads/master 044e8e77e -> 8572fd955
fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/8572fd95 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/8572fd95 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/8572fd95 Branch: refs/heads/master Commit: 8572fd9551b430f31a4fdace14738f2d9959b370 Parents: 044e8e7 Author: Karanjeet Singh <[email protected]> Authored: Mon Apr 18 00:45:37 2016 -0700 Committer: Karanjeet Singh <[email protected]> Committed: Mon Apr 18 00:45:37 2016 -0700 ---------------------------------------------------------------------- src/plugin/protocol-htmlunit/build.xml | 9 - .../nutch/protocol/htmlunit/HttpResponse.java | 408 ++++++++++++++----- 2 files changed, 317 insertions(+), 100 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml index bf695fe..899214c 100644 --- a/src/plugin/protocol-htmlunit/build.xml +++ b/src/plugin/protocol-htmlunit/build.xml @@ -34,13 +34,4 @@ <pathelement location="${build.dir}/test/conf"/> </path> - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../lib-http"/> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <copy toDir="${build.test}"> - <fileset dir="${src.test}" excludes="**/*.java"/> - </copy> - </target> - </project> http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 7242f40..8b1a031 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -20,11 +20,18 @@ import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.io.PushbackInputStream; import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; @@ -35,46 +42,78 @@ import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.http.api.HttpException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +/** + * An HTTP response. + */ public class HttpResponse implements Response { - private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); - - private Http http; + private Configuration conf; + private HttpBase http; private URL url; + private String orig; + private String base; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); + // used for storing the http headers verbatim + private StringBuffer httpHeaders; - /** The nutch configuration */ - private Configuration conf = null; + protected enum Scheme { + HTTP, HTTPS, + } - public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { + /** + * Default public constructor. + * + * @param http + * @param url + * @param datum + * @throws ProtocolException + * @throws IOException + */ + public HttpResponse(HttpBase http, URL url, CrawlDatum datum) + throws ProtocolException, IOException { - this.conf = http.getConf(); this.http = http; this.url = url; + this.orig = url.toString(); + this.base = url.toString(); + + Scheme scheme = null; + + if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; + } else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; + } else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); + } + + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("fetching " + url); + } - LOG.info("fetching {}", url); - String path = "".equals(url.getFile()) ? "/" : url.getFile(); // some servers will redirect a request with a host line like // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they // don't want the :80... + String host = url.getHost(); int port; String portString; if (url.getPort() == -1) { - port = 80; + if (scheme == Scheme.HTTP) { + port = 80; + } else { + port = 443; + } portString = ""; } else { port = url.getPort(); portString = ":" + port; } - Socket socket = null; try { @@ -87,6 +126,38 @@ public class HttpResponse implements Response { InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); + if (scheme == Scheme.HTTPS) { + SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory + .getDefault(); + SSLSocket sslsocket = (SSLSocket) factory + .createSocket(socket, sockHost, sockPort, true); + sslsocket.setUseClientMode(true); + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedCipherSuites())); + + // Intersect with preferred protocols and ciphers + protocols.retainAll(http.getTlsPreferredProtocols()); + ciphers.retainAll(http.getTlsPreferredCipherSuites()); + + sslsocket.setEnabledProtocols( + protocols.toArray(new String[protocols.size()])); + sslsocket.setEnabledCipherSuites( + ciphers.toArray(new String[ciphers.size()])); + + sslsocket.startHandshake(); + socket = sslsocket; + } + + this.conf = http.getConf(); + if (sockAddr != null + && conf.getBoolean("store.ip.address", false) == true) { + headers.add("_ip_", sockAddr.getAddress().getHostAddress()); + } + // make request OutputStream req = socket.getOutputStream(); @@ -97,7 +168,6 @@ public class HttpResponse implements Response { reqStr.append(path); } - // TODO: Write code for Https reqStr.append(" HTTP/1.0\r\n"); reqStr.append("Host: "); @@ -126,91 +196,262 @@ public class HttpResponse implements Response { reqStr.append(this.http.getAccept()); reqStr.append("\r\n"); - if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); + if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { + reqStr.append("If-Modified-Since: " + HttpDateFormat + .toString(datum.getModifiedTime())); reqStr.append("\r\n"); } reqStr.append("\r\n"); + // store the request in the metadata? + if (conf.getBoolean("store.http.request", false) == true) { + headers.add("_request_", reqStr.toString()); + } + byte[] reqBytes = reqStr.toString().getBytes(); req.write(reqBytes); req.flush(); PushbackInputStream in = // process response - new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), - Http.BUFFER_SIZE); + new PushbackInputStream( + new BufferedInputStream(socket.getInputStream(), + Http.BUFFER_SIZE), Http.BUFFER_SIZE); StringBuffer line = new StringBuffer(); + // store the http headers verbatim + if (conf.getBoolean("store.http.headers", false) == true) { + httpHeaders = new StringBuffer(); + } + + headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis())); + boolean haveSeenNonContinueStatus = false; while (!haveSeenNonContinueStatus) { // parse status code line this.code = parseStatusLine(in, line); + if (httpHeaders != null) + httpHeaders.append(line).append("\n"); // parse headers - parseHeaders(in, line); + parseHeaders(in, line, httpHeaders); haveSeenNonContinueStatus = code != 100; // 100 is "Continue" } // Get Content type header String contentType = getHeader(Response.CONTENT_TYPE); - // handle with Selenium only if content type in HTML or XHTML + // handle with HtmlUnit only if content type in HTML or XHTML if (contentType != null) { if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { - readPlainContent(url); + readContentFromHtmlUnit(url); } else { - try { - int contentLength = Integer.MAX_VALUE; - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - try { - contentLength = Integer.parseInt(contentLengthString.trim()); - } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + contentLengthString); - } - } + String transferEncoding = getHeader(Response.TRANSFER_ENCODING); + if (transferEncoding != null && "chunked" + .equalsIgnoreCase(transferEncoding.trim())) { + readChunkedContent(in, line); + } else { + readPlainContent(in); + } - if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { - contentLength = http.getMaxContent(); + String contentEncoding = getHeader(Response.CONTENT_ENCODING); + if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { + content = http.processGzipEncoded(content, url); + } else if ("deflate".equals(contentEncoding)) { + content = http.processDeflateEncoded(content, url); + } else { + // store the headers verbatim only if the response was not compressed + // as the content length reported with not match otherwise + if (httpHeaders != null) { + headers.add("_response.headers_", httpHeaders.toString()); } - - byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; - int bufferFilled = 0; - int totalRead = 0; - ByteArrayOutputStream out = new ByteArrayOutputStream(); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 - && totalRead + bufferFilled <= contentLength) { - totalRead += bufferFilled; - out.write(buffer, 0, bufferFilled); - } - - content = out.toByteArray(); - - } catch (Exception e) { - if (code == 200) - throw new IOException(e.toString()); - // for codes other than 200 OK, we are fine with empty content - } finally { - if (in != null) { - in.close(); + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("fetched " + content.length + " bytes from " + url); } } } - } + } } finally { if (socket != null) socket.close(); } + } - - private void readPlainContent(URL url) throws IOException { + + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + + public URL getUrl() { + return url; + } + + public int getCode() { + return code; + } + + public String getHeader(String name) { + return headers.get(name); + } + + public Metadata getHeaders() { + return headers; + } + + public byte[] getContent() { + return content; + } + + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + + private void readContentFromHtmlUnit(URL url) throws IOException { String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf); content = page.getBytes("UTF-8"); } + + private void readPlainContent(InputStream in) + throws HttpException, IOException { + + int contentLength = Integer.MAX_VALUE; // get content length + String contentLengthString = headers.get(Response.CONTENT_LENGTH); + if (contentLengthString != null) { + contentLengthString = contentLengthString.trim(); + try { + if (!contentLengthString.isEmpty()) + contentLength = Integer.parseInt(contentLengthString); + } catch (NumberFormatException e) { + throw new HttpException("bad content length: " + contentLengthString); + } + } + if (http.getMaxContent() >= 0 && contentLength > http + .getMaxContent()) // limit + // download + // size + contentLength = http.getMaxContent(); + + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + byte[] bytes = new byte[Http.BUFFER_SIZE]; + int length = 0; + + // do not try to read if the contentLength is 0 + if (contentLength == 0) { + content = new byte[0]; + return; + } + + // read content + int i = in.read(bytes); + while (i != -1) { + out.write(bytes, 0, i); + length += i; + if (length >= contentLength) { + break; + } + if ((length + Http.BUFFER_SIZE) > contentLength) { + // reading next chunk may hit contentLength, + // must limit number of bytes read + i = in.read(bytes, 0, (contentLength - length)); + } else { + i = in.read(bytes); + } + } + content = out.toByteArray(); + } + + /** + * @param in + * @param line + * @throws HttpException + * @throws IOException + */ + private void readChunkedContent(PushbackInputStream in, StringBuffer line) + throws HttpException, IOException { + boolean doneChunks = false; + int contentBytesRead = 0; + byte[] bytes = new byte[Http.BUFFER_SIZE]; + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + + while (!doneChunks) { + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("Http: starting chunk"); + } + + readLine(in, line, false); + + String chunkLenStr; + // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); + // } + + int pos = line.indexOf(";"); + if (pos < 0) { + chunkLenStr = line.toString(); + } else { + chunkLenStr = line.substring(0, pos); + // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + + // line.substring(pos+1)); } + } + chunkLenStr = chunkLenStr.trim(); + int chunkLen; + try { + chunkLen = Integer.parseInt(chunkLenStr, 16); + } catch (NumberFormatException e) { + throw new HttpException("bad chunk length: " + line.toString()); + } + + if (chunkLen == 0) { + doneChunks = true; + break; + } + + if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http + .getMaxContent()) + chunkLen = http.getMaxContent() - contentBytesRead; + + // read one chunk + int chunkBytesRead = 0; + while (chunkBytesRead < chunkLen) { + + int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? + (chunkLen - chunkBytesRead) : + Http.BUFFER_SIZE; + int len = in.read(bytes, 0, toRead); + + if (len == -1) + throw new HttpException("chunk eof after " + contentBytesRead + + " bytes in successful chunks" + " and " + chunkBytesRead + + " in current chunk"); + + // DANGER!!! Will printed GZIPed stuff right to your + // terminal! + // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, + // len)); } + + out.write(bytes, 0, len); + chunkBytesRead += len; + } - private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + readLine(in, line, false); + + } + + if (!doneChunks) { + if (contentBytesRead != http.getMaxContent()) + throw new HttpException("chunk eof: !doneChunk && didn't max out"); + return; + } + + content = out.toByteArray(); + parseHeaders(in, line, null); + + } + + private int parseStatusLine(PushbackInputStream in, StringBuffer line) + throws IOException, HttpException { readLine(in, line, false); int codeStart = line.indexOf(" "); @@ -225,13 +466,15 @@ public class HttpResponse implements Response { try { code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); } catch (NumberFormatException e) { - throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e); + throw new HttpException( + "bad status line '" + line + "': " + e.getMessage(), e); } return code; } - private void processHeaderLine(StringBuffer line) throws IOException, HttpException { + private void processHeaderLine(StringBuffer line) + throws IOException, HttpException { int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { @@ -257,24 +500,29 @@ public class HttpResponse implements Response { } // Adds headers to our headers Metadata - private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + private void parseHeaders(PushbackInputStream in, StringBuffer line, + StringBuffer httpHeaders) throws IOException, HttpException { while (readLine(in, line, true) != 0) { + if (httpHeaders != null) + httpHeaders.append(line).append("\n"); + // handle HTTP responses with missing blank line after headers int pos; - if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1) - || ((pos = line.indexOf("<html")) != -1)) { + if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ( + (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html")) + != -1)) { in.unread(line.substring(pos).getBytes("UTF-8")); line.setLength(pos); try { - //TODO: (CM) We don't know the header names here - //since we're just handling them generically. It would - //be nice to provide some sort of mapping function here - //for the returned header names to the standard metadata - //names in the ParseData class + // TODO: (CM) We don't know the header names here + // since we're just handling them generically. It would + // be nice to provide some sort of mapping function here + // for the returned header names to the standard metadata + // names in the ParseData class processHeaderLine(line); } catch (Exception e) { // fixme: @@ -287,8 +535,8 @@ public class HttpResponse implements Response { } } - private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine) - throws IOException { + private static int readLine(PushbackInputStream in, StringBuffer line, + boolean allowContinuedLine) throws IOException { line.setLength(0); for (int c = in.read(); c != -1; c = in.read()) { switch (c) { @@ -322,26 +570,4 @@ public class HttpResponse implements Response { return value; } - public URL getUrl() { - return url; - } - - public String getHeader(String name) { - return headers.get(name); - } - - public Metadata getHeaders() { - return headers; - } - - public byte[] getContent() { - return content; - } - - @Override - public int getCode() { - // TODO Auto-generated method stub - return code; - } } -
