This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new b6f645a4d NUTCH-3001 - fix logic for grabbing bytes if there's no content type in the header new f078a88df Merge pull request #774 from tballison/NUTCH-3001 b6f645a4d is described below commit b6f645a4d025fa136f557dd37e9aba611b425fbb Author: tallison <talli...@apache.org> AuthorDate: Wed Sep 13 10:37:17 2023 -0400 NUTCH-3001 - fix logic for grabbing bytes if there's no content type in the header --- .../nutch/protocol/selenium/HttpResponse.java | 78 ++++++++++------------ 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index bb3bf6357..750677374 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -269,55 +269,51 @@ public class HttpResponse implements Response { String contentType = getHeader(Response.CONTENT_TYPE); // handle with Selenium only if content type in HTML or XHTML - if (contentType != null) { - if (contentType.contains("text/html") - || contentType.contains("application/xhtml")) { - readPlainContent(url); - } else { - try { - int contentLength = Integer.MAX_VALUE; - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - try { - contentLength = Integer.parseInt(contentLengthString.trim()); - } catch (NumberFormatException ex) { - throw new HttpException( - "bad content length: " + contentLengthString); - } + if (contentType != null && + (contentType.contains("text/html") || contentType.contains("application/xhtml"))) { + readPlainContent(url); + } else { + try { + int contentLength = Integer.MAX_VALUE; + String contentLengthString = headers.get(Response.CONTENT_LENGTH); + if 
(contentLengthString != null) { + try { + contentLength = Integer.parseInt(contentLengthString.trim()); + } catch (NumberFormatException ex) { + throw new HttpException("bad content length: " + contentLengthString); } + } - if (http.getMaxContent() >= 0 - && contentLength > http.getMaxContent()) { - contentLength = http.getMaxContent(); - } + if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { + contentLength = http.getMaxContent(); + } - byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; - int bufferFilled = 0; - int totalRead = 0; - ByteArrayOutputStream out = new ByteArrayOutputStream(); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 - && totalRead + bufferFilled <= contentLength) { - totalRead += bufferFilled; - out.write(buffer, 0, bufferFilled); - } + byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; + int bufferFilled = 0; + int totalRead = 0; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 && + totalRead + bufferFilled <= contentLength) { + totalRead += bufferFilled; + out.write(buffer, 0, bufferFilled); + } - content = out.toByteArray(); + content = out.toByteArray(); - } catch (Exception e) { - if (code == 200) - throw new IOException(e.toString()); - // for codes other than 200 OK, we are fine with empty content - } finally { - if (in != null) { - in.close(); - } + } catch (Exception e) { + if (code == 200) { + throw new IOException(e.toString()); + } + // for codes other than 200 OK, we are fine with empty content + } finally { + if (in != null) { + in.close(); } - } - if (httpHeaders != null) { - headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); } } - + if (httpHeaders != null) { + headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); + } } catch(KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) { throw new ProtocolException(e); } finally {