Author: ab Date: Mon Sep 22 09:02:40 2008 New Revision: 697878 URL: http://svn.apache.org/viewvc?rev=697878&view=rev Log: NUTCH-375 - Add support for Content-Encoding: deflate.
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=697878&r1=697877&r2=697878&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Sep 22 09:02:40 2008 @@ -268,6 +268,9 @@ 98. NUTCH-651 - Remove bin/{start|stop}-balancer.sh from svn tracking. (dogacan) +99. NUTCH-375 - Add support for Content-Encoding: deflated + (Pascal Beis, ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=697878&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Mon Sep 22 09:02:40 2008 @@ -0,0 +1,142 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; +import java.util.zip.DeflaterOutputStream; + +// Commons Logging imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A collection of utility methods for working on deflated data. + */ +public class DeflateUtils { + + private static final Log LOG = LogFactory.getLog(DeflateUtils.class); + private static final int EXPECTED_COMPRESSION_RATIO = 5; + private static final int BUF_SIZE = 4096; + + /** + * Returns an inflated copy of the input array. If the deflated + * input has been truncated or corrupted, a best-effort attempt is + * made to inflate as much as possible. If no data can be extracted + * <code>null</code> is returned. + */ + public static final byte[] inflateBestEffort(byte[] in) { + return inflateBestEffort(in, Integer.MAX_VALUE); + } + + /** + * Returns an inflated copy of the input array, truncated to + * <code>sizeLimit</code> bytes, if necessary. If the deflated input + * has been truncated or corrupted, a best-effort attempt is made to + * inflate as much as possible. If no data can be extracted + * <code>null</code> is returned. + */ + public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) { + // decompress using InflaterInputStream + ByteArrayOutputStream outStream = + new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length); + + // "true" because HTTP does not provide zlib headers + Inflater inflater = new Inflater(true); + InflaterInputStream inStream = + new InflaterInputStream(new ByteArrayInputStream(in), inflater); + + byte[] buf = new byte[BUF_SIZE]; + int written = 0; + while (true) { + try { + int size = inStream.read(buf); + if (size <= 0) + break; + if ((written + size) > sizeLimit) { + outStream.write(buf, 0, sizeLimit - written); + break; + } + outStream.write(buf, 0, size); + written+= size; + } catch (Exception e) { + LOG.info( "Caught Exception in inflateBestEffort" ); + e.printStackTrace(LogUtil.getWarnStream(LOG)); + break; + } + } + try { + outStream.close(); + } catch (IOException e) { + } + + return outStream.toByteArray(); + } + + + /** + * Returns an inflated copy of the input array. + * @throws IOException if the input cannot be properly decompressed + */ + public static final byte[] inflate(byte[] in) throws IOException { + // decompress using InflaterInputStream + ByteArrayOutputStream outStream = + new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length); + + InflaterInputStream inStream = + new InflaterInputStream ( new ByteArrayInputStream(in) ); + + byte[] buf = new byte[BUF_SIZE]; + while (true) { + int size = inStream.read(buf); + if (size <= 0) + break; + outStream.write(buf, 0, size); + } + outStream.close(); + + return outStream.toByteArray(); + } + + /** + * Returns a deflated copy of the input array. + */ + public static final byte[] deflate(byte[] in) { + // compress using DeflaterOutputStream + ByteArrayOutputStream byteOut = + new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO); + + DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut); + + try { + outStream.write(in); + } catch (Exception e) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + } + + try { + outStream.close(); + } catch (IOException e) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + } + + return byteOut.toByteArray(); + } +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=697878&r1=697877&r2=697878&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Sep 22 09:02:40 2008 @@ -38,6 +38,7 @@ import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.protocol.RobotRules; import org.apache.nutch.util.GZIPUtils; +import org.apache.nutch.util.DeflateUtils; import org.apache.nutch.util.LogUtil; // Hadoop imports @@ -498,7 +499,24 @@ } return content; } - + + public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException { + + if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); } + + byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); + + if (content == null) + throw new IOException("inflateBestEffort returned null"); + + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("fetched " + compressed.length + + " bytes of compressed content (expanded to " + + content.length + " bytes) from " + url); + } + return content; + } + protected static void main(HttpBase http, String[] args) throws Exception { boolean verbose = false; String url = null; Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Sep 22 09:02:40 2008 @@ -113,7 +113,7 @@ reqStr.append(portString); reqStr.append("\r\n"); - reqStr.append("Accept-Encoding: x-gzip, gzip\r\n"); + reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); String userAgent = http.getUserAgent(); if ((userAgent == null) || (userAgent.length() == 0)) { @@ -156,6 +156,8 @@ String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { content = http.processGzipEncoded(content, url); + } else if ("deflate".equals(contentEncoding)) { + content = http.processDeflateEncoded(content, url); } else { if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetched " + content.length + " bytes from " + url); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=697878&r1=697877&r2=697878&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Sep 22 09:02:40 2008 @@ -188,7 +188,7 @@ headers.add(new Header("Accept", "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); // accept gzipped content - headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); + headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate")); hostConf.getParams().setParameter("http.default-headers", headers); // HTTP proxy server details Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Mon Sep 22 09:02:40 2008 @@ -150,7 +150,7 @@ if (getHeader(Response.LOCATION) != null) fetchTrace.append("; Location: " + getHeader(Response.LOCATION)); } - // Extract gzip and x-gzip files + // Extract gzip, x-gzip and deflate content if (content != null) { // check if we have to uncompress it String contentEncoding = headers.get(Response.CONTENT_ENCODING); @@ -161,6 +161,10 @@ content = http.processGzipEncoded(content, url); if (Http.LOG.isTraceEnabled()) fetchTrace.append("; extracted to " + content.length + " bytes"); + } else if ("deflate".equals(contentEncoding)) { + content = http.processDeflateEncoded(content, url); + if (Http.LOG.isTraceEnabled()) + fetchTrace.append("; extracted to " + content.length + " bytes"); } }