Author: tilman
Date: Tue Feb 25 19:56:41 2014
New Revision: 1571806

URL: http://svn.apache.org/r1571806
Log:
PDFBOX-1147: rewrote LZW filter after failure to find bug; PDFBOX-205: catch 
EOF if EOD marker is missing; delete files that are no longer needed.

Removed:
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWDictionary.java
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWNode.java
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/io/NBitInputStream.java
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/io/NBitOutputStream.java
Modified:
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java

Modified: 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java?rev=1571806&r1=1571805&r2=1571806&view=diff
==============================================================================
--- 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java
 (original)
+++ 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java
 Tue Feb 25 19:56:41 2014
@@ -1,10 +1,9 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -16,196 +15,257 @@
  */
 package org.apache.pdfbox.filter;
 
-import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.io.StreamCorruptedException;
-
+import java.util.ArrayList;
+import java.util.Arrays;
+import javax.imageio.stream.MemoryCacheImageInputStream;
+import javax.imageio.stream.MemoryCacheImageOutputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSDictionary;
 
-import org.apache.pdfbox.io.NBitInputStream;
-import org.apache.pdfbox.io.NBitOutputStream;
-
 /**
- * This is the used for the LZWDecode filter.
+ *
+ * This is the implementation of the LZWDecode filter.
  *
  * @author <a href="mailto:b...@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.15 $
+ * @author Tilman Hausherr
  */
 public class LZWFilter implements Filter
 {
+    /**
+     * Log instance.
+     */
+    private static final Log LOG = LogFactory.getLog(LZWFilter.class);
 
     /**
      * The LZW clear table code.
      */
     public static final long CLEAR_TABLE = 256;
+
     /**
      * The LZW end of data code.
      */
     public static final long EOD = 257;
 
     /**
+     * The LZW code table.
+     */
+    private ArrayList<byte[]> codeTable = null;
+
+    /**
      * {@inheritDoc}
      */
-    public void decode( InputStream compressedData, OutputStream result, 
COSDictionary options, int filterIndex ) 
-        throws IOException
+    public void decode(InputStream compressedData, OutputStream result, 
COSDictionary options, int filterIndex)
+            throws IOException
     {
-        //log.debug("decode( )");
-        NBitInputStream in = null;
-        in = new NBitInputStream( compressedData );
-        in.setBitsInChunk( 9 );
-        LZWDictionary dic = new LZWDictionary();
-        byte firstByte = 0;
+        codeTable = null;
+        int chunk = 9;
+        MemoryCacheImageInputStream in = new 
MemoryCacheImageInputStream(compressedData);
         long nextCommand = 0;
-        while( (nextCommand = in.read() ) != EOD )
-        {
-            // log.debug( "decode - nextCommand=" + nextCommand + ", 
bitsInChunk: " + in.getBitsInChunk());
+        long prevCommand = -1;
 
-            if( nextCommand == CLEAR_TABLE )
-            {
-                in.setBitsInChunk( 9 );
-                dic = new LZWDictionary();
-            }
-            else
+        try
+        {
+            while ((nextCommand = in.readBits(chunk)) != EOD)
             {
-                byte[] data = dic.getData( nextCommand );
-                if( data == null )
+                if (nextCommand == CLEAR_TABLE)
                 {
-                    dic.visit( firstByte );
-                    data = dic.getData( nextCommand );
-                    dic.clear();
-                }
-                if( data == null )
-                {
-                    throw new StreamCorruptedException( "Error: data is null" 
);
-                }
-                dic.visit(data);
-
-                //log.debug( "decode - dic.getNextCode(): " + 
dic.getNextCode());
-
-                if( dic.getNextCode() >= 2047 )
-                {
-                    in.setBitsInChunk( 12 );
-                }
-                else if( dic.getNextCode() >= 1023 )
-                {
-                    in.setBitsInChunk( 11 );
-                }
-                else if( dic.getNextCode() >= 511 )
-                {
-                    in.setBitsInChunk( 10 );
+                    chunk = 9;
+                    initCodeTable();
+                    prevCommand = -1;
                 }
                 else
                 {
-                    in.setBitsInChunk( 9 );
+                    if (nextCommand < codeTable.size())
+                    {
+                        byte[] data = codeTable.get((int) nextCommand);
+                        byte firstByte = data[0];
+                        result.write(data);
+                        if (prevCommand != -1)
+                        {
+                            data = codeTable.get((int) prevCommand);
+                            byte[] newData = Arrays.copyOf(data, data.length + 
1);
+                            newData[data.length] = firstByte;
+                            codeTable.add(newData);
+                        }
+                    }
+                    else
+                    {
+                        byte[] data = codeTable.get((int) prevCommand);
+                        byte[] newData = Arrays.copyOf(data, data.length + 1);
+                        newData[data.length] = data[0];
+                        result.write(newData);
+                        codeTable.add(newData);
+                    }
+                    if (codeTable.size() >= 2047)
+                    {
+                        chunk = 12;
+                    }
+                    else if (codeTable.size() >= 1023)
+                    {
+                        chunk = 11;
+                    }
+                    else if (codeTable.size() >= 511)
+                    {
+                        chunk = 10;
+                    }
+                    else
+                    {
+                        chunk = 9;
+                    }
+                    prevCommand = nextCommand;
                 }
-                /**
-                if( in.getBitsInChunk() != dic.getCodeSize() )
-                {
-                    in.unread( nextCommand );
-                    in.setBitsInChunk( dic.getCodeSize() );
-                    System.out.print( "Switching " + nextCommand + " to " );
-                    nextCommand = in.read();
-                    System.out.println( "" +  nextCommand );
-                    data = dic.getData( nextCommand );
-                }**/
-                firstByte = data[0];
-                result.write( data );
             }
         }
+        catch (EOFException ex)
+        {
+            LOG.warn("Premature EOF in LZW stream, EOD code missing");
+        }
         result.flush();
     }
 
-
     /**
      * {@inheritDoc}
      */
-    public void encode( InputStream rawData, OutputStream result, 
COSDictionary options, int filterIndex ) 
-        throws IOException
+    public void encode(InputStream rawData, OutputStream result, COSDictionary 
options, int filterIndex)
+            throws IOException
     {
-        //log.debug("encode( )");
-        PushbackInputStream input = new PushbackInputStream( rawData, 4096 );
-        LZWDictionary dic = new LZWDictionary();
-        NBitOutputStream out = new NBitOutputStream( result );
-        out.setBitsInChunk( 9 ); //initially nine
-        out.write( CLEAR_TABLE );
-        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-        int byteRead = 0;
-        for( int i=0; (byteRead = input.read()) != -1; i++ )
-        {
-            //log.debug( "byteRead = '" + (char)byteRead + "' (0x" + 
Integer.toHexString(byteRead) + "), i=" + i);
-            buffer.write( byteRead );
-            dic.visit( (byte)byteRead );
-            out.setBitsInChunk( dic.getCodeSize() );
-
-            //log.debug( "Getting node '" + new String( buffer.toByteArray() ) 
+ "', buffer.size = " + buffer.size() );
-            LZWNode node = dic.getNode( buffer.toByteArray() );
-            int nextByte = input.read();
-            if( nextByte != -1 )
-            {
-                //log.debug( "nextByte = '" + (char)nextByte + "' (0x" + 
Integer.toHexString(nextByte) + ")");
-                LZWNode next = node.getNode( (byte)nextByte );
-                if( next == null )
-                {
-                    //log.debug("encode - No next node, writing node and 
resetting buffer (" +
-                    //          " node.getCode: " + node.getCode() + ")" +
-                    //          " bitsInChunk: " + out.getBitsInChunk() +
-                    //          ")");
-                    out.write( node.getCode() );
-                    buffer.reset();
-                }
+        initCodeTable();
+        int chunk = 9;
 
-                input.unread( nextByte );
+        byte[] inputPattern = null;
+        MemoryCacheImageOutputStream out = new 
MemoryCacheImageOutputStream(result);
+        out.writeBits(CLEAR_TABLE, chunk);
+        int foundCode = -1;
+        int r;
+        while ((r = rawData.read()) != -1)
+        {
+            byte by = (byte) r;
+            if (inputPattern == null)
+            {
+                inputPattern = new byte[]
+                {
+                    by
+                };
+                foundCode = by & 0xff;
             }
             else
             {
-                //log.debug("encode - EOF on lookahead: writing node, 
resetting buffer, and terminating read loop (" +
-                //          " node.getCode: " + node.getCode() + ")" +
-                //          " bitsInChunk: " + out.getBitsInChunk() +
-                //          ")");
-                out.write( node.getCode() );
-                buffer.reset();
-                break;
+                inputPattern = Arrays.copyOf(inputPattern, inputPattern.length 
+ 1);
+                inputPattern[inputPattern.length - 1] = by;
+                int newFoundCode = findPatternCode(codeTable, inputPattern);
+                if (newFoundCode == -1)
+                {
+                    // use previous
+                    out.writeBits(foundCode, chunk);
+                    // create new table entry
+                    codeTable.add(inputPattern);
+
+                    if (codeTable.size() == 4096)
+                    {
+                        // code table is full
+                        out.writeBits(CLEAR_TABLE, chunk);
+                        chunk = 9;
+                        initCodeTable();
+                    }
+
+                    inputPattern = new byte[]
+                    {
+                        by
+                    };
+                    foundCode = by & 0xff;
+                }
+                else
+                {
+                    foundCode = newFoundCode;
+                }
             }
-
-            if( dic.getNextCode() == 4096 )
+            if (codeTable.size() - 1 >= 2047)
             {
-                //log.debug("encode - Clearing dictionary and unreading 
pending buffer data (" +
-                //          " bitsInChunk: " + out.getBitsInChunk() +
-                //          ")");
-                out.write( CLEAR_TABLE );
-                dic = new LZWDictionary();
-                input.unread( buffer.toByteArray() );
-                buffer.reset();
+                chunk = 12;
+            }
+            else if (codeTable.size() - 1 >= 1023)
+            {
+                chunk = 11;
+            }
+            else if (codeTable.size() - 1 >= 511)
+            {
+                chunk = 10;
+            }
+            else
+            {
+                chunk = 9;
             }
         }
-
-        // Fix the code size based on the fact that we are writing the EOD
-        //
-        if( dic.getNextCode() >= 2047 )
-        {
-            out.setBitsInChunk( 12 );
-        }
-        else if( dic.getNextCode() >= 1023 )
+        if (foundCode != -1)
         {
-            out.setBitsInChunk( 11 );
+            out.writeBits(foundCode, chunk);
         }
-        else if( dic.getNextCode() >= 511 )
+        out.writeBits(EOD, chunk);
+        out.writeBits(0, 7);
+        out.flush(); // must do or file will be empty :-(
+        codeTable.clear();
+    }
+
+    /**
+     * Find the longest matching pattern in the code table.
+     *
+     * @param codeTable The LZW code table.
+     * @param pattern The pattern to be searched for.
+     * @return The index of the longest matching pattern or -1 if nothing is
+     * found.
+     */
+    private int findPatternCode(ArrayList<byte[]> codeTable, byte[] pattern)
+    {
+        int foundCode = -1;
+        int foundLen = 0;
+        for (int i = codeTable.size() - 1; i >= 0; --i)
         {
-            out.setBitsInChunk( 10 );
+            if (i <= EOD)
+            {
+                // we're in the single byte area
+                if (foundCode != -1)
+                {
+                    return foundCode; // we already found pattern with size > 1
+                }
+                else if (pattern.length > 1)
+                {
+                    return -1; // we won't find anything here anyway
+                }
+            }
+            byte[] tryPattern = codeTable.get(i);
+            if (foundCode != -1 || tryPattern.length > foundLen)
+            {
+                if (Arrays.equals(tryPattern, pattern))
+                {
+                    foundCode = i;
+                    foundLen = tryPattern.length;
+                }
+            }
         }
-        else
+        return foundCode;
+    }
+
+    /**
+     * Init the code table with 1 byte entries and the EOD and CLEAR_TABLE
+     * markers.
+     */
+    private void initCodeTable()
+    {
+        codeTable = new ArrayList<byte[]>(4096);
+        for (int i = 0; i < 256; ++i)
         {
-            out.setBitsInChunk( 9 );
+            codeTable.add(new byte[]
+            {
+                (byte) (i & 0xFF)
+            });
         }
-
-        //log.debug("encode - Writing EOD (" +
-        //          " bitsInChunk: " + out.getBitsInChunk() +
-        //          ")");
-        out.write( EOD );
-        out.close();
-        result.flush();
+        codeTable.add(null); // 256 CLEAR_TABLE
+        codeTable.add(null); // 257 EOD
     }
+
 }


Reply via email to