Author: tilman
Date: Tue Sep 9 08:34:00 2025
New Revision: 1928315

Log:
PDFBOX-6065: handle KwKwK special case, by Daniel Persson with ChatGPT

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java Tue Sep 9 08:33:57 2025 (r1928314)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/filter/LZWFilter.java Tue Sep 9 08:34:00 2025 (r1928315)
@@ -54,9 +54,6 @@ public class LZWFilter extends Filter
      * The LZW end of data code.
      */
     public static final long EOD = 257;
-
-    //BEWARE: codeTable must be local to each method, because there is only
-    //        one instance of each filter
 
     /**
      * {@inheritDoc}
@@ -73,11 +70,12 @@ public class LZWFilter extends Filter
 
     private static void doLZWDecode(InputStream encoded, OutputStream decoded, boolean earlyChange) throws IOException
     {
-        List<byte[]> codeTable = new ArrayList<>();
+        List<byte[]> codeTable = createCodeTable(); // includes CLEAR/EOD handling as needed
         int chunk = 9;
         final MemoryCacheImageInputStream in = new MemoryCacheImageInputStream(encoded);
+
+        byte[] prev = null; // no previous string yet
         long nextCommand;
-        long prevCommand = -1;
 
         try
         {
@@ -87,60 +85,50 @@ public class LZWFilter extends Filter
                 {
                     chunk = 9;
                     codeTable = createCodeTable();
-                    prevCommand = -1;
+                    prev = null;
+                    continue;
                 }
-                else
+
+                byte[] curr;
+
+                if (nextCommand < codeTable.size())
                 {
-                    if (nextCommand < codeTable.size())
-                    {
-                        byte[] data = codeTable.get((int) nextCommand);
-                        byte firstByte = data[0];
-                        decoded.write(data);
-                        if (prevCommand != -1)
-                        {
-                            checkIndexBounds(codeTable, prevCommand, in);
-                            data = codeTable.get((int) prevCommand);
-                            byte[] newData = Arrays.copyOf(data, data.length + 1);
-                            newData[data.length] = firstByte;
-                            codeTable.add(newData);
-                        }
-                    }
-                    else
+                    // Normal case: code exists
+                    curr = codeTable.get((int) nextCommand);
+                    decoded.write(curr);
+
+                    if (prev != null)
                     {
-                        checkIndexBounds(codeTable, prevCommand, in);
-                        byte[] data = codeTable.get((int) prevCommand);
-                        byte[] newData = Arrays.copyOf(data, data.length + 1);
-                        newData[data.length] = data[0];
-                        decoded.write(newData);
-                        codeTable.add(newData);
+                        // Add prev + first(curr)
+                        byte[] entry = Arrays.copyOf(prev, prev.length + 1);
+                        entry[prev.length] = curr[0];
+                        codeTable.add(entry);
                     }
-
-                    chunk = calculateChunk(codeTable.size(), earlyChange);
-                    prevCommand = nextCommand;
                 }
+                else if (nextCommand == codeTable.size() && prev != null)
+                {
+                    // KwKwK case: code equals next available index
+                    curr = Arrays.copyOf(prev, prev.length + 1);
+                    curr[prev.length] = prev[0];
+                    decoded.write(curr);
+                    codeTable.add(curr);
+                }
+                else
+                {
+                    // Corrupt stream (code out of range, or KwKwK without prev)
+                    throw new EOFException("Invalid LZW code: " + nextCommand);
+                }
+
+                prev = curr; // move forward
+                chunk = calculateChunk(codeTable.size(), earlyChange);
             }
         }
         catch (EOFException ex)
         {
             LOG.warn("Premature EOF in LZW stream, EOD code missing", ex);
         }
-        decoded.flush();
-    }
 
-    private static void checkIndexBounds(List<byte[]> codeTable, long index, MemoryCacheImageInputStream in)
-            throws IOException
-    {
-        if (index < 0)
-        {
-            throw new IOException("negative array index: " + index + " near offset "
-                    + in.getStreamPosition());
-        }
-        if (index >= codeTable.size())
-        {
-            throw new IOException("array index overflow: " + index +
-                    " >= " + codeTable.size() + " near offset "
-                    + in.getStreamPosition());
-        }
+        decoded.flush();
     }
 
     /**