Author: lehmi
Date: Fri May 5 16:00:21 2017
New Revision: 1794069
URL: http://svn.apache.org/viewvc?rev=1794069&view=rev

Log:
PDFBOX-3347: fallback to ISO-8859-1 for names with invalid UTF-8

Modified: pdfbox/branches/2.0/ (props changed) pdfbox/branches/2.0/pdfbox/ (props changed) pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Propchange: pdfbox/branches/2.0/ ------------------------------------------------------------------------------ --- svn:mergeinfo (original) +++ svn:mergeinfo Fri May 5 16:00:21 2017 @@ -1,3 +1,3 @@ /pdfbox/branches/no-awt:1618517-1621410 /pdfbox/no-awt:1618514-1618516 -/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1766088,1766213,1768061,1770985,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745 +/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1743248,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1766088,1766213,1768061,1770985,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745 Propchange: pdfbox/branches/2.0/pdfbox/ ------------------------------------------------------------------------------ --- svn:mergeinfo (original) +++ svn:mergeinfo Fri May 5 16:00:21 2017 @@ -1,3 +1,3 @@ /pdfbox/branches/no-awt/pdfbox:1618517-1621410 /pdfbox/no-awt/pdfbox:1618514-1618516 -/pdfbox/trunk/pdfbox:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1757165,1758817,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745 +/pdfbox/trunk/pdfbox:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160,1742437,1742442,1743248,1745595,1745606,1745772,1745774,1745776,1745779,1746032,1746151,1749162,1749165,1749432,1757165,1758817,1770988,1772528,1778172,1782679,1786586,1786603,1787546,1790745 Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1794069&r1=1794068&r2=1794069&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Fri May 5 16:00:21 2017 @@ -18,6 +18,9 @@ package org.apache.pdfbox.pdfparser; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CharsetDecoder; import java.util.Arrays; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -779,11 +782,39 @@ public abstract class BaseParser { seqSource.unread(c); } - String string = new String(buffer.toByteArray(), Charsets.UTF_8); + + byte[] bytes = buffer.toByteArray(); + String string; + if (isValidUTF8(bytes)) + { + string = new String(buffer.toByteArray(), Charsets.UTF_8); + } + else + { + // some malformed PDFs don't use UTF-8 see PDFBOX-3347 + string = new String(buffer.toByteArray(), Charsets.ISO_8859_1); + } return COSName.getPDFName(string); } /** + * Returns true if a byte sequence is valid UTF-8. + */ + private boolean isValidUTF8(byte[] input) + { + CharsetDecoder cs = Charsets.UTF_8.newDecoder(); + try + { + cs.decode(ByteBuffer.wrap(input)); + return true; + } + catch (CharacterCodingException e) + { + return false; + } + } + + /** * This will parse a boolean object from the stream. * * @return The parsed boolean object.