Hello, While using PoDoFo, I've encountered some slightly non-compliant PDFs with an ID entry in the trailer stored as an indirect object, when the spec mandates that it be a direct object. (At least one of the offending files was from QuarkXPress on Mac OS X 10.3; this may not be a widespread problem.) This causes PoDoFo to error out when parsing the file.
I've added a very simple fix and a test case to accompany it. When PoDoFo writes out a modified version of one of these PDFs, it writes the ID into the trailer as a direct object regardless, so this only needs to be dealt with when parsing the PDFs. Does this seem sensible? Thanks, -- Clayton Wheeler cwhee...@genomenon.com
From e3daead203d7ebb6a35bc84398531f03bb174b6b Mon Sep 17 00:00:00 2001 From: Clayton Wheeler <cwhee...@genomenon.com> Date: Thu, 11 Oct 2018 17:49:49 -0500 Subject: [PATCH] Handle trailer ID (incorrectly) being an indirect object Some non-conforming PDF writers (QuarkXPress and/or Quartz ca. Mac OS X 10.3.9, evidently) can write a file identifier in the trailer as an indirect object. This is contrary to the specification, but worth handling since it otherwise breaks PoDoFo. --- src/base/PdfWriter.cpp | 6 ++++ test/unit/ParserTest.cpp | 66 ++++++++++++++++++++++++++++++++++++++++ test/unit/ParserTest.h | 3 ++ 3 files changed, 75 insertions(+) diff --git a/src/base/PdfWriter.cpp b/src/base/PdfWriter.cpp index 237e0ff..b2a558f 100644 --- a/src/base/PdfWriter.cpp +++ b/src/base/PdfWriter.cpp @@ -686,6 +686,12 @@ void PdfWriter::CreateFileIdentifier( PdfString & identifier, const PdfObject* p if( pOriginalIdentifier && pTrailer->GetDictionary().HasKey( "ID" )) { const PdfObject* idObj = pTrailer->GetDictionary().GetKey("ID"); + // Per the PDF spec, section 7.5.5, the ID shall be a direct object. + // If a non-conforming writer (e.g. Quark and/or Quartz) writes it as + // an indirect object, we should handle that case. 
+ if ( idObj->IsReference() ) { + idObj = m_vecObjects->GetObject( idObj->GetReference() ); + } TCIVariantList it = idObj->GetArray().begin(); if( it != idObj->GetArray().end() && diff --git a/test/unit/ParserTest.cpp b/test/unit/ParserTest.cpp index d0014cd..a34c039 100644 --- a/test/unit/ParserTest.cpp +++ b/test/unit/ParserTest.cpp @@ -1981,6 +1981,72 @@ void ParserTest::testIsPdfFile() } } +void ParserTest::testRoundTripIndirectTrailerID() +{ + std::ostringstream oss; + oss << "%PDF-1.1\n"; + int nCurObj = 0; + int objPos[20]; + + // Pages + + int nPagesObj = nCurObj; + objPos[objN] = oss.tellp(); + oss << nCurObj++ << " 0 obj\n"; + oss << "<</Type /Pages /Count 0 /Kids []>>\n"; + oss << "endobj"; + + // Root catalog + + int rootObj = nCurObj; + objPos[objN] = oss.tellp(); + oss << nCurObj++ << " 0 obj\n"; + oss << "<</Type /Catalog /Pages " << nPagesObj << " 0 R>>\n"; + oss << "endobj\n"; + + // ID + int nIdObj = nCurObj; + objPos[objN] = oss.tellp(); + oss << nCurObj++ << " 0 obj\n"; + oss << "[<F1E375363A6314E3766EDF396D614748> <F1E375363A6314E3766EDF396D614748>]\n"; + oss << "endobj\n"; + + int nXrefPos = oss.tellp(); + oss << "xref\n"; + oss << "0 " << nCurObj << "\n"; + char objRec[21]; + for ( int i = 0; i < nCurObj; i++ ) { + snprintf( objRec, 21, "%010d 00000 n \n", objPos[i] ); + oss << objRec; + } + oss << "trailer <<\n" + << " /Size " << nCurObj << "\n" + << " /Root " << rootObj << " 0 R\n" + << " /ID " << nIdObj << " 0 R\n" // illegal indirect ID + << ">>\n" + << "startxref\n" + << nXrefPos << "\n" + << "%%EOF\n"; + + std::string sInBuf = oss.str(); + //std::cerr << inBuf; + try { + PoDoFo::PdfMemDocument doc; + // load for update + doc.LoadFromBuffer( inBuf.c_str(), inBuf.size(), true ); + + PoDoFo::PdfRefCountedBuffer outBuf; + PoDoFo::PdfOutputDevice outDev( &outBuf ); + + doc.WriteUpdate( &outDev ); + // should not throw + CPPUNIT_ASSERT( true ); + } catch ( PoDoFo::PdfError& error ) { + //error.PrintErrorMsg(); + CPPUNIT_FAIL( "Unexpected 
PdfError" ); + } +} + std::string ParserTest::generateXRefEntries( size_t count ) { std::string strXRefEntries; diff --git a/test/unit/ParserTest.h b/test/unit/ParserTest.h index b8f7ea9..cffcaaa 100644 --- a/test/unit/ParserTest.h +++ b/test/unit/ParserTest.h @@ -41,6 +41,7 @@ class ParserTest : public CppUnit::TestFixture CPPUNIT_TEST( testReadXRefStreamContents ); CPPUNIT_TEST( testReadObjects ); CPPUNIT_TEST( testIsPdfFile ); + CPPUNIT_TEST( testRoundTripIndirectTrailerID ); CPPUNIT_TEST_SUITE_END(); public: @@ -77,6 +78,8 @@ public: //void testReadNextTrailer(); //void testCheckEOFMarker(); + void testRoundTripIndirectTrailerID(); + private: std::string generateXRefEntries( size_t count ); bool canOutOfMemoryKillUnitTests(); -- 2.19.1
_______________________________________________ Podofo-users mailing list Podofo-users@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/podofo-users