Hello,

While using PoDoFo, I've encountered some slightly non-compliant PDFs with
an ID entry in the trailer stored as an indirect object, when the spec
mandates that it be a direct object. (At least one of the offending files
was from QuarkXPress on Mac OS X 10.3; this may not be a widespread
problem.) This causes PoDoFo to error out when parsing the file.

I've added a very simple fix and a test case to accompany it. When PoDoFo
writes out a modified version of one of these PDFs, it writes the ID into
the trailer as a direct object regardless, so this only needs to be dealt
with when parsing the PDFs.

Does this seem sensible?

Thanks,

-- 
Clayton Wheeler
cwhee...@genomenon.com
From e3daead203d7ebb6a35bc84398531f03bb174b6b Mon Sep 17 00:00:00 2001
From: Clayton Wheeler <cwhee...@genomenon.com>
Date: Thu, 11 Oct 2018 17:49:49 -0500
Subject: [PATCH] Handle trailer ID (incorrectly) being an indirect object

Some non-conforming PDF writers (QuarkXPress and/or Quartz ca. Mac OS
X 10.3.9, evidently) can write a file identifier in the trailer as an
indirect object. This is contrary to the specification, but worth
handling since it otherwise breaks PoDoFo.
---
 src/base/PdfWriter.cpp   |  6 ++++
 test/unit/ParserTest.cpp | 66 ++++++++++++++++++++++++++++++++++++++++
 test/unit/ParserTest.h   |  3 ++
 3 files changed, 75 insertions(+)

diff --git a/src/base/PdfWriter.cpp b/src/base/PdfWriter.cpp
index 237e0ff..b2a558f 100644
--- a/src/base/PdfWriter.cpp
+++ b/src/base/PdfWriter.cpp
@@ -686,6 +686,12 @@ void PdfWriter::CreateFileIdentifier( PdfString & identifier, const PdfObject* p
     if( pOriginalIdentifier && pTrailer->GetDictionary().HasKey( "ID" ))
     {
         const PdfObject* idObj = pTrailer->GetDictionary().GetKey("ID");
+        // Per the PDF spec, section 7.5.5, the ID shall be a direct object.
+        // If a non-conforming writer (e.g. Quark and/or Quartz) writes it as
+        // an indirect object, we should handle that case.
+        if ( idObj->IsReference() ) {
+            idObj = m_vecObjects->GetObject( idObj->GetReference() );
+        }
 
         TCIVariantList it = idObj->GetArray().begin();
         if( it != idObj->GetArray().end() &&
diff --git a/test/unit/ParserTest.cpp b/test/unit/ParserTest.cpp
index d0014cd..a34c039 100644
--- a/test/unit/ParserTest.cpp
+++ b/test/unit/ParserTest.cpp
@@ -1981,6 +1981,72 @@ void ParserTest::testIsPdfFile()
     }     
 }
 
+void ParserTest::testRoundTripIndirectTrailerID()
+{
+    // Builds a minimal PDF whose trailer /ID is an indirect reference (non-conforming),
+    // loads it for update and writes the update; neither step may throw a PdfError.
+    std::ostringstream oss;
+    oss << "%PDF-1.1\n";
+    int nCurObj = 0;
+    int objPos[20]; // byte offset of each object, indexed by object number
+
+    // Pages
+    int nPagesObj = nCurObj;
+    objPos[nCurObj] = oss.tellp();
+    oss << nCurObj++ << " 0 obj\n";
+    oss << "<</Type /Pages /Count 0 /Kids []>>\n";
+    oss << "endobj\n";
+
+    // Root catalog
+    int rootObj = nCurObj;
+    objPos[nCurObj] = oss.tellp();
+    oss << nCurObj++ << " 0 obj\n";
+    oss << "<</Type /Catalog /Pages " << nPagesObj << " 0 R>>\n";
+    oss << "endobj\n";
+
+    // ID
+    int nIdObj = nCurObj;
+    objPos[nCurObj] = oss.tellp();
+    oss << nCurObj++ << " 0 obj\n";
+    oss << "[<F1E375363A6314E3766EDF396D614748> <F1E375363A6314E3766EDF396D614748>]\n";
+    oss << "endobj\n";
+
+    int nXrefPos = oss.tellp();
+    oss << "xref\n";
+    oss << "0 " << nCurObj << "\n";
+    char objRec[21]; // one xref entry is exactly 20 bytes plus the terminating NUL
+    for ( int i = 0; i < nCurObj; i++ ) {
+        snprintf( objRec, 21, "%010d 00000 n \n", objPos[i] );
+        oss << objRec;
+    }
+    oss << "trailer <<\n"
+        << "  /Size " << nCurObj << "\n"
+        << "  /Root " << rootObj << " 0 R\n"
+        << "  /ID " << nIdObj << " 0 R\n" // illegal indirect ID
+        << ">>\n"
+        << "startxref\n"
+        << nXrefPos << "\n"
+        << "%%EOF\n";
+
+    std::string sInBuf = oss.str();
+    //std::cerr << sInBuf;
+    try {
+        PoDoFo::PdfMemDocument doc;
+        // load for update
+        doc.LoadFromBuffer( sInBuf.c_str(), sInBuf.size(), true );
+
+        PoDoFo::PdfRefCountedBuffer outBuf;
+        PoDoFo::PdfOutputDevice outDev( &outBuf );
+
+        doc.WriteUpdate( &outDev );
+        // should not throw
+        CPPUNIT_ASSERT( true );
+    } catch ( PoDoFo::PdfError& error ) {
+        //error.PrintErrorMsg();
+        CPPUNIT_FAIL( "Unexpected PdfError" );
+    }
+}
+
 std::string ParserTest::generateXRefEntries( size_t count )
 {
     std::string strXRefEntries;
diff --git a/test/unit/ParserTest.h b/test/unit/ParserTest.h
index b8f7ea9..cffcaaa 100644
--- a/test/unit/ParserTest.h
+++ b/test/unit/ParserTest.h
@@ -41,6 +41,7 @@ class ParserTest : public CppUnit::TestFixture
     CPPUNIT_TEST( testReadXRefStreamContents );
     CPPUNIT_TEST( testReadObjects );
     CPPUNIT_TEST( testIsPdfFile );
+    CPPUNIT_TEST( testRoundTripIndirectTrailerID );
     CPPUNIT_TEST_SUITE_END();
 
 public:
@@ -77,6 +78,8 @@ public:
     //void testReadNextTrailer();
     //void testCheckEOFMarker();
 
+    void testRoundTripIndirectTrailerID();
+
 private:
     std::string generateXRefEntries( size_t count );
     bool canOutOfMemoryKillUnitTests();
-- 
2.19.1

_______________________________________________
Podofo-users mailing list
Podofo-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/podofo-users

Reply via email to