Author: tilman
Date: Wed Mar 12 15:23:19 2025
New Revision: 1924338

URL: http://svn.apache.org/viewvc?rev=1924338&view=rev
Log:
PDFBOX-5974: check that all MCIDs of a page content stream have an entry in the ParentTree.

Modified:
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java?rev=1924338&r1=1924337&r2=1924338&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java Wed Mar 12 15:23:19 2025
@@ -27,6 +27,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.cos.COSArray;
@@ -44,9 +45,12 @@ import org.apache.pdfbox.pdmodel.PDResou
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
+import 
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDMarkedContentReference;
+import 
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDParentTreeValue;
 import 
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
 import 
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
 import 
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import 
org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
@@ -62,6 +66,7 @@ import org.apache.pdfbox.pdmodel.interac
 import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
 import org.apache.pdfbox.pdmodel.interactive.form.PDField;
 import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFMarkedContentExtractor;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotEquals;
@@ -648,6 +653,9 @@ class PDFMergerUtilityTest
     /**
      * PDFBOX-4408: Check that /StructParents values from pages and 
/StructParent values from
      * annotations are found in the /ParentTree.
+     * <p>
+     * Expanded in 2025 to check that all MCIDs of a page content stream have 
an entry in the
+     * ParentTree.
      *
      * @param document
      */
@@ -673,11 +681,71 @@ class PDFMergerUtilityTest
                 }
             }
         }
-        for (PDPage page : document.getPages())
+        PDPageTree pageTree = document.getPages();
+        for (PDPage page : pageTree)
         {
+            int pageNum = pageTree.indexOf(page) + 1;
             if (page.getStructParents() >= 0)
             {
-                assertTrue(keySet.contains(page.getStructParents()));
+                assertTrue(keySet.contains(page.getStructParents()), 
"/StructParents " + page.getStructParents() + " from page " +
+                           pageNum + " not found in /ParentTree");
+                PDParentTreeValue obj = (PDParentTreeValue) 
numberTreeAsMap.get(page.getStructParents());
+                assertTrue(obj.getCOSObject() instanceof COSArray, "Expected 
array in page " + pageNum + ", got " + obj.getClass());
+                COSArray array = (COSArray) obj.getCOSObject();
+
+                PDFMarkedContentExtractor markedContentExtractor = new 
PDFMarkedContentExtractor();
+                markedContentExtractor.processPage(page);
+                List<PDMarkedContent> markedContents = 
markedContentExtractor.getMarkedContents();
+                TreeSet<Integer> set = new TreeSet<>();
+                for (PDMarkedContent pdMarkedContent : markedContents)
+                {
+                    COSDictionary pdmcProperties = 
pdMarkedContent.getProperties();
+                    if (pdmcProperties == null)
+                    {
+                        continue;
+                    }
+                    int mcid = pdMarkedContent.getMCID();
+                    if (mcid >= 0)
+                    {
+                        // "For a page object (...), the value shall be an 
array of references
+                        // to the parent elements of those marked-content 
sequences."
+                        // this means that the /Pg entry doesn't have to match 
the page
+                        COSDictionary dict = (COSDictionary) 
array.getObject(mcid);
+                        assertNotNull(dict);
+                        set.add(mcid);
+                        PDStructureElement structureElemen = 
(PDStructureElement) PDStructureNode.create(dict);
+                        List<Object> kids = structureElemen.getKids();
+                        boolean found = false;
+                        for (Object kid : kids)
+                        {
+                            if (kid instanceof Integer && ((Integer) kid) == 
mcid)
+                            {
+                                found = true;
+                                break;
+                            }
+                            if (kid instanceof PDMarkedContentReference)
+                            {
+                                PDMarkedContentReference mcr = 
(PDMarkedContentReference) kid;
+                                if (mcid == mcr.getMCID())
+                                {
+                                    found = true;
+                                    if (mcr.getPage() != null)
+                                    {
+                                        assertEquals(page, mcr.getPage());
+                                    }
+                                    else
+                                    {
+                                        assertEquals(page, 
structureElemen.getPage());
+                                    }
+                                    break;
+                                }
+                            }
+                        }
+                        assertTrue(found, "page: " + pageNum + ", mcid: " + 
mcid + " not found");
+                    }
+                }
+                // actual count may be larger if last element is null, e.g. 
PDFBOX-4408
+                assertTrue(set.last() <= array.size() - 1);
             }
             for (PDAnnotation ann : page.getAnnotations())
             {


Reply via email to