[ 
https://issues.apache.org/jira/browse/PDFBOX-5355?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17474883#comment-17474883
 ] 

Tilman Hausherr commented on PDFBOX-5355:
-----------------------------------------

Here's some code for the ParentTree. It has not been tested much, but it shows 
the general direction. Sadly there's more to do: the K tree and the IDTree.
{code}
        PDStructureTreeRoot structureTreeRoot = 
doc.getDocumentCatalog().getStructureTreeRoot();
        PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();

        processNumberTreeNode(parentTree, doc);
    }

    private static void processNumberTreeNode(PDNumberTreeNode numberTreeNode, 
PDDocument doc) throws IOException
    {
        Map<Integer, COSObjectable> numbersMap = numberTreeNode.getNumbers();
        Set<Integer> keysToDelete = new HashSet<>();
        if (numbersMap != null)
        {
            for (Map.Entry<Integer, COSObjectable> entry : 
numbersMap.entrySet())
            {
                COSObjectable value = entry.getValue();
                if (value instanceof COSObject)
                {
                    value = value.getCOSObject();
                }
                if (value instanceof PDParentTreeValue)
                {
                    PDParentTreeValue ptValue = (PDParentTreeValue) value;
                    value = ptValue.getCOSObject();
                }
                System.out.println("key: " + entry.getKey());
                if (value instanceof COSArray)
                {
                    ArrayDeque<Integer> stack = new ArrayDeque<>();

                    COSArray array = (COSArray) value;
                    System.out.println("array size before processing: " + 
array.size());
                    for (int i = 0; i < array.size(); ++i)
                    {
                        COSBase base = array.getObject(i);
                        if (base instanceof COSDictionary)
                        {
                            PDStructureElement structureElement = 
(PDStructureElement) PDStructureNode.create((COSDictionary) base);
                            if (structureElement != null)
                            {
                                PDPage page = structureElement.getPage();
                                if (doc.getPages().indexOf(page) < 0)
                                {
                                    System.out.println("orphan at element " + 
i);
                                    stack.push(i);
                                }
                            }
                        }
                    }
                    // delete in reverse order
                    while (!stack.isEmpty())
                    {
                        int i = stack.pop();
                        array.remove(i);
                        System.out.println("orphan at element " + i + " 
removed");
                    }
                    System.out.println("array size after processing: " + 
array.size());
                    if (array.size() == 0)
                    {
                        System.out.println("empty array at key " + 
entry.getKey());
                        keysToDelete.add(entry.getKey());
                    }
                }
                else if (value instanceof COSDictionary)
                {
                    PDStructureNode node = 
PDStructureNode.create((COSDictionary) value);
                    if (node instanceof PDStructureElement)
                    {
                        PDPage page = ((PDStructureElement) node).getPage();
                        if (doc.getPages().indexOf(page) < 0)
                        {
                            System.out.println("orphan at key " + 
entry.getKey());
                            keysToDelete.add(entry.getKey());
                        }
                    }
                }
            }
        }
        List<PDNumberTreeNode> kids = numberTreeNode.getKids();
        if (kids != null)
        {
            for (PDNumberTreeNode node : kids)
            {
                if (node != null)
                {
                    processNumberTreeNode(node, doc);
                }
            }
        }
        if (!keysToDelete.isEmpty())
        {
            // delete after the loop
            Map<Integer, COSObjectable> newNumbersMap = new HashMap<>();
            for (Map.Entry<Integer, COSObjectable> entry : 
numbersMap.entrySet())
            {
                if (!keysToDelete.contains(entry.getKey()))
                {
                    newNumbersMap.put(entry.getKey(), entry.getValue());
                }
            }
            numberTreeNode.setNumbers(newNumbersMap);
        }
        doc.save(new 
File("96977696-26b5-4d53-8fe9-8c8f974de749.pdf-saved.txt"));
{code}


> remove page from pdf with image violate conformance level pdf1.7
> ----------------------------------------------------------------
>
>                 Key: PDFBOX-5355
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-5355
>             Project: PDFBox
>          Issue Type: Bug
>    Affects Versions: 2.0.20, 3.0.0 PDFBox
>            Reporter: lappa-lappa
>            Priority: Major
>              Labels: StructureTree
>         Attachments: 554dd11e-3c3d-44cd-bc31-d8f960e671e3.pdf, 
> 96977696-26b5-4d53-8fe9-8c8f974de749.pdf, image-2022-01-11-12-57-46-755.png, 
> pdf_result.pdf, with_image.pdf
>
>
> Open [https://www.pdf-online.com/osa/validate.aspx] and upload the 
> "with_image.pdf" document; validation passes.
> Then execute the following code (update the absolute paths to the files):
> {code}
> byte[] withImage = readFile("C:/r/pdf/with_image.pdf");
> try (PDDocument boxDocument = Loader.loadPDF(withImage)) {
>     boxDocument.removePage(0);
>     try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
>         boxDocument.save(bos);
>         byte[] pdfBytes = bos.toByteArray();
>         Files.write(Path.of("C:/r/pdf/pdf_result.pdf"), pdfBytes);
>     }
> } catch (IOException e) {
>     e.printStackTrace();
> }
> {code}
> Upload pdf_result.pdf to [https://www.pdf-online.com/osa/validate.aspx]; 
> validation does not pass.



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to