[
https://issues.apache.org/jira/browse/PDFBOX-5355?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17474883#comment-17474883
]
Tilman Hausherr commented on PDFBOX-5355:
-----------------------------------------
Here's some code for the ParentTree. It has not been tested much, but it shows the general
direction. Sadly, there's more to do: the K tree and the IDTree.
{code}
PDStructureTreeRoot structureTreeRoot =
doc.getDocumentCatalog().getStructureTreeRoot();
PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
processNumberTreeNode(parentTree, doc);
}
/**
 * Removes entries of a number tree node (used here for the ParentTree) that
 * refer to structure elements whose page is not part of the document, then
 * does the same for each kid node.
 *
 * <p>For an array value, every orphaned element is removed from the array; the
 * key itself is dropped when the array ends up empty. For a dictionary value,
 * the key is dropped when the dictionary is an orphaned structure element.
 * Progress is printed to {@code System.out}.
 *
 * @param numberTreeNode the number tree node to clean up
 * @param doc the document whose page tree decides what counts as an orphan
 * @throws IOException propagated from the PDFBox calls used to read the tree
 */
private static void processNumberTreeNode(PDNumberTreeNode numberTreeNode,
        PDDocument doc) throws IOException
{
    Map<Integer, COSObjectable> numbersMap = numberTreeNode.getNumbers();
    Set<Integer> keysToDelete = new HashSet<>();
    if (numbersMap != null)
    {
        for (Map.Entry<Integer, COSObjectable> entry : numbersMap.entrySet())
        {
            COSObjectable value = entry.getValue();
            if (value instanceof COSObject)
            {
                // Follow the indirect reference to the object it points to;
                // getCOSObject() on a COSObject would just hand back the
                // reference itself.
                value = ((COSObject) value).getObject();
            }
            if (value instanceof PDParentTreeValue)
            {
                // Unwrap to the underlying COS array or dictionary.
                value = ((PDParentTreeValue) value).getCOSObject();
            }
            System.out.println("key: " + entry.getKey());
            if (value instanceof COSArray)
            {
                ArrayDeque<Integer> stack = new ArrayDeque<>();
                COSArray array = (COSArray) value;
                System.out.println("array size before processing: " + array.size());
                for (int i = 0; i < array.size(); ++i)
                {
                    COSBase base = array.getObject(i);
                    if (!(base instanceof COSDictionary))
                    {
                        continue;
                    }
                    // create() may return a node that is not a structure
                    // element (e.g. the structure tree root), so check the
                    // type instead of casting blindly.
                    PDStructureNode structureNode =
                            PDStructureNode.create((COSDictionary) base);
                    if (structureNode instanceof PDStructureElement)
                    {
                        // NOTE(review): if getPage() can return null for an
                        // element without a page entry, indexOf(null) < 0 and
                        // the element is treated as an orphan - confirm that
                        // this is intended.
                        PDPage page = ((PDStructureElement) structureNode).getPage();
                        if (doc.getPages().indexOf(page) < 0)
                        {
                            System.out.println("orphan at element " + i);
                            stack.push(i);
                        }
                    }
                }
                // Delete in reverse order so that the remaining indices stay
                // valid: the stack pops the highest index first.
                while (!stack.isEmpty())
                {
                    int i = stack.pop();
                    array.remove(i);
                    System.out.println("orphan at element " + i + " removed");
                }
                System.out.println("array size after processing: " + array.size());
                if (array.size() == 0)
                {
                    System.out.println("empty array at key " + entry.getKey());
                    keysToDelete.add(entry.getKey());
                }
            }
            else if (value instanceof COSDictionary)
            {
                PDStructureNode node = PDStructureNode.create((COSDictionary) value);
                if (node instanceof PDStructureElement)
                {
                    PDPage page = ((PDStructureElement) node).getPage();
                    if (doc.getPages().indexOf(page) < 0)
                    {
                        System.out.println("orphan at key " + entry.getKey());
                        keysToDelete.add(entry.getKey());
                    }
                }
            }
        }
    }
    List<PDNumberTreeNode> kids = numberTreeNode.getKids();
    if (kids != null)
    {
        for (PDNumberTreeNode node : kids)
        {
            if (node != null)
            {
                processNumberTreeNode(node, doc);
            }
        }
    }
    if (!keysToDelete.isEmpty())
    {
        // Keys are removed only after the iteration above has finished: build
        // a filtered copy of the map and store it back on the node.
        Map<Integer, COSObjectable> newNumbersMap = new HashMap<>();
        for (Map.Entry<Integer, COSObjectable> entry : numbersMap.entrySet())
        {
            if (!keysToDelete.contains(entry.getKey()))
            {
                newNumbersMap.put(entry.getKey(), entry.getValue());
            }
        }
        numberTreeNode.setNumbers(newNumbersMap);
    }
// Saves the document to the given file.
// NOTE(review): in this snippet the statement follows the body of
// processNumberTreeNode without a closing brace in between; presumably it
// belongs to the caller, after the top-level processNumberTreeNode(...) call,
// rather than to the recursive method - confirm.
doc.save(new
File("96977696-26b5-4d53-8fe9-8c8f974de749.pdf-saved.txt"));
{code}
> remove page from pdf with image violate conformance level pdf1.7
> ----------------------------------------------------------------
>
> Key: PDFBOX-5355
> URL: https://issues.apache.org/jira/browse/PDFBOX-5355
> Project: PDFBox
> Issue Type: Bug
> Affects Versions: 2.0.20, 3.0.0 PDFBox
> Reporter: lappa-lappa
> Priority: Major
> Labels: StructureTree
> Attachments: 554dd11e-3c3d-44cd-bc31-d8f960e671e3.pdf,
> 96977696-26b5-4d53-8fe9-8c8f974de749.pdf, image-2022-01-11-12-57-46-755.png,
> pdf_result.pdf, with_image.pdf
>
>
> Open [https://www.pdf-online.com/osa/validate.aspx] and upload the
> "with_image.pdf" document; validation passes.
> Then execute the following code (update the absolute file paths):
> byte[] withImage = readFile("C:/r/pdf/with_image.pdf");
> try (PDDocument boxDocument = Loader.loadPDF(withImage)) {
>     boxDocument.removePage(0);
>     try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
>         boxDocument.save(bos);
>         byte[] pdfBytes = bos.toByteArray();
>         Files.write(Path.of("C:/r/pdf/pdf_result.pdf"), pdfBytes);
>     }
> } catch (IOException e) {
>     e.printStackTrace();
> }
> Upload pdf_result.pdf to [https://www.pdf-online.com/osa/validate.aspx];
> validation does not pass.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]