(tika) branch branch_2x updated: TIKA-4363: refactor

tilman Wed, 15 Jan 2025 03:40:58 -0800

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/branch_2x by this push:
     new 636f57b40 TIKA-4363: refactor
636f57b40 is described below

commit 636f57b40ad610f5dfbc8dce203a0b251ccff56d
Author: Tilman Hausherr <[email protected]>
AuthorDate: Wed Jan 15 12:40:21 2025 +0100

    TIKA-4363: refactor
---
 .../org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index cc829fc07..e2ee57bc7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -139,7 +139,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
                 throw new TikaException("Unable to extract PDF content", e);
             }
         }
-        if (pdfMarkedContent2XHTML.exceptions.size() > 0) {
+        if (!pdfMarkedContent2XHTML.exceptions.isEmpty()) {
             //throw the first
             throw new TikaException("Unable to extract PDF content",
                     pdfMarkedContent2XHTML.exceptions.get(0));
@@ -192,7 +192,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
     }
 
     @Override
-    protected void processPages(PDPageTree pages) throws IOException {
+    protected void processPages(PDPageTree pageTree) throws IOException {
 
         //this is a 0-indexed list of object refs for each page
         //we need this to map the mcids later...
@@ -200,7 +200,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
 
         List<ObjectRef> pageRefs = new ArrayList<>();
         //STEP 1: get the page refs
-        findPages(pdDocument.getPages().getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
+        findPages(pageTree.getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
         //confirm the right number of pages was found
         if (pageRefs.size() != pdDocument.getNumberOfPages()) {
             throw new IOException(new TikaException(
@@ -215,7 +215,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
         Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
 
         //STEP 3: load all of the text, mapped to MCIDs
-        Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
+        Map<MCID, String> paragraphs = loadTextByMCID(pageTree, pageRefs);
 
         //STEP 4: now recurse the the structure tree root and output the structure
         //and the text bits from paragraphs
@@ -254,7 +254,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
         //TODO: figure out when we're crossing page boundaries during the recursion
         // step above and do the page by page processing then...rather than dumping this
         // all here.
-        for (PDPage page : pdDocument.getPages()) {
+        for (PDPage page : pageTree) {
             startPage(page);
             endPage(page);
         }
@@ -406,10 +406,10 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
         return roleMap.get(name);
     }
 
-    private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
+    private Map<MCID, String> loadTextByMCID(PDPageTree pageTree, List<ObjectRef> pageRefs) throws IOException {
         int pageCount = 1;
         Map<MCID, String> paragraphs = new HashMap<>();
-        for (PDPage page : pdDocument.getPages()) {
+        for (PDPage page : pageTree) {
             ObjectRef pageRef = pageRefs.get(pageCount - 1);
             PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
             try {

(tika) branch branch_2x updated: TIKA-4363: refactor

Reply via email to