Author: mikemccand
Date: Tue Dec  4 12:21:58 2012
New Revision: 1416913

URL: http://svn.apache.org/viewvc?rev=1416913&view=rev
Log:
TIKA-1035: move bookmarks before </body>, use <ul>,<li>

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1416913&r1=1416912&r2=1416913&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Dec  4 12:21:58 2012
@@ -81,9 +81,6 @@ class PDF2XHTML extends PDFTextStripper 
                 }
             });
 
-            // Also extract text for any bookmarks:
-            pdf2XHTML.extractBookmarkText();
-
         } catch (IOException e) {
             if (e.getCause() instanceof SAXException) {
                 throw (SAXException) e.getCause();
@@ -118,20 +115,23 @@ class PDF2XHTML extends PDFTextStripper 
     void extractBookmarkText() throws SAXException {
         PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
         if (outline != null) {
-            handler.newline();
-            extractBookmarkText(outline, "");
+            extractBookmarkText(outline);
         }
     }
 
-    void extractBookmarkText(PDOutlineNode bookmark, String indent) throws SAXException {
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
         PDOutlineItem current = bookmark.getFirstChild();
-        while (current != null) {
-          handler.characters(indent);
-          handler.characters(current.getTitle());
-          handler.newline();
-          // Recurse:
-          extractBookmarkText(current, indent + "    ");
-          current = current.getNextSibling();
+        if (current != null) {
+            handler.startElement("ul");
+            while (current != null) {
+                handler.startElement("li");
+                handler.characters(current.getTitle());
+                handler.endElement("li");
+                // Recurse:
+                extractBookmarkText(current);
+                current = current.getNextSibling();
+            }
+            handler.endElement("ul");
         }
     }
 
@@ -147,6 +147,8 @@ class PDF2XHTML extends PDFTextStripper 
     @Override
     protected void endDocument(PDDocument pdf) throws IOException {
         try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
             handler.endDocument();
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a document", e);
@@ -296,5 +298,4 @@ class PDF2XHTML extends PDFTextStripper 
                     "Unable to write a newline character", e);
         }
     }
-
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1416913&r1=1416912&r2=1416913&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Dec  4 12:21:58 2012
@@ -449,6 +449,10 @@ public class PDFParserTest extends TikaT
     // TIKA-1035
     public void testBookmarks() throws Exception {
         String xml = getXML("testPDF_bookmarks.pdf").xml;
-        assertContains("Denmark bookmark is here", xml);
+        int i = xml.indexOf("Denmark bookmark is here");
+        int j = xml.indexOf("</body>");
+        assertTrue(i != -1);
+        assertTrue(j != -1);
+        assertTrue(i < j);
     }
 }


Reply via email to