Author: mikemccand
Date: Tue Dec 4 12:21:58 2012
New Revision: 1416913
URL: http://svn.apache.org/viewvc?rev=1416913&view=rev
Log:
TIKA-1035: move bookmarks before </body>, use <ul>,<li>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1416913&r1=1416912&r2=1416913&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Dec 4 12:21:58 2012
@@ -81,9 +81,6 @@ class PDF2XHTML extends PDFTextStripper
}
});
- // Also extract text for any bookmarks:
- pdf2XHTML.extractBookmarkText();
-
} catch (IOException e) {
if (e.getCause() instanceof SAXException) {
throw (SAXException) e.getCause();
@@ -118,20 +115,23 @@ class PDF2XHTML extends PDFTextStripper
void extractBookmarkText() throws SAXException {
PDDocumentOutline outline =
document.getDocumentCatalog().getDocumentOutline();
if (outline != null) {
- handler.newline();
- extractBookmarkText(outline, "");
+ extractBookmarkText(outline);
}
}
- void extractBookmarkText(PDOutlineNode bookmark, String indent) throws SAXException {
+ void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
PDOutlineItem current = bookmark.getFirstChild();
- while (current != null) {
- handler.characters(indent);
- handler.characters(current.getTitle());
- handler.newline();
- // Recurse:
- extractBookmarkText(current, indent + " ");
- current = current.getNextSibling();
+ if (current != null) {
+ handler.startElement("ul");
+ while (current != null) {
+ handler.startElement("li");
+ handler.characters(current.getTitle());
+ handler.endElement("li");
+ // Recurse:
+ extractBookmarkText(current);
+ current = current.getNextSibling();
+ }
+ handler.endElement("ul");
}
}
@@ -147,6 +147,8 @@ class PDF2XHTML extends PDFTextStripper
@Override
protected void endDocument(PDDocument pdf) throws IOException {
try {
+ // Extract text for any bookmarks:
+ extractBookmarkText();
handler.endDocument();
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a document", e);
@@ -296,5 +298,4 @@ class PDF2XHTML extends PDFTextStripper
"Unable to write a newline character", e);
}
}
-
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1416913&r1=1416912&r2=1416913&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Dec 4 12:21:58 2012
@@ -449,6 +449,10 @@ public class PDFParserTest extends TikaT
// TIKA-1035
public void testBookmarks() throws Exception {
String xml = getXML("testPDF_bookmarks.pdf").xml;
- assertContains("Denmark bookmark is here", xml);
+ int i = xml.indexOf("Denmark bookmark is here");
+ int j = xml.indexOf("</body>");
+ assertTrue(i != -1);
+ assertTrue(j != -1);
+ assertTrue(i < j);
}
}