Author: jukka
Date: Fri Sep 25 15:36:07 2009
New Revision: 818887
URL: http://svn.apache.org/viewvc?rev=818887&view=rev
Log:
TIKA-158: Upgrade to Apache PDFBox
Adapt the PDF parser to the latest PDFBox code.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=818887&r1=818886&r2=818887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Sep 25 15:36:07 2009
@@ -67,6 +67,7 @@
this.handler = new XHTMLContentHandler(handler, metadata);
}
+ @Override
protected void startDocument(PDDocument pdf) throws IOException {
try {
handler.startDocument();
@@ -75,6 +76,7 @@
}
}
+ @Override
protected void endDocument(PDDocument pdf) throws IOException {
try {
handler.endDocument();
@@ -83,38 +85,37 @@
}
}
+ @Override
protected void startPage(PDPage page) throws IOException {
try {
- handler.startElement("div");
+ handler.startElement("div", "class", "page");
+ handler.startElement("p");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to start a page", e);
}
}
+ @Override
protected void endPage(PDPage page) throws IOException {
try {
+ handler.endElement("p");
handler.endElement("div");
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a page", e);
}
}
- protected void startParagraph() throws IOException {
- try {
- handler.startElement("p");
- } catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to start a paragraph", e);
- }
- }
-
- protected void endParagraph() throws IOException {
+ @Override
+ protected void writeString(String text) throws IOException {
try {
- handler.endElement("p");
+ handler.characters(text);
} catch (SAXException e) {
- throw new IOExceptionWithCause("Unable to end a paragraph", e);
+ throw new IOExceptionWithCause(
+ "Unable to write a string: " + text, e);
}
}
+ @Override
protected void writeCharacters(TextPosition text) throws IOException {
try {
handler.characters(text.getCharacter());
@@ -126,6 +127,7 @@
// Two methods added to work around lack of support for processWordSeparator
// and processLineSeparator in PDFBox-0.7.3. This is fixed in CVS Head (PDFBox-0.7.4)
+ @Override
public String getWordSeparator()
{
try
@@ -137,6 +139,7 @@
return super.getWordSeparator(); //To change body of overridden methods use File | Settings | File Templates.
}
+ @Override
public String getLineSeparator()
{
try