Author: jukka
Date: Fri Sep 25 15:36:07 2009
New Revision: 818887

URL: http://svn.apache.org/viewvc?rev=818887&view=rev
Log:
TIKA-158: Upgrade to Apache PDFBox

Adapt the PDF parser to the latest PDFBox code.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=818887&r1=818886&r2=818887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Sep 25 15:36:07 2009
@@ -67,6 +67,7 @@
         this.handler = new XHTMLContentHandler(handler, metadata);
     }
 
+    @Override
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
             handler.startDocument();
@@ -75,6 +76,7 @@
         }
     }
 
+    @Override
     protected void endDocument(PDDocument pdf) throws IOException {
         try {
             handler.endDocument();
@@ -83,38 +85,37 @@
         }
     }
 
+    @Override
     protected void startPage(PDPage page) throws IOException {
         try {
-            handler.startElement("div");
+            handler.startElement("div", "class", "page");
+            handler.startElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to start a page", e);
         }
     }
 
+    @Override
     protected void endPage(PDPage page) throws IOException {
         try {
+            handler.endElement("p");
             handler.endElement("div");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
         }
     }
 
-    protected void startParagraph() throws IOException {
-        try {
-            handler.startElement("p");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a paragraph", e);
-        }
-    }
-
-    protected void endParagraph() throws IOException {
+    @Override
+    protected void writeString(String text) throws IOException {
         try {
-            handler.endElement("p");
+            handler.characters(text);
         } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a paragraph", e);
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, e);
         }
     }
 
+    @Override
     protected void writeCharacters(TextPosition text) throws IOException {
         try {
             handler.characters(text.getCharacter());
@@ -126,6 +127,7 @@
 
     // Two methods added to work around lack of support for processWordSeparator
     // and processLineSeparator in PDFBox-0.7.3. This is fixed in CVS Head (PDFBox-0.7.4)
+    @Override
     public String getWordSeparator()
     {
         try
@@ -137,6 +139,7 @@
         return super.getWordSeparator();    //To change body of overridden methods use File | Settings | File Templates.
     }
 
+    @Override
     public String getLineSeparator()
     {
         try


Reply via email to