Author: jukka
Date: Mon Nov 12 17:04:30 2007
New Revision: 594376
URL: http://svn.apache.org/viewvc?rev=594376&view=rev
Log:
TIKA-100 - Structured PDF parsing
- Customized the PDFTextStripper class to produce XHTML SAX events
(there's a somewhat similar PDFText2HTML class in PDFBox, but
that class produces a character stream instead of SAX events)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(with props)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Nov 12 17:04:30 2007
@@ -125,3 +125,5 @@
56. TIKA-84 - Add MimeTypes.getMimeType(InputStream) (jukka)
57. TIKA-85 - Add glob patterns from the ASF svn:eol-style documentation
(jukka)
+
+58. TIKA-100 - Structured PDF parsing (jukka)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Mon Nov 12 17:04:30 2007
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.util.PDFTextStripper;
+import org.pdfbox.util.TextPosition;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     * <p>
+     * The callbacks in this class are declared to throw only
+     * {@link IOException}, so they wrap any {@link SAXException} raised by
+     * the content handler as the cause of an IOException. This method
+     * unwraps that cause again so that callers see the original exception.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            // The returned text is ignored; the output of interest is the
+            // SAX events emitted by the callbacks below.
+            new PDF2XHTML(handler, metadata).getText(document);
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                // Failure came from the content handler: rethrow it as-is
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+    }
+
+    /** XHTML wrapper around the content handler given to the constructor. */
+    private final XHTMLContentHandler handler;
+
+    /**
+     * Creates a stripper that reports its output to the given handler.
+     *
+     * @param handler SAX content handler that receives the XHTML events
+     * @param metadata metadata passed on to the XHTML content handler
+     * @throws IOException presumably propagated from the superclass
+     *                     constructor (not visible here)
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Starts the XHTML document.
+     *
+     * @param pdf the PDF document being processed (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a document", e);
+        }
+    }
+
+    /**
+     * Ends the XHTML document.
+     *
+     * @param pdf the PDF document being processed (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Opens a <code>div</code> element for a page.
+     *
+     * @param page the page being started (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div");
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a page", e);
+        }
+    }
+
+    /**
+     * Closes the <code>div</code> element of a page.
+     *
+     * @param page the page being ended (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a page", e);
+        }
+    }
+
+    /**
+     * Opens a <code>p</code> element for a paragraph.
+     *
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startParagraph() throws IOException {
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a paragraph", e);
+        }
+    }
+
+    /**
+     * Closes the <code>p</code> element of a paragraph.
+     *
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endParagraph() throws IOException {
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a paragraph", e);
+        }
+    }
+
+    /**
+     * Emits the text of the given text position as SAX character data.
+     *
+     * @param text text position whose character content is written
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            // Message names the actual operation, not the newline case
+            throw new IOException("Unable to write a character", e);
+        }
+    }
+
+    /**
+     * Emits a line separator as a single newline character.
+     *
+     * @param p text position at the line break (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void processLineSeparator(TextPosition p) throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException e) {
+            throw new IOException("Unable to write a newline", e);
+        }
+    }
+
+    /**
+     * Emits a word separator as a single space character.
+     *
+     * @param a text position before the separator (unused)
+     * @param b text position after the separator (unused)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void processWordSeparator(TextPosition a, TextPosition b)
+            throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException e) {
+            throw new IOException("Unable to write a space", e);
+        }
+    }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Mon Nov 12 17:04:30 2007
@@ -18,17 +18,14 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringWriter;
import java.util.Calendar;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
-import org.pdfbox.util.PDFTextStripper;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -40,64 +37,55 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
+ PDDocument pdfDocument = PDDocument.load(stream);
try {
- PDDocument pdfDocument = PDDocument.load(stream);
- try {
- if (pdfDocument.isEncrypted()) {
+ if (pdfDocument.isEncrypted()) {
+ try {
pdfDocument.decrypt("");
+ } catch (Exception e) {
+ // Ignore
}
-
- PDDocumentInformation info =
- pdfDocument.getDocumentInformation();
- if (info.getTitle() != null) {
- metadata.set(Metadata.TITLE, info.getTitle());
- }
- if (info.getAuthor() != null) {
- metadata.set(Metadata.AUTHOR, info.getAuthor());
- }
- if (info.getCreator() != null) {
- metadata.set(Metadata.CREATOR, info.getCreator());
- }
- if (info.getKeywords() != null) {
- metadata.set(Metadata.KEYWORDS, info.getKeywords());
- }
- if (info.getProducer() != null) {
- // TODO: Need a Metadata key for producer
- metadata.set("producer", info.getProducer());
- }
- if (info.getSubject() != null) {
- metadata.set(Metadata.SUBJECT, info.getSubject());
- }
- if (info.getTrapped() != null) {
- // TODO: Need a Metadata key for producer
- metadata.set("trapped", info.getTrapped());
- }
- Calendar created = info.getCreationDate();
- if (created != null) {
- metadata.set("created", created.getTime().toString());
- }
- Calendar modified = info.getModificationDate();
- if (modified != null) {
- metadata.set(
- Metadata.LAST_MODIFIED,
- modified.getTime().toString());
- }
-
- StringWriter writer = new StringWriter();
- new PDFTextStripper().writeText(pdfDocument, writer);
-
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.element("p", writer.getBuffer().toString());
- xhtml.endDocument();
- } finally {
- pdfDocument.close();
}
+ metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
+ extractMetadata(pdfDocument, metadata);
+ PDF2XHTML.process(pdfDocument, handler, metadata);
+ } finally {
+ pdfDocument.close();
+ }
+ }
+
+ private void extractMetadata(PDDocument document, Metadata metadata)
+ throws TikaException {
+ PDDocumentInformation info = document.getDocumentInformation();
+ addMetadata(metadata, Metadata.TITLE, info.getTitle());
+ addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+ addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+ addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+ addMetadata(metadata, "producer", info.getProducer());
+ addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+ addMetadata(metadata, "trapped", info.getTrapped());
+ try {
+ addMetadata(metadata, "created", info.getCreationDate());
+ } catch (IOException e) {
+ // Invalid date format, just ignore
+ }
+ try {
+ Calendar modified = info.getModificationDate();
+ addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
} catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaException("Error parsing a PDF document", e);
+ // Invalid date format, just ignore
+ }
+ }
+
+ private void addMetadata(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.add(name, value);
+ }
+ }
+
+ private void addMetadata(Metadata metadata, String name, Calendar value) {
+ if (value != null) {
+ metadata.set(name, value.getTime().toString());
}
}