Author: mikemccand
Date: Thu Nov 17 17:24:26 2011
New Revision: 1203287
URL: http://svn.apache.org/viewvc?rev=1203287&view=rev
Log:
TIKA-612: enable controlling PDFBox's setSortByPosition from PDFParser
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1203287&r1=1203286&r2=1203287&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Nov 17 17:24:26 2011
@@ -10,7 +10,10 @@ Release 1.1 - Current Development
* PDF: Allow controlling whether overlapping duplicated text should
be removed. Disabling this (the default) can give big
speedups to text extraction and may work around cases where
- non-duplicated characters were incorrectly removed. (TIKA-767)
+ non-duplicated characters were incorrectly removed (TIKA-767).
+ Allow controlling whether text tokens should be sorted by their x/y
+ position before extracting text (TIKA-612); this is necessary for
+ certain PDFs.
* RTF: Fixed case where a font change would result in processing
bytes in the wrong font's charset, producing bogus text output
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1203287&r1=1203286&r2=1203287&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Thu Nov 17 17:24:26 2011
@@ -55,14 +55,14 @@ class PDF2XHTML extends PDFTextStripper
public static void process(
PDDocument document, ContentHandler handler, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
- boolean suppressDuplicateOverlappingText)
+ boolean suppressDuplicateOverlappingText, boolean sortByPosition)
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content handler.
new PDF2XHTML(handler, metadata,
extractAnnotationText, enableAutoSpace,
-
suppressDuplicateOverlappingText).writeText(document, new Writer() {
+ suppressDuplicateOverlappingText,
sortByPosition).writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
}
@@ -87,12 +87,12 @@ class PDF2XHTML extends PDFTextStripper
private PDF2XHTML(ContentHandler handler, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
- boolean suppressDuplicateOverlappingText)
+ boolean suppressDuplicateOverlappingText, boolean
sortByPosition)
throws IOException {
this.handler = new XHTMLContentHandler(handler, metadata);
this.extractAnnotationText = extractAnnotationText;
setForceParsing(true);
- setSortByPosition(false);
+ setSortByPosition(sortByPosition);
if (enableAutoSpace) {
setWordSeparator(" ");
} else {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1203287&r1=1203286&r2=1203287&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Thu Nov 17 17:24:26 2011
@@ -60,6 +60,14 @@ public class PDFParser extends AbstractP
// True if we let PDFBox remove duplicate overlapping text:
private boolean suppressDuplicateOverlappingText;
+ // True if we extract annotation text ourselves
+ // (workaround for PDFBOX-1143):
+ private boolean extractAnnotationText = true;
+
+ // True if we should sort text tokens by position
+ // (necessary for some PDFs, but messes up other PDFs):
+ private boolean sortByPosition = false;
+
/**
* Metadata key for giving the document password to the parser.
*
@@ -67,8 +75,6 @@ public class PDFParser extends AbstractP
*/
public static final String PASSWORD =
"org.apache.tika.parser.pdf.password";
- private boolean extractAnnotationText = true;
-
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("pdf"));
@@ -96,7 +102,9 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, metadata,
extractAnnotationText, enableAutoSpace, suppressDuplicateOverlappingText);
+ PDF2XHTML.process(pdfDocument, handler, metadata,
+ extractAnnotationText, enableAutoSpace,
+ suppressDuplicateOverlappingText,
sortByPosition);
} finally {
pdfDocument.close();
}
@@ -222,4 +230,21 @@ public class PDFParser extends AbstractP
return suppressDuplicateOverlappingText;
}
+ /**
+ * If true, sort text tokens by their x/y position
+ * before extracting text. This may be necessary for
+ * some PDFs (if the text tokens are not rendered "in
+ * order"), while for other PDFs it can produce the
+ * wrong result (for example if there are 2 columns,
+ * the text will be interleaved). Default is false.
+ */
+ public void setSortByPosition(boolean v) {
+ sortByPosition = v;
+ }
+
+ /** @see #setSortByPosition(boolean) */
+ public boolean getSortByPosition() {
+ return sortByPosition;
+ }
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1203287&r1=1203286&r2=1203287&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Thu Nov 17 17:24:26 2011
@@ -323,6 +323,39 @@ public class PDFParserTest extends TikaT
assertContains("Text the first timesecond time", content);
}
+ public void testSortByPosition() throws Exception {
+ PDFParser parser = new PDFParser();
+ parser.setEnableAutoSpace(false);
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+ // Default is false (do not sort):
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ String content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ assertContains("Left column line 1 Left column line 2 Right column
line 1 Right column line 2", content);
+
+ parser.setSortByPosition(true);
+ handler = new BodyContentHandler();
+ metadata = new Metadata();
+ context = new ParseContext();
+ stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ stream.close();
+ }
+ content = handler.toString();
+ content = content.replaceAll("\\s+", " ");
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn
line 2 Right column line 2", content);
+ }
+
private static class XMLResult {
public final String xml;
public final Metadata metadata;