Author: tallison
Date: Fri Dec 6 19:49:43 2013
New Revision: 1548700
URL: http://svn.apache.org/r1548700
Log:
TIKA-1202 added PDFParserConfig and refactored PDFParserTest and TikaTest to
reduce boilerplate
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Fri Dec 6 19:49:43 2013
@@ -71,16 +71,14 @@ class PDF2XHTML extends PDFTextStripper
*/
public static void process(
PDDocument document, ContentHandler handler, ParseContext context,
Metadata metadata,
- boolean extractAnnotationText, boolean enableAutoSpace,
- boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+ PDFParserConfig config)
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content
// handler.
- PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
- extractAnnotationText,
enableAutoSpace,
-
suppressDuplicateOverlappingText, sortByPosition);
+ PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
config);
+
pdf2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
@@ -105,19 +103,19 @@ class PDF2XHTML extends PDFTextStripper
private final ContentHandler originalHandler;
private final ParseContext context;
private final XHTMLContentHandler handler;
- private final boolean extractAnnotationText;
-
- private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata
metadata,
- boolean extractAnnotationText, boolean enableAutoSpace,
- boolean suppressDuplicateOverlappingText, boolean
sortByPosition)
+ private final PDFParserConfig config;
+
+ private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata
metadata,
+ PDFParserConfig defaultConfig)
throws IOException {
+
+ this.config = context.get(PDFParserConfig.class, defaultConfig);
this.originalHandler = handler;
this.context = context;
this.handler = new XHTMLContentHandler(handler, metadata);
- this.extractAnnotationText = extractAnnotationText;
setForceParsing(true);
- setSortByPosition(sortByPosition);
- if (enableAutoSpace) {
+ setSortByPosition(config.getSortByPosition());
+ if (config.getEnableAutoSpace()) {
setWordSeparator(" ");
} else {
setWordSeparator("");
@@ -125,7 +123,7 @@ class PDF2XHTML extends PDFTextStripper
// TODO: maybe expose setting these too:
//setAverageCharTolerance(1.0f);
//setSpacingTolerance(1.0f);
- setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
+
setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
}
void extractBookmarkText() throws SAXException {
@@ -190,7 +188,7 @@ class PDF2XHTML extends PDFTextStripper
try {
writeParagraphEnd();
// TODO: remove once PDFBOX-1143 is fixed:
- if (extractAnnotationText) {
+ if (config.getExtractAnnotationText()) {
for(Object o : page.getAnnotations()) {
if( o instanceof PDAnnotationLink ) {
PDAnnotationLink annotationlink = (PDAnnotationLink) o;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Fri Dec 6 19:49:43 2013
@@ -64,22 +64,7 @@ public class PDFParser extends AbstractP
/** Serial version UID */
private static final long serialVersionUID = -752276948656079347L;
- // True if we let PDFBox "guess" where spaces should go:
- private boolean enableAutoSpace = true;
-
- // True if we let PDFBox remove duplicate overlapping text:
- private boolean suppressDuplicateOverlappingText;
-
- // True if we extract annotation text ourselves
- // (workaround for PDFBOX-1143):
- private boolean extractAnnotationText = true;
-
- // True if we should sort text tokens by position
- // (necessary for some PDFs, but messes up other PDFs):
- private boolean sortByPosition = false;
-
- //True if we should use PDFBox's NonSequentialParser
- private boolean useNonSequentialParser = false;
+ private PDFParserConfig config = new PDFParserConfig();
/**
* Metadata key for giving the document password to the parser.
*
@@ -108,7 +93,7 @@ public class PDFParser extends AbstractP
// for unpacked / processed resources
// Decide which to do based on if we're reading from a file or not
already
TikaInputStream tstream = TikaInputStream.cast(stream);
- if (useNonSequentialParser == true) {
+ if (config.getUseNonSequentialParser() == true) {
RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
pdfDocument = PDDocument.loadNonSeq(new
CloseShieldInputStream(stream), scratchFile);
} else if (tstream != null && tstream.hasFile()) {
@@ -148,9 +133,7 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, context, metadata,
- extractAnnotationText, enableAutoSpace,
- suppressDuplicateOverlappingText,
sortByPosition);
+ PDF2XHTML.process(pdfDocument, handler, context, metadata, config);
} finally {
if (pdfDocument != null) {
@@ -244,18 +227,31 @@ public class PDFParser extends AbstractP
}
}
+ public void setPDFParserConfig(PDFParserConfig config){
+ this.config = config;
+ }
+
+ public PDFParserConfig getPDFParserConfig(){
+ return config;
+ }
+
/**
* If true, the parser will use the NonSequentialParser. This may
* be faster than the full doc parser.
* If false (default), this will use the full doc parser.
+ *
+ * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setUseNonSequentialParser(boolean v){
- useNonSequentialParser = v;
+ config.setUseNonSequentialParser(v);
}
- /** @see #setUseNonSequentialParser(boolean) */
+ /**
+ * @see #setUseNonSequentialParser(boolean)
+ * @deprecated use {@link #getPDFParserConfig()}
+ */
public boolean getUseNonSequentialParser(){
- return useNonSequentialParser;
+ return config.getUseNonSequentialParser();
}
/**
@@ -263,29 +259,37 @@ public class PDFParser extends AbstractP
* where spaces should be inserted between words. For
* many PDFs this is necessary as they do not include
* explicit whitespace characters.
+ *
+ * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setEnableAutoSpace(boolean v) {
- enableAutoSpace = v;
+ config.setEnableAutoSpace(v);
}
- /** @see #setEnableAutoSpace. */
+ /**
+ * @see #setEnableAutoSpace(boolean)
+ * @deprecated use {@link #getPDFParserConfig()}
+ */
public boolean getEnableAutoSpace() {
- return enableAutoSpace;
+ return config.getEnableAutoSpace();
}
/**
* If true (the default), text in annotations will be
* extracted.
+ * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setExtractAnnotationText(boolean v) {
- extractAnnotationText = v;
+ config.setExtractAnnotationText(v);
}
/**
* If true, text in annotations will be extracted.
+ *
+ * @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getExtractAnnotationText() {
- return extractAnnotationText;
+ return config.getExtractAnnotationText();
}
/**
@@ -296,14 +300,20 @@ public class PDFParser extends AbstractP
* slow down extraction substantially (PDFBOX-956) and
* sometimes remove characters that were not in fact
* duplicated (PDFBOX-1155). By default this is disabled.
+ *
+ * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSuppressDuplicateOverlappingText(boolean v) {
- suppressDuplicateOverlappingText = v;
+ config.setSuppressDuplicateOverlappingText(v);
}
- /** @see #setSuppressDuplicateOverlappingText. */
+ /**
+ * @see #setSuppressDuplicateOverlappingText(boolean)
+ *
+ * @deprecated use {@link #getPDFParserConfig()}
+ */
public boolean getSuppressDuplicateOverlappingText() {
- return suppressDuplicateOverlappingText;
+ return config.getSuppressDuplicateOverlappingText();
}
/**
@@ -313,14 +323,20 @@ public class PDFParser extends AbstractP
* order"), while for other PDFs it can produce the
* wrong result (for example if there are 2 columns,
* the text will be interleaved). Default is false.
+ *
+ * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSortByPosition(boolean v) {
- sortByPosition = v;
+ config.setSortByPosition(v);
}
- /** @see #setSortByPosition. */
+ /**
+ * @see #setSortByPosition(boolean)
+ *
+ * @deprecated use {@link #getPDFParserConfig()}
+ */
public boolean getSortByPosition() {
- return sortByPosition;
+ return config.getSortByPosition();
}
}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1548700&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
Fri Dec 6 19:49:43 2013
@@ -0,0 +1,252 @@
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Properties;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Config for PDFParser.
+ *
+ * This allows parameters to be set programmatically:
+ * <ol>
+ * <li>Calls to PDFParser, i.e.
parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
+ * <li>Constructor of PDFParser</li>
+ * <li>Passing to PDFParser through a ParseContext:
context.set(PDFParserConfig.class, config);</li>
+ * </ol>
+ *
+ * Parameters can also be set by modifying the PDFParser.properties file,
+ * which lives here in trunk:
+ * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
+ *
+ * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
+ * org/apache/tika/parser/pdf
+ *
+ */
+public class PDFParserConfig implements Serializable{
+
+ private static final long serialVersionUID = 6492570218190936986L;
+
+ // True if we let PDFBox "guess" where spaces should go:
+ private boolean enableAutoSpace = true;
+
+ // True if we let PDFBox remove duplicate overlapping text:
+ private boolean suppressDuplicateOverlappingText;
+
+ // True if we extract annotation text ourselves
+ // (workaround for PDFBOX-1143):
+ private boolean extractAnnotationText = true;
+
+ // True if we should sort text tokens by position
+ // (necessary for some PDFs, but messes up other PDFs):
+ private boolean sortByPosition = false;
+
+ //True if we should use PDFBox's NonSequentialParser
+ private boolean useNonSequentialParser = false;
+
+ public PDFParserConfig(){
+ init(this.getClass().getResourceAsStream("PDFParser.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream.
+ * If there is an IOException, this silently swallows the exception
+ * and goes back to the default.
+ *
+ * @param is
+ */
+ public PDFParserConfig(InputStream is){
+ init(is);
+ }
+
+ //initializes object and then tries to close inputstream
+ private void init(InputStream is){
+
+ if (is == null){
+ return;
+ }
+ Properties props = new Properties();
+ try{
+ props.load(is);
+ } catch (IOException e){
+ } finally {
+ if (is != null){
+ try{
+ is.close();
+ } catch (IOException e){
+ //swallow
+ }
+ }
+ }
+ setEnableAutoSpace(
+ getProp(props.getProperty("enableAutoSpace"),
getEnableAutoSpace()));
+ setSuppressDuplicateOverlappingText(
+ getProp(props.getProperty("suppressDuplicateOverlappingText"),
+ getSuppressDuplicateOverlappingText()));
+ setExtractAnnotationText(
+ getProp(props.getProperty("extractAnnotationText"),
+ getExtractAnnotationText()));
+ setSortByPosition(
+ getProp(props.getProperty("sortByPosition"),
+ getSortByPosition()));
+ setUseNonSequentialParser(
+ getProp(props.getProperty("useNonSequentialParser"),
+ getUseNonSequentialParser()));
+ }
+
+ /** @see #setEnableAutoSpace(boolean) */
+ public boolean getEnableAutoSpace() {
+ return enableAutoSpace;
+ }
+
+ /**
+ * If true (the default), the parser should estimate
+ * where spaces should be inserted between words. For
+ * many PDFs this is necessary as they do not include
+ * explicit whitespace characters.
+ */
+ public void setEnableAutoSpace(boolean enableAutoSpace) {
+ this.enableAutoSpace = enableAutoSpace;
+ }
+
+ /** @see #setSuppressDuplicateOverlappingText(boolean)*/
+ public boolean getSuppressDuplicateOverlappingText() {
+ return suppressDuplicateOverlappingText;
+ }
+
+ /**
+ * If true, the parser should try to remove duplicated
+ * text over the same region. This is needed for some
+ * PDFs that achieve bolding by re-writing the same
+ * text in the same area. Note that this can
+ * slow down extraction substantially (PDFBOX-956) and
+ * sometimes remove characters that were not in fact
+ * duplicated (PDFBOX-1155). By default this is disabled.
+ */
+ public void setSuppressDuplicateOverlappingText(
+ boolean suppressDuplicateOverlappingText) {
+ this.suppressDuplicateOverlappingText =
suppressDuplicateOverlappingText;
+ }
+
+ /** @see #setExtractAnnotationText(boolean)*/
+ public boolean getExtractAnnotationText() {
+ return extractAnnotationText;
+ }
+
+ /**
+ * If true (the default), text in annotations will be
+ * extracted.
+ */
+ public void setExtractAnnotationText(boolean extractAnnotationText) {
+ this.extractAnnotationText = extractAnnotationText;
+ }
+ /** @see #setSortByPosition(boolean)*/
+ public boolean getSortByPosition() {
+ return sortByPosition;
+ }
+
+ /**
+ * If true, sort text tokens by their x/y position
+ * before extracting text. This may be necessary for
+ * some PDFs (if the text tokens are not rendered "in
+ * order"), while for other PDFs it can produce the
+ * wrong result (for example if there are 2 columns,
+ * the text will be interleaved). Default is false.
+ */
+ public void setSortByPosition(boolean sortByPosition) {
+ this.sortByPosition = sortByPosition;
+ }
+
+ /** @see #setUseNonSequentialParser(boolean)*/
+ public boolean getUseNonSequentialParser() {
+ return useNonSequentialParser;
+ }
+
+ /**
+ * If true, uses PDFBox's non-sequential parser.
+ * The non-sequential parser should be much faster than the traditional
+ * full doc parser. However, until PDFBOX-XXX is fixed,
+ * the non-sequential parser fails
+ * to extract some document metadata.
+ * <p>
+ * Default is false (use the traditional parser)
+ * @param useNonSequentialParser
+ */
+ public void setUseNonSequentialParser(boolean useNonSequentialParser) {
+ this.useNonSequentialParser = useNonSequentialParser;
+ }
+
+ private boolean getProp(String p, boolean defaultMissing){
+ if (p == null){
+ return defaultMissing;
+ }
+ if (p.toLowerCase().equals("true")){
+ return true;
+ } else if (p.toLowerCase().equals("false")){
+ return false;
+ } else {
+ return defaultMissing;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + (enableAutoSpace ? 1231 : 1237);
+ result = prime * result + (extractAnnotationText ? 1231 : 1237);
+ result = prime * result + (sortByPosition ? 1231 : 1237);
+ result = prime * result
+ + (suppressDuplicateOverlappingText ? 1231 : 1237);
+ result = prime * result + (useNonSequentialParser ? 1231 : 1237);
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ PDFParserConfig other = (PDFParserConfig) obj;
+ if (enableAutoSpace != other.enableAutoSpace)
+ return false;
+ if (extractAnnotationText != other.extractAnnotationText)
+ return false;
+ if (sortByPosition != other.sortByPosition)
+ return false;
+ if (suppressDuplicateOverlappingText !=
other.suppressDuplicateOverlappingText)
+ return false;
+ if (useNonSequentialParser != other.useNonSequentialParser)
+ return false;
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
+ + ", suppressDuplicateOverlappingText="
+ + suppressDuplicateOverlappingText + ", extractAnnotationText="
+ + extractAnnotationText + ", sortByPosition=" + sortByPosition
+ + ", useNonSequentialParser=" + useNonSequentialParser + "]";
+ }
+
+}
Added:
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1548700&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
(added)
+++
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
Fri Dec 6 19:49:43 2013
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+enableAutoSpace true
+extractAnnotationText true
+sortByPosition false
+suppressDuplicateOverlappingText false
+useNonSequentialParser false
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Dec
6 19:49:43 2013
@@ -27,6 +27,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -100,4 +101,32 @@ public abstract class TikaTest extends T
}
}
+ /**
+ * Basic text extraction.
+ * <p>
+ * Tries to close input stream after processing.
+ */
+ public String getText(InputStream is, Parser parser, ParseContext context,
Metadata metadata) throws Exception{
+ ContentHandler handler = new BodyContentHandler();
+ try {
+ parser.parse(is, handler, metadata, context);
+ } finally {
+ is.close();
+ }
+ return handler.toString();
+ }
+
+ public String getText(InputStream is, Parser parser, Metadata metadata)
throws Exception{
+ return getText(is, parser, new ParseContext(), metadata);
+ }
+
+ public String getText(InputStream is, Parser parser, ParseContext context)
throws Exception{
+ return getText(is, parser, context, new Metadata());
+ }
+
+ public String getText(InputStream is, Parser parser) throws Exception{
+ return getText(is, parser, new ParseContext(), new Metadata());
+ }
+
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Fri Dec 6 19:49:43 2013
@@ -47,17 +47,12 @@ public class PDFParserTest extends TikaT
public void testPdfParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
+
+ String content = getText(stream, parser, metadata);
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Bertrand Delacr\u00e9taz",
metadata.get(TikaCoreProperties.CREATOR));
@@ -69,7 +64,6 @@ public class PDFParserTest extends TikaT
// assertEquals("Sat Sep 15 10:02:31 BST 2007",
metadata.get(Metadata.CREATION_DATE));
// assertEquals("Sat Sep 15 10:02:31 BST 2007",
metadata.get(Metadata.LAST_MODIFIED));
- String content = handler.toString();
assertTrue(content.contains("Apache Tika"));
assertTrue(content.contains("Tika - Content Analysis Toolkit"));
assertTrue(content.contains("incubator"));
@@ -83,17 +77,12 @@ public class PDFParserTest extends TikaT
public void testCustomMetadata() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF-custommetadata.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
+
+ String content = getText(stream, parser, metadata);
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Document author",
metadata.get(TikaCoreProperties.CREATOR));
@@ -106,7 +95,6 @@ public class PDFParserTest extends TikaT
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
- String content = handler.toString();
assertTrue(content.contains("Hello World!"));
}
@@ -174,38 +162,20 @@ public class PDFParserTest extends TikaT
public void testTwoTextBoxes() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDFTwoTextBoxes.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
-
- String content = handler.toString();
+ String content = getText(stream, parser);
content = content.replaceAll("\\s+"," ");
assertTrue(content.contains("Left column line 1 Left column line 2
Right column line 1 Right column line 2"));
}
public void testVarious() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
InputStream stream = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDFVarious.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser, metadata);
//content = content.replaceAll("\\s+"," ");
assertContains("Footnote appears here", content);
assertContains("This is a footnote.", content);
@@ -266,37 +236,33 @@ public class PDFParserTest extends TikaT
public void testAnnotations() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
InputStream stream =
getResourceAsStream("/test-documents/testAnnotations.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser);
content = content.replaceAll("[\\s\u00a0]+"," ");
assertContains("Here is some text", content);
assertContains("Here is a comment", content);
// Test w/ annotation text disabled:
PDFParser pdfParser = new PDFParser();
- pdfParser.setExtractAnnotationText(false);
- handler = new BodyContentHandler();
- metadata = new Metadata();
- context = new ParseContext();
+ pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
- try {
- pdfParser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- content = handler.toString();
+ content = getText(stream, pdfParser);
content = content.replaceAll("[\\s\u00a0]+"," ");
assertContains("Here is some text", content);
assertEquals(-1, content.indexOf("Here is a comment"));
+ // annotation text disabled through parsecontext
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractAnnotationText(false);
+ context.set(PDFParserConfig.class, config);
+ stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+ content = getText(stream, parser, context);
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ assertContains("Here is some text", content);
+ assertEquals(-1, content.indexOf("Here is a comment"));
+
+
// TIKA-738: make sure no extra </p> tags
String xml = getXML("testAnnotations.pdf").xml;
assertEquals(substringCount("<p>", xml),
@@ -306,16 +272,8 @@ public class PDFParserTest extends TikaT
// TIKA-981
public void testPopupAnnotation() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
InputStream stream =
getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser);
assertContains("this is the note", content);
assertContains("igalsh", content);
}
@@ -362,100 +320,110 @@ public class PDFParserTest extends TikaT
public void testDisableAutoSpace() throws Exception {
PDFParser parser = new PDFParser();
- parser.setEnableAutoSpace(false);
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
+ parser.getPDFParserConfig().setEnableAutoSpace(false);
InputStream stream =
getResourceAsStream("/test-documents/testExtraSpaces.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser);
content = content.replaceAll("[\\s\u00a0]+"," ");
// Text is correct when autoSpace is off:
assertContains("Here is some formatted text", content);
- parser.setEnableAutoSpace(true);
- handler = new BodyContentHandler();
- metadata = new Metadata();
- context = new ParseContext();
+ parser.getPDFParserConfig().setEnableAutoSpace(true);
stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- content = handler.toString();
+ content = getText(stream, parser);
content = content.replaceAll("[\\s\u00a0]+"," ");
// Text is correct when autoSpace is off:
// Text has extra spaces when autoSpace is on
assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+ //now try with autodetect
+ Parser autoParser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ context.set(PDFParserConfig.class, config);
+ //default is true
+ stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+ content = getText(stream, autoParser, context);
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ // Text has extra spaces when autoSpace is on
+ assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+ config.setEnableAutoSpace(false);
+
+ stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+ content = getText(stream, parser, context);
+ content = content.replaceAll("[\\s\u00a0]+"," ");
+ // Text is correct when autoSpace is off:
+ assertContains("Here is some formatted text", content);
+
}
public void testDuplicateOverlappingText() throws Exception {
PDFParser parser = new PDFParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
InputStream stream =
getResourceAsStream("/test-documents/testOverlappingText.pdf");
// Default is false (keep overlapping text):
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser);
assertContains("Text the first timeText the second time", content);
- parser.setSuppressDuplicateOverlappingText(true);
- handler = new BodyContentHandler();
- metadata = new Metadata();
- context = new ParseContext();
+ parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true); // option now lives on the parser's PDFParserConfig
stream =
getResourceAsStream("/test-documents/testOverlappingText.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- content = handler.toString();
+ content = getText(stream, parser);
+ // "Text the first" was dedup'd:
+ assertContains("Text the first timesecond time", content);
+
+ // Now repeat both checks with AutoDetectParser, supplying the config through the ParseContext
+ Parser autoParser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ context.set(PDFParserConfig.class, config);
+ stream =
getResourceAsStream("/test-documents/testOverlappingText.pdf");
+ // Default is false (keep overlapping text):
+ content = getText(stream, autoParser, context);
+ assertContains("Text the first timeText the second time", content);
+
+ config.setSuppressDuplicateOverlappingText(true); // same instance that was passed to context.set above
+ stream =
getResourceAsStream("/test-documents/testOverlappingText.pdf");
+ content = getText(stream, autoParser, context);
// "Text the first" was dedup'd:
assertContains("Text the first timesecond time", content);
+
}
public void testSortByPosition() throws Exception {
PDFParser parser = new PDFParser();
- parser.setEnableAutoSpace(false);
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
+ parser.getPDFParserConfig().setEnableAutoSpace(false);
InputStream stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
// Default is false (do not sort):
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- String content = handler.toString();
+ String content = getText(stream, parser);
content = content.replaceAll("\\s+", " ");
assertContains("Left column line 1 Left column line 2 Right column
line 1 Right column line 2", content);
- parser.setSortByPosition(true);
- handler = new BodyContentHandler();
- metadata = new Metadata();
- context = new ParseContext();
+ parser.getPDFParserConfig().setSortByPosition(true);
stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
- try {
- parser.parse(stream, handler, metadata, context);
- } finally {
- stream.close();
- }
- content = handler.toString();
+ content = getText(stream, parser);
+ content = content.replaceAll("\\s+", " ");
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn
line 2 Right column line 2", content);
+
+ // Now repeat both checks with AutoDetectParser, supplying the config through the ParseContext
+ AutoDetectParser autoParser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ context.set(PDFParserConfig.class, config);
+ stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+ // Default is false (do not sort):
+ content = getText(stream, autoParser, context);
+ content = content.replaceAll("\\s+", " ");
+ assertContains("Left column line 1 Left column line 2 Right column
line 1 Right column line 2", content);
+
+ config.setSortByPosition(true);
+ context.set(PDFParserConfig.class, config); // re-registers the same instance that was set above
+ stream =
getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+ content = getText(stream, autoParser, context); // was getText(stream, parser): that bypassed autoParser/context and re-tested the already-sorted parser
content = content.replaceAll("\\s+", " ");
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn
line 2 Right column line 2", content);
+
}
// TIKA-1035
@@ -527,7 +495,7 @@ public class PDFParserTest extends TikaT
public void testSequentialParser() throws Exception{
PDFParser defaultParser = new PDFParser();
PDFParser sequentialParser = new PDFParser();
- sequentialParser.setUseNonSequentialParser(true);
+ sequentialParser.getPDFParserConfig().setUseNonSequentialParser(true);
File testDocs = new
File(this.getClass().getResource("/test-documents").toURI());
int pdfs = 0;
for (File f : testDocs.listFiles()){
@@ -536,10 +504,10 @@ public class PDFParserTest extends TikaT
}
pdfs++;
Metadata defaultMetadata = new Metadata();
- String defaultContent = getText(f, defaultParser, defaultMetadata);
+ String defaultContent = getText(new FileInputStream(f),
defaultParser, defaultMetadata);
Metadata sequentialMetadata = new Metadata();
- String sequentialContent = getText(f, sequentialParser,
sequentialMetadata);
+ String sequentialContent = getText(new FileInputStream(f),
sequentialParser, sequentialMetadata);
assertEquals(f.getName(), defaultContent, sequentialContent);
//TODO: until PDFBox fixes metadata extraction for this file,
@@ -553,17 +521,5 @@ public class PDFParserTest extends TikaT
assertEquals("Number of pdf files tested", 14, pdfs);
}
- private String getText(File f, PDFParser parser, Metadata metadata) throws
Exception{
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- FileInputStream is = null;
- try {
- is = new FileInputStream(f);
- parser.parse(is, handler, metadata, context);
- } finally {
- is.close();
- }
- return handler.toString();
- }
}