Author: tallison
Date: Tue Dec 3 00:53:29 2013
New Revision: 1547250
URL: http://svn.apache.org/r1547250
Log:
TIKA-1201 enable parameter for NonSequentialPDFParser
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1547250&r1=1547249&r2=1547250&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Tue Dec 3 00:53:29 2013
@@ -78,6 +78,8 @@ public class PDFParser extends AbstractP
// (necessary for some PDFs, but messes up other PDFs):
private boolean sortByPosition = false;
+ //True if we should use PDFBox's NonSequentialPDFParser instead of the full doc parser
+ private boolean useNonSequentialParser = false;
/**
* Metadata key for giving the document password to the parser.
*
@@ -106,14 +108,18 @@ public class PDFParser extends AbstractP
// for unpacked / processed resources
// Decide which to do based on if we're reading from a file or not already
TikaInputStream tstream = TikaInputStream.cast(stream);
- if (tstream != null && tstream.hasFile()) {
- // File based, take that as a cue to use a temporary file
- RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
- pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), scratchFile, true);
+ if (useNonSequentialParser == true) {
+ RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
+ pdfDocument = PDDocument.loadNonSeq(new
CloseShieldInputStream(stream), scratchFile);
+ } else if (tstream != null && tstream.hasFile()) {
+ // File based, take that as a cue to use a temporary file
+ RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
+ pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), scratchFile, true);
} else {
- // Go for the normal, stream based in-memory parsing
- pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), true);
+ // Go for the normal, stream based in-memory parsing
+ pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), true);
}
+
if (pdfDocument.isEncrypted()) {
String password = null;
@@ -233,12 +239,26 @@ public class PDFParser extends AbstractP
}
} else if(value instanceof COSString) {
addMetadata(metadata, name, ((COSString)value).getString());
- } else {
+ } else if (value != null){
addMetadata(metadata, name, value.toString());
}
}
/**
+ * If true, the parser will use PDFBox's NonSequentialPDFParser. This may
+ * be faster than the full doc parser.
+ * If false (the default), the parser will use the full doc parser.
+ */
+ public void setUseNonSequentialParser(boolean v){
+ useNonSequentialParser = v;
+ }
+
+ /** @see #setUseNonSequentialParser(boolean) */
+ public boolean getUseNonSequentialParser(){
+ return useNonSequentialParser;
+ }
+
+ /**
* If true (the default), the parser should estimate
* where spaces should be inserted between words. For
* many PDFs this is necessary as they do not include
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1547250&r1=1547249&r2=1547250&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Tue Dec 3 00:53:29 2013
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.pdf;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.tika.TikaTest;
@@ -516,4 +518,52 @@ public class PDFParserTest extends TikaT
assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
}
+ /**
+ * Tests that the traditional sequential parser and the newer
+ * non-sequential parser extract equal text and metadata from
+ * each test PDF.
+ *
+ * TODO: more testing
+ */
+ public void testSequentialParser() throws Exception{
+ PDFParser defaultParser = new PDFParser();
+ PDFParser sequentialParser = new PDFParser();
+ sequentialParser.setUseNonSequentialParser(true);
+ File testDocs = new
File(this.getClass().getResource("/test-documents").toURI());
+ int pdfs = 0;
+ for (File f : testDocs.listFiles()){
+ if (! f.getName().toLowerCase().endsWith(".pdf")){
+ continue;
+ }
+ pdfs++;
+ Metadata defaultMetadata = new Metadata();
+ String defaultContent = getText(f, defaultParser, defaultMetadata);
+
+ Metadata sequentialMetadata = new Metadata();
+ String sequentialContent = getText(f, sequentialParser,
sequentialMetadata);
+
+ assertEquals(f.getName(), defaultContent, sequentialContent);
+ //TODO: until PDFBox fixes metadata extraction for this file,
+ //skip this one file.
+ if (f.getName().equals("testAnnotations.pdf")){
+ continue;
+ }
+
+ assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
+ }
+ assertEquals("Number of pdf files tested", 14, pdfs);
+ }
+
+ private String getText(File f, PDFParser parser, Metadata metadata) throws
Exception{
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ FileInputStream is = null;
+ try {
+ is = new FileInputStream(f);
+ parser.parse(is, handler, metadata, context);
+ } finally {
+ is.close();
+ }
+ return handler.toString();
+ }
+
}