Author: tallison
Date: Mon Dec 9 19:03:39 2013
New Revision: 1549646
URL: http://svn.apache.org/r1549646
Log:
TIKA-1202 -- fixed a small bug in choosing between the default config and the
config supplied via the ParseContext; added an in-memory option for the
non-sequential parser; added more constraints to tests
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1549646&r1=1549645&r2=1549646&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Mon Dec 9 19:03:39 2013
@@ -106,10 +106,11 @@ class PDF2XHTML extends PDFTextStripper
private final PDFParserConfig config;
private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata
metadata,
- PDFParserConfig defaultConfig)
+ PDFParserConfig config)
throws IOException {
-
- this.config = context.get(PDFParserConfig.class, defaultConfig);
+ //source of config (derives from context or PDFParser?) is
+ //already determined in PDFParser. No need to check context here.
+ this.config = config;
this.originalHandler = handler;
this.context = context;
this.handler = new XHTMLContentHandler(handler, metadata);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1549646&r1=1549645&r2=1549646&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Mon Dec 9 19:03:39 2013
@@ -29,6 +29,7 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccess;
+import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@@ -64,7 +65,7 @@ public class PDFParser extends AbstractP
/** Serial version UID */
private static final long serialVersionUID = -752276948656079347L;
- private PDFParserConfig config = new PDFParserConfig();
+ private PDFParserConfig defaultConfig = new PDFParserConfig();
/**
* Metadata key for giving the document password to the parser.
*
@@ -87,22 +88,28 @@ public class PDFParser extends AbstractP
PDDocument pdfDocument = null;
TemporaryResources tmp = new TemporaryResources();
-
+ //config from context, or default if not set via context
+ PDFParserConfig localConfig = context.get(PDFParserConfig.class,
defaultConfig);
try {
// PDFBox can process entirely in memory, or can use a temp file
// for unpacked / processed resources
// Decide which to do based on if we're reading from a file or not
already
TikaInputStream tstream = TikaInputStream.cast(stream);
- if (config.getUseNonSequentialParser() == true) {
- RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
- pdfDocument = PDDocument.loadNonSeq(new
CloseShieldInputStream(stream), scratchFile);
- } else if (tstream != null && tstream.hasFile()) {
- // File based, take that as a cue to use a temporary file
- RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
- pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), scratchFile, true);
+ if (tstream != null && tstream.hasFile()) {
+ // File based, take that as a cue to use a temporary file
+ RandomAccess scratchFile = new
RandomAccessFile(tmp.createTemporaryFile(), "rw");
+ if (localConfig.getUseNonSequentialParser() == true){
+ pdfDocument = PDDocument.loadNonSeq(new
CloseShieldInputStream(stream), scratchFile);
+ } else {
+ pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), scratchFile, true);
+ }
} else {
- // Go for the normal, stream based in-memory parsing
- pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), true);
+ // Go for the normal, stream based in-memory parsing
+ if (localConfig.getUseNonSequentialParser() == true){
+ pdfDocument = PDDocument.loadNonSeq(new
CloseShieldInputStream(stream), new RandomAccessBuffer());
+ } else {
+ pdfDocument = PDDocument.load(new
CloseShieldInputStream(stream), true);
+ }
}
@@ -133,7 +140,7 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, context, metadata, config);
+ PDF2XHTML.process(pdfDocument, handler, context, metadata,
localConfig);
} finally {
if (pdfDocument != null) {
@@ -228,11 +235,11 @@ public class PDFParser extends AbstractP
}
public void setPDFParserConfig(PDFParserConfig config){
- this.config = config;
+ this.defaultConfig = config;
}
public PDFParserConfig getPDFParserConfig(){
- return config;
+ return defaultConfig;
}
/**
@@ -243,7 +250,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setUseNonSequentialParser(boolean v){
- config.setUseNonSequentialParser(v);
+ defaultConfig.setUseNonSequentialParser(v);
}
/**
@@ -251,7 +258,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getUseNonSequentialParser(){
- return config.getUseNonSequentialParser();
+ return defaultConfig.getUseNonSequentialParser();
}
/**
@@ -263,7 +270,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setEnableAutoSpace(boolean v) {
- config.setEnableAutoSpace(v);
+ defaultConfig.setEnableAutoSpace(v);
}
/**
@@ -271,7 +278,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getEnableAutoSpace() {
- return config.getEnableAutoSpace();
+ return defaultConfig.getEnableAutoSpace();
}
/**
@@ -280,7 +287,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setExtractAnnotationText(boolean v) {
- config.setExtractAnnotationText(v);
+ defaultConfig.setExtractAnnotationText(v);
}
/**
@@ -289,7 +296,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getExtractAnnotationText() {
- return config.getExtractAnnotationText();
+ return defaultConfig.getExtractAnnotationText();
}
/**
@@ -304,7 +311,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSuppressDuplicateOverlappingText(boolean v) {
- config.setSuppressDuplicateOverlappingText(v);
+ defaultConfig.setSuppressDuplicateOverlappingText(v);
}
/**
@@ -313,7 +320,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSuppressDuplicateOverlappingText() {
- return config.getSuppressDuplicateOverlappingText();
+ return defaultConfig.getSuppressDuplicateOverlappingText();
}
/**
@@ -327,7 +334,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSortByPosition(boolean v) {
- config.setSortByPosition(v);
+ defaultConfig.setSortByPosition(v);
}
/**
@@ -336,7 +343,7 @@ public class PDFParser extends AbstractP
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSortByPosition() {
- return config.getSortByPosition();
+ return defaultConfig.getSortByPosition();
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1549646&r1=1549645&r2=1549646&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Mon Dec 9 19:03:39 2013
@@ -19,6 +19,8 @@ package org.apache.tika.parser.pdf;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
@@ -90,6 +92,7 @@ public class PDFParserTest extends TikaT
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Custom Value", metadata.get("Custom Property"));
+
assertEquals("Array Entry 1", metadata.get("Custom Array"));
assertEquals(2, metadata.getValues("Custom Array").length);
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
@@ -493,11 +496,22 @@ public class PDFParserTest extends TikaT
* TODO: more testing
*/
public void testSequentialParser() throws Exception{
- PDFParser defaultParser = new PDFParser();
- PDFParser sequentialParser = new PDFParser();
- sequentialParser.getPDFParserConfig().setUseNonSequentialParser(true);
+ Parser defaultParser = new AutoDetectParser();
+ Parser sequentialParser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setUseNonSequentialParser(true);
+ context.set(PDFParserConfig.class, config);
+
File testDocs = new
File(this.getClass().getResource("/test-documents").toURI());
int pdfs = 0;
+ Set<String> knownMetadataDiffs = new HashSet<String>();
+ //PDFBox-1792/Tika-1203
+ knownMetadataDiffs.add("testAnnotations.pdf");
+
+ //empty for now
+ Set<String> knownContentDiffs = new HashSet<String>();
+
for (File f : testDocs.listFiles()){
if (! f.getName().toLowerCase().endsWith(".pdf")){
continue;
@@ -507,17 +521,22 @@ public class PDFParserTest extends TikaT
String defaultContent = getText(new FileInputStream(f),
defaultParser, defaultMetadata);
Metadata sequentialMetadata = new Metadata();
- String sequentialContent = getText(new FileInputStream(f),
sequentialParser, sequentialMetadata);
+ String sequentialContent = getText(new FileInputStream(f),
sequentialParser, context, sequentialMetadata);
- assertEquals(f.getName(), defaultContent, sequentialContent);
- //TODO: until PDFBox fixes metadata extraction for this file,
+ if (knownContentDiffs.contains(f.getName())){
+ assertFalse(f.getName(),
defaultContent.equals(sequentialContent));
+ } else {
+ assertEquals(f.getName(), defaultContent, sequentialContent);
+ }
+
//skip this one file.
- if (f.getName().equals("testAnnotations.pdf")){
- continue;
+ if (knownMetadataDiffs.contains(f.getName())){
+ assertFalse(f.getName(),
defaultMetadata.equals(sequentialMetadata));
+ } else {
+ assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
}
-
- assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
}
+ //make sure nothing went wrong with getting the resource to
test-documents
assertEquals("Number of pdf files tested", 14, pdfs);
}