svn commit: r1548700 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/ main/resources/org/apache/tika/parser/pdf/ test/java/org/apache/tika/ test/java/org/apache/tika/parser/pdf/

tallison Fri, 06 Dec 2013 11:50:42 -0800

Author: tallison
Date: Fri Dec  6 19:49:43 2013
New Revision: 1548700

URL: http://svn.apache.org/r1548700
Log:
TIKA-1202 added PDFParserConfig and refactored PDFParserTest and TikaTest to 
reduce boilerplate


Added:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
    tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/
    
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
Fri Dec  6 19:49:43 2013
@@ -71,16 +71,14 @@ class PDF2XHTML extends PDFTextStripper 
      */
     public static void process(
             PDDocument document, ContentHandler handler, ParseContext context, 
Metadata metadata,
-            boolean extractAnnotationText, boolean enableAutoSpace,
-            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+            PDFParserConfig config)
             throws SAXException, TikaException {
         try {
             // Extract text using a dummy Writer as we override the
             // key methods to output to the given content
             // handler.
-            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
-                                                extractAnnotationText, 
enableAutoSpace,
-                                                
suppressDuplicateOverlappingText, sortByPosition);
+            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, 
config);
+
             pdf2XHTML.writeText(document, new Writer() {
                 @Override
                 public void write(char[] cbuf, int off, int len) {
@@ -105,19 +103,19 @@ class PDF2XHTML extends PDFTextStripper 
     private final ContentHandler originalHandler;
     private final ParseContext context;
     private final XHTMLContentHandler handler;
-    private final boolean extractAnnotationText;
-
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata 
metadata,
-                      boolean extractAnnotationText, boolean enableAutoSpace,
-                      boolean suppressDuplicateOverlappingText, boolean 
sortByPosition)
+    private final PDFParserConfig config;
+    
+    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata 
metadata, 
+            PDFParserConfig defaultConfig)
             throws IOException {
+        
+        this.config = context.get(PDFParserConfig.class, defaultConfig);
         this.originalHandler = handler;
         this.context = context;
         this.handler = new XHTMLContentHandler(handler, metadata);
-        this.extractAnnotationText = extractAnnotationText;
         setForceParsing(true);
-        setSortByPosition(sortByPosition);
-        if (enableAutoSpace) {
+        setSortByPosition(config.getSortByPosition());
+        if (config.getEnableAutoSpace()) {
             setWordSeparator(" ");
         } else {
             setWordSeparator("");
@@ -125,7 +123,7 @@ class PDF2XHTML extends PDFTextStripper 
         // TODO: maybe expose setting these too:
         //setAverageCharTolerance(1.0f);
         //setSpacingTolerance(1.0f);
-        setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
+        
setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
     }
 
     void extractBookmarkText() throws SAXException {
@@ -190,7 +188,7 @@ class PDF2XHTML extends PDFTextStripper 
         try {
             writeParagraphEnd();
             // TODO: remove once PDFBOX-1143 is fixed:
-            if (extractAnnotationText) {
+            if (config.getExtractAnnotationText()) {
                 for(Object o : page.getAnnotations()) {
                     if( o instanceof PDAnnotationLink ) {
                         PDAnnotationLink annotationlink = (PDAnnotationLink) o;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
Fri Dec  6 19:49:43 2013
@@ -64,22 +64,7 @@ public class PDFParser extends AbstractP
     /** Serial version UID */
     private static final long serialVersionUID = -752276948656079347L;
 
-    // True if we let PDFBox "guess" where spaces should go:
-    private boolean enableAutoSpace = true;
-
-    // True if we let PDFBox remove duplicate overlapping text:
-    private boolean suppressDuplicateOverlappingText;
-
-    // True if we extract annotation text ourselves
-    // (workaround for PDFBOX-1143):
-    private boolean extractAnnotationText = true;
-
-    // True if we should sort text tokens by position
-    // (necessary for some PDFs, but messes up other PDFs):
-    private boolean sortByPosition = false;
-
-    //True if we should use PDFBox's NonSequentialParser
-    private boolean useNonSequentialParser = false;
+    private PDFParserConfig config = new PDFParserConfig();
     /**
      * Metadata key for giving the document password to the parser.
      *
@@ -108,7 +93,7 @@ public class PDFParser extends AbstractP
             //  for unpacked / processed resources
             // Decide which to do based on if we're reading from a file or not 
already
             TikaInputStream tstream = TikaInputStream.cast(stream);
-            if (useNonSequentialParser == true) {
+            if (config.getUseNonSequentialParser() == true) {
                   RandomAccess scratchFile = new 
RandomAccessFile(tmp.createTemporaryFile(), "rw");
                   pdfDocument = PDDocument.loadNonSeq(new 
CloseShieldInputStream(stream), scratchFile);
             } else if (tstream != null && tstream.hasFile()) {
@@ -148,9 +133,7 @@ public class PDFParser extends AbstractP
             }
             metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
             extractMetadata(pdfDocument, metadata);
-            PDF2XHTML.process(pdfDocument, handler, context, metadata,
-                              extractAnnotationText, enableAutoSpace,
-                              suppressDuplicateOverlappingText, 
sortByPosition);
+            PDF2XHTML.process(pdfDocument, handler, context, metadata, config);
             
         } finally {
             if (pdfDocument != null) {
@@ -244,18 +227,31 @@ public class PDFParser extends AbstractP
         }
     }
 
+    public void setPDFParserConfig(PDFParserConfig config){
+        this.config = config;
+    }
+    
+    public PDFParserConfig getPDFParserConfig(){
+        return config;
+    }
+    
     /**
      * If true, the parser will use the NonSequentialParser.  This may
      * be faster than the full doc parser.
      * If false (default), this will use the full doc parser.
+     * 
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setUseNonSequentialParser(boolean v){
-        useNonSequentialParser = v;
+        config.setUseNonSequentialParser(v);
     }
     
-    /** @see #setUseNonSequentialParser(boolean) */
+    /** 
+     * @see #setUseNonSequentialParser(boolean) 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getUseNonSequentialParser(){
-        return useNonSequentialParser;
+        return config.getUseNonSequentialParser();
     }
     
     /**
@@ -263,29 +259,37 @@ public class PDFParser extends AbstractP
      *  where spaces should be inserted between words.  For
      *  many PDFs this is necessary as they do not include
      *  explicit whitespace characters.
+     *
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setEnableAutoSpace(boolean v) {
-        enableAutoSpace = v;
+        config.setEnableAutoSpace(v);
     }
 
-    /** @see #setEnableAutoSpace. */
+    /** 
+     * @see #setEnableAutoSpace(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getEnableAutoSpace() {
-        return enableAutoSpace;
+        return config.getEnableAutoSpace();
     }
 
     /**
      * If true (the default), text in annotations will be
      * extracted.
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setExtractAnnotationText(boolean v) {
-        extractAnnotationText = v;
+        config.setExtractAnnotationText(v);
     }
 
     /**
      * If true, text in annotations will be extracted.
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
      */
     public boolean getExtractAnnotationText() {
-        return extractAnnotationText;
+        return config.getExtractAnnotationText();
     }
 
     /**
@@ -296,14 +300,20 @@ public class PDFParser extends AbstractP
      *  slow down extraction substantially (PDFBOX-956) and
      *  sometimes remove characters that were not in fact
      *  duplicated (PDFBOX-1155).  By default this is disabled.
+     *  
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSuppressDuplicateOverlappingText(boolean v) {
-        suppressDuplicateOverlappingText = v;
+        config.setSuppressDuplicateOverlappingText(v);
     }
 
-    /** @see #setSuppressDuplicateOverlappingText. */
+    /** 
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getSuppressDuplicateOverlappingText() {
-        return suppressDuplicateOverlappingText;
+        return config.getSuppressDuplicateOverlappingText();
     }
 
     /**
@@ -313,14 +323,20 @@ public class PDFParser extends AbstractP
      *  order"), while for other PDFs it can produce the
      *  wrong result (for example if there are 2 columns,
      *  the text will be interleaved).  Default is false.
+     *  
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSortByPosition(boolean v) {
-        sortByPosition = v;
+        config.setSortByPosition(v);
     }
 
-    /** @see #setSortByPosition. */
+    /** 
+     * @see #setSortByPosition(boolean)
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getSortByPosition() {
-        return sortByPosition;
+        return config.getSortByPosition();
     }
 
 }

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1548700&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 Fri Dec  6 19:49:43 2013
@@ -0,0 +1,252 @@
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Properties;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Config for PDFParser.
+ * 
+ * This allows parameters to be set programmatically:
+ * <ol>
+ * <li>Calls to PDFParser, e.g. 
parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
+ * <li>Constructor of PDFParser</li>
+ * <li>Passing to PDFParser through a ParseContext: 
context.set(PDFParserConfig.class, config);</li>
+ * </ol>
+ * 
+ * Parameters can also be set by modifying the PDFParser.properties file,
+ * which lives here in trunk:
+ * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
+ * 
+ * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
+ * org/apache/tika/parser/pdf
+ *
+ */
+public class PDFParserConfig implements Serializable{
+
+    private static final long serialVersionUID = 6492570218190936986L;
+
+    // True if we let PDFBox "guess" where spaces should go:
+    private boolean enableAutoSpace = true;
+
+    // True if we let PDFBox remove duplicate overlapping text:
+    private boolean suppressDuplicateOverlappingText;
+
+    // True if we extract annotation text ourselves
+    // (workaround for PDFBOX-1143):
+    private boolean extractAnnotationText = true;
+
+    // True if we should sort text tokens by position
+    // (necessary for some PDFs, but messes up other PDFs):
+    private boolean sortByPosition = false;
+
+    //True if we should use PDFBox's NonSequentialParser
+    private boolean useNonSequentialParser = false;
+
+    public PDFParserConfig(){
+        init(this.getClass().getResourceAsStream("PDFParser.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * If there is an IOException, this silently swallows the exception
+     * and goes back to the default.
+     * 
+     * @param is
+     */
+    public PDFParserConfig(InputStream is){
+        init(is);
+    }
+
+    //initializes object and then tries to close inputstream
+    private void init(InputStream is){
+
+        if (is == null){
+            return;
+        }
+        Properties props = new Properties();
+        try{
+            props.load(is);
+        } catch (IOException e){
+        } finally {
+            if (is != null){
+                try{
+                    is.close();
+                } catch (IOException e){
+                    //swallow
+                }
+            }
+        }
+        setEnableAutoSpace(
+                getProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
+        setSuppressDuplicateOverlappingText(
+                getProp(props.getProperty("suppressDuplicateOverlappingText"), 
+                        getSuppressDuplicateOverlappingText()));
+        setExtractAnnotationText(
+                getProp(props.getProperty("extractAnnotationText"), 
+                        getExtractAnnotationText()));
+        setSortByPosition(
+                getProp(props.getProperty("sortByPosition"), 
+                        getSortByPosition()));
+        setUseNonSequentialParser(
+                getProp(props.getProperty("useNonSequentialParser"), 
+                        getUseNonSequentialParser()));
+    }
+
+    /** @see #setEnableAutoSpace(boolean) */
+    public boolean getEnableAutoSpace() {
+        return enableAutoSpace;
+    }
+
+    /**
+     *  If true (the default), the parser should estimate
+     *  where spaces should be inserted between words.  For
+     *  many PDFs this is necessary as they do not include
+     *  explicit whitespace characters.
+     */
+    public void setEnableAutoSpace(boolean enableAutoSpace) {
+        this.enableAutoSpace = enableAutoSpace;
+    }
+
+    /** @see #setSuppressDuplicateOverlappingText(boolean)*/
+    public boolean getSuppressDuplicateOverlappingText() {
+        return suppressDuplicateOverlappingText;
+    }
+
+    /**
+     *  If true, the parser should try to remove duplicated
+     *  text over the same region.  This is needed for some
+     *  PDFs that achieve bolding by re-writing the same
+     *  text in the same area.  Note that this can
+     *  slow down extraction substantially (PDFBOX-956) and
+     *  sometimes remove characters that were not in fact
+     *  duplicated (PDFBOX-1155).  By default this is disabled.
+     */
+    public void setSuppressDuplicateOverlappingText(
+            boolean suppressDuplicateOverlappingText) {
+        this.suppressDuplicateOverlappingText = 
suppressDuplicateOverlappingText;
+    }
+
+    /** @see #setExtractAnnotationText(boolean)*/
+    public boolean getExtractAnnotationText() {
+        return extractAnnotationText;
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     */
+    public void setExtractAnnotationText(boolean extractAnnotationText) {
+        this.extractAnnotationText = extractAnnotationText;
+    }
+    /** @see #setSortByPosition(boolean)*/
+    public boolean getSortByPosition() {
+        return sortByPosition;
+    }
+
+    /**
+     *  If true, sort text tokens by their x/y position
+     *  before extracting text.  This may be necessary for
+     *  some PDFs (if the text tokens are not rendered "in
+     *  order"), while for other PDFs it can produce the
+     *  wrong result (for example if there are 2 columns,
+     *  the text will be interleaved).  Default is false.
+     */
+    public void setSortByPosition(boolean sortByPosition) {
+        this.sortByPosition = sortByPosition;
+    }
+
+    /** @see #setUseNonSequentialParser(boolean)*/
+    public boolean getUseNonSequentialParser() {
+        return useNonSequentialParser;
+    }
+
+    /**
+     * If true, uses PDFBox's non-sequential parser.
+     * The non-sequential parser should be much faster than the traditional
+     * full doc parser.  However, until PDFBOX-XXX is fixed, 
+     * the non-sequential parser fails
+     * to extract some document metadata.
+     * <p>
+     * Default is false (use the traditional parser)
+     * @param useNonSequentialParser
+     */
+    public void setUseNonSequentialParser(boolean useNonSequentialParser) {
+        this.useNonSequentialParser = useNonSequentialParser;
+    }
+
+    private boolean getProp(String p, boolean defaultMissing){
+        if (p == null){
+            return defaultMissing;
+        }
+        if (p.toLowerCase().equals("true")){
+            return true;
+        } else if (p.toLowerCase().equals("false")){
+            return false;
+        } else {
+            return defaultMissing;
+        }
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + (enableAutoSpace ? 1231 : 1237);
+        result = prime * result + (extractAnnotationText ? 1231 : 1237);
+        result = prime * result + (sortByPosition ? 1231 : 1237);
+        result = prime * result
+                + (suppressDuplicateOverlappingText ? 1231 : 1237);
+        result = prime * result + (useNonSequentialParser ? 1231 : 1237);
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PDFParserConfig other = (PDFParserConfig) obj;
+        if (enableAutoSpace != other.enableAutoSpace)
+            return false;
+        if (extractAnnotationText != other.extractAnnotationText)
+            return false;
+        if (sortByPosition != other.sortByPosition)
+            return false;
+        if (suppressDuplicateOverlappingText != 
other.suppressDuplicateOverlappingText)
+            return false;
+        if (useNonSequentialParser != other.useNonSequentialParser)
+            return false;
+        return true;
+    }
+
+    @Override
+    public String toString() {
+        return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
+                + ", suppressDuplicateOverlappingText="
+                + suppressDuplicateOverlappingText + ", extractAnnotationText="
+                + extractAnnotationText + ", sortByPosition=" + sortByPosition
+                + ", useNonSequentialParser=" + useNonSequentialParser + "]";
+    }
+
+}

Added: 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1548700&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 (added)
+++ 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 Fri Dec  6 19:49:43 2013
@@ -0,0 +1,20 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+enableAutoSpace true
+extractAnnotationText true
+sortByPosition false
+suppressDuplicateOverlappingText       false
+useNonSequentialParser false
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java 
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Dec 
 6 19:49:43 2013
@@ -27,6 +27,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ToXMLContentHandler;
 import org.xml.sax.ContentHandler;
 
@@ -100,4 +101,32 @@ public abstract class TikaTest extends T
         }
     }
 
+    /**
+     * Basic text extraction.
+     * <p>
+     * Tries to close input stream after processing.
+     */
+    public String getText(InputStream is, Parser parser, ParseContext context, 
Metadata metadata) throws Exception{
+        ContentHandler handler = new BodyContentHandler();
+        try {
+            parser.parse(is, handler, metadata, context);
+        } finally {
+            is.close();
+        }
+        return handler.toString();
+    }
+    
+    public String getText(InputStream is, Parser parser, Metadata metadata) 
throws Exception{
+        return getText(is, parser, new ParseContext(), metadata);
+    }
+
+    public String getText(InputStream is, Parser parser, ParseContext context) 
throws Exception{
+        return getText(is, parser, context, new Metadata());
+    }
+
+    public String getText(InputStream is, Parser parser) throws Exception{
+        return getText(is, parser, new ParseContext(), new Metadata());
+    }
+
+
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1548700&r1=1548699&r2=1548700&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Fri Dec  6 19:49:43 2013
@@ -47,17 +47,12 @@ public class PDFParserTest extends TikaT
 
     public void testPdfParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
 
         InputStream stream = PDFParserTest.class.getResourceAsStream(
                 "/test-documents/testPDF.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
+
+        String content = getText(stream, parser, metadata);
 
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Bertrand Delacr\u00e9taz", 
metadata.get(TikaCoreProperties.CREATOR));
@@ -69,7 +64,6 @@ public class PDFParserTest extends TikaT
 //        assertEquals("Sat Sep 15 10:02:31 BST 2007", 
metadata.get(Metadata.CREATION_DATE));
 //        assertEquals("Sat Sep 15 10:02:31 BST 2007", 
metadata.get(Metadata.LAST_MODIFIED));
 
-        String content = handler.toString();
         assertTrue(content.contains("Apache Tika"));
         assertTrue(content.contains("Tika - Content Analysis Toolkit"));
         assertTrue(content.contains("incubator"));
@@ -83,17 +77,12 @@ public class PDFParserTest extends TikaT
 
     public void testCustomMetadata() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
 
         InputStream stream = PDFParserTest.class.getResourceAsStream(
                 "/test-documents/testPDF-custommetadata.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
+
+        String content = getText(stream, parser, metadata);
 
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Document author", 
metadata.get(TikaCoreProperties.CREATOR));
@@ -106,7 +95,6 @@ public class PDFParserTest extends TikaT
         assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
         assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
         
-        String content = handler.toString();
         assertTrue(content.contains("Hello World!"));
     }
     
@@ -174,38 +162,20 @@ public class PDFParserTest extends TikaT
 
     public void testTwoTextBoxes() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-
         InputStream stream = PDFParserTest.class.getResourceAsStream(
                 "/test-documents/testPDFTwoTextBoxes.pdf");
-        try {
-          parser.parse(stream, handler, metadata, context);
-        } finally {
-          stream.close();
-        }
-
-        String content = handler.toString();
+        String content = getText(stream, parser);
         content = content.replaceAll("\\s+"," ");
         assertTrue(content.contains("Left column line 1 Left column line 2 
Right column line 1 Right column line 2"));
     }
 
     public void testVarious() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-
         InputStream stream = PDFParserTest.class.getResourceAsStream(
                 "/test-documents/testPDFVarious.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
 
-        String content = handler.toString();
+        String content = getText(stream, parser, metadata);
         //content = content.replaceAll("\\s+"," ");
         assertContains("Footnote appears here", content);
         assertContains("This is a footnote.", content);
@@ -266,37 +236,33 @@ public class PDFParserTest extends TikaT
 
     public void testAnnotations() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
         InputStream stream = 
getResourceAsStream("/test-documents/testAnnotations.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        String content = handler.toString();
+        String content = getText(stream, parser);
         content = content.replaceAll("[\\s\u00a0]+"," ");
         assertContains("Here is some text", content);
         assertContains("Here is a comment", content);
 
         // Test w/ annotation text disabled:
         PDFParser pdfParser = new PDFParser();
-        pdfParser.setExtractAnnotationText(false);
-        handler = new BodyContentHandler();
-        metadata = new Metadata();
-        context = new ParseContext();
+        pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
         stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
-        try {
-            pdfParser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        content = handler.toString();
+        content = getText(stream, pdfParser);
         content = content.replaceAll("[\\s\u00a0]+"," ");
         assertContains("Here is some text", content);
         assertEquals(-1, content.indexOf("Here is a comment"));
 
+        // annotation text disabled through parsecontext
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractAnnotationText(false);
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        content = getText(stream, parser, context);
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+        
+        
         // TIKA-738: make sure no extra </p> tags
         String xml = getXML("testAnnotations.pdf").xml;
         assertEquals(substringCount("<p>", xml),
@@ -306,16 +272,8 @@ public class PDFParserTest extends TikaT
     // TIKA-981
     public void testPopupAnnotation() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
         InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        String content = handler.toString();
+        String content = getText(stream, parser);
         assertContains("this is the note", content);
         assertContains("igalsh", content);
     }
@@ -362,100 +320,110 @@ public class PDFParserTest extends TikaT
 
     public void testDisableAutoSpace() throws Exception {
         PDFParser parser = new PDFParser();
-        parser.setEnableAutoSpace(false);
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
         InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        String content = handler.toString();
+        String content = getText(stream, parser);
         content = content.replaceAll("[\\s\u00a0]+"," ");
         // Text is correct when autoSpace is off:
         assertContains("Here is some formatted text", content);
 
-        parser.setEnableAutoSpace(true);
-        handler = new BodyContentHandler();
-        metadata = new Metadata();
-        context = new ParseContext();
+        parser.getPDFParserConfig().setEnableAutoSpace(true);
         stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        content = handler.toString();
+        content = getText(stream, parser);
         content = content.replaceAll("[\\s\u00a0]+"," ");
         // Text is correct when autoSpace is off:
 
         // Text has extra spaces when autoSpace is on
         assertEquals(-1, content.indexOf("Here is some formatted text"));
+        
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        //default is true
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+        config.setEnableAutoSpace(false);
+        
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, parser, context);
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+        
     }
 
     public void testDuplicateOverlappingText() throws Exception {
         PDFParser parser = new PDFParser();
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
         InputStream stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
         // Default is false (keep overlapping text):
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        String content = handler.toString();
+        String content = getText(stream, parser);
         assertContains("Text the first timeText the second time", content);
 
-        parser.setSuppressDuplicateOverlappingText(true);
-        handler = new BodyContentHandler();
-        metadata = new Metadata();
-        context = new ParseContext();
+        parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
         stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        content = handler.toString();
+        content = getText(stream, parser);
+        // "Text the first" was dedup'd:
+        assertContains("Text the first timesecond time", content);
+        
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        content = getText(stream, autoParser, context);
+        assertContains("Text the first timeText the second time", content);
+
+        config.setSuppressDuplicateOverlappingText(true);
+        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        content = getText(stream, autoParser, context);
         // "Text the first" was dedup'd:
         assertContains("Text the first timesecond time", content);
+
     }
 
     public void testSortByPosition() throws Exception {
         PDFParser parser = new PDFParser();
-        parser.setEnableAutoSpace(false);
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
         InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
         // Default is false (do not sort):
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        String content = handler.toString();
+        String content = getText(stream, parser);
         content = content.replaceAll("\\s+", " ");
         assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
 
-        parser.setSortByPosition(true);
-        handler = new BodyContentHandler();
-        metadata = new Metadata();
-        context = new ParseContext();
+        parser.getPDFParserConfig().setSortByPosition(true);
         stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
-        try {
-            parser.parse(stream, handler, metadata, context);
-        } finally {
-            stream.close();
-        }
-        content = handler.toString();
+        content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+        
+        //now try setting autodetect via parsecontext        
+        AutoDetectParser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+        
+        config.setSortByPosition(true);
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        content = getText(stream, parser);
         content = content.replaceAll("\\s+", " ");
         // Column text is now interleaved:
         assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+        
     }
 
     // TIKA-1035
@@ -527,7 +495,7 @@ public class PDFParserTest extends TikaT
     public void testSequentialParser() throws Exception{
         PDFParser defaultParser = new PDFParser();
         PDFParser sequentialParser = new PDFParser();
-        sequentialParser.setUseNonSequentialParser(true);
+        sequentialParser.getPDFParserConfig().setUseNonSequentialParser(true);
         File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
         int pdfs = 0;
         for (File f : testDocs.listFiles()){
@@ -536,10 +504,10 @@ public class PDFParserTest extends TikaT
             }
             pdfs++;
             Metadata defaultMetadata = new Metadata();
-            String defaultContent = getText(f, defaultParser, defaultMetadata);
+            String defaultContent = getText(new FileInputStream(f), defaultParser, defaultMetadata);
 
             Metadata sequentialMetadata = new Metadata();
-            String sequentialContent = getText(f, sequentialParser, sequentialMetadata);
+            String sequentialContent = getText(new FileInputStream(f), sequentialParser, sequentialMetadata);
             
             assertEquals(f.getName(), defaultContent, sequentialContent);
             //TODO: until PDFBox fixes metadata extraction for this file,
@@ -553,17 +521,5 @@ public class PDFParserTest extends TikaT
         assertEquals("Number of pdf files tested", 14, pdfs);
     }
 
-    private String getText(File f, PDFParser parser, Metadata metadata) throws Exception{
-        ContentHandler handler = new BodyContentHandler();
-        ParseContext context = new ParseContext();
-        FileInputStream is = null;
-        try {
-            is = new FileInputStream(f);
-            parser.parse(is, handler, metadata, context);
-        } finally {
-            is.close();
-        }
-        return handler.toString();
-    }
 
 }

svn commit: r1548700 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/ main/resources/org/apache/tika/parser/pdf/ test/java/org/apache/tika/ test/java/org/apache/tika/parser/pdf/

Reply via email to