[PDFParser] - patch proposal

Stefano Fornari Sat, 27 Sep 2014 06:09:07 -0700

Hi All,
with regard to the thread "[PDFParser] - read limited number of
characters" on Mar 29, I would like to propose the attached patch. I
noticed that in Tika 1.6 there has been some work on better handling
of the WriteLimitReachedException condition, but I believe it could be
improved even further.


What do you think?
Ste

Index: tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java	(revision 1627940)
+++ tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java	(working copy)
@@ -50,6 +50,11 @@
     private int writeCount = 0;
 
     /**
+     * Flag to mark if the limit has been reached
+     */
+    private boolean writeLimitReached = false;
+
+    /**
      * Creates a content handler that writes content up to the given
      * write limit to the given content handler.
      *
@@ -138,6 +143,7 @@
         } else {
             super.characters(ch, start, writeLimit - writeCount);
             writeCount = writeLimit;
+            writeLimitReached = true;
             throw new WriteLimitReachedException(
                     "Your document contained more than " + writeLimit
                     + " characters, and so your requested limit has been"
@@ -156,6 +162,7 @@
         } else {
             super.ignorableWhitespace(ch, start, writeLimit - writeCount);
             writeCount = writeLimit;
+            writeLimitReached = true;
             throw new WriteLimitReachedException(
                     "Your document contained more than " + writeLimit
                     + " characters, and so your requested limit has been"
@@ -173,31 +180,26 @@
      * @param t throwable
      * @return <code>true</code> if the write limit was reached,
      *         <code>false</code> otherwise
+     * 
+     * Deprecated in Tika 1.6, use isWriteLimitReached(); the current 
+     * implementation ignores the given Throwable and is equivalent to 
+     * isWriteLimitReached()
+     * 
      */
+    @Deprecated
     public boolean isWriteLimitReached(Throwable t) {
-        if (t instanceof WriteLimitReachedException) {
-            return tag.equals(((WriteLimitReachedException) t).tag);
-        } else {
-            return t.getCause() != null && isWriteLimitReached(t.getCause());
-        }
+        return isWriteLimitReached();
     }
-
+    
     /**
-     * The exception used as a signal when the write limit has been reached.
+     * Returns true if the limit has been reached, false otherwise.
+     *
+     * @since Apache Tika 1.6
+     * @return <code>true</code> if the write limit was reached,
+     *         <code>false</code> otherwise
      */
-    private static class WriteLimitReachedException extends SAXException {
-
-        /** Serial version UID */
-        private static final long serialVersionUID = -1850581945459429943L;
-
-        /** Serializable tag of the handler that caused this exception */
-        private final Serializable tag;
-
-        public WriteLimitReachedException(String message, Serializable tag) {
-           super(message);
-           this.tag = tag;
-        }
-
+    public boolean isWriteLimitReached() {
+        return writeLimitReached;
     }
 
 }
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java	(revision 1627940)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java	(working copy)
@@ -52,6 +52,7 @@
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.WriteLimitReachedException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -157,7 +158,13 @@
             metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
             extractMetadata(pdfDocument, metadata);
             if (handler != null) {
-                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+                try {
+                    PDF2Text.process(pdfDocument, handler, context, metadata, localConfig);
+                } catch (WriteLimitReachedException x) {
+                    //
+                    // This is a valid condition; just ignoring the exception
+                    //
+                }
             }
             
         } finally {
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java	(revision 1627940)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java	(working copy)
@@ -144,21 +144,21 @@
      * 
      * @param pdf2XHTML
      */
-    public void configure(PDF2XHTML pdf2XHTML) {
-        pdf2XHTML.setForceParsing(true);
-        pdf2XHTML.setSortByPosition(getSortByPosition());
+    public void configure(PDF2Text pdf2text) {
+        pdf2text.setForceParsing(true);
+        pdf2text.setSortByPosition(getSortByPosition());
         if (getEnableAutoSpace()) {
-            pdf2XHTML.setWordSeparator(" ");
+            pdf2text.setWordSeparator(" ");
         } else {
-            pdf2XHTML.setWordSeparator("");
+            pdf2text.setWordSeparator("");
         }
         if (getAverageCharTolerance() != null) {
-            pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
+            pdf2text.setAverageCharTolerance(getAverageCharTolerance());
         }
         if (getSpacingTolerance() != null) {
-            pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
+            pdf2text.setSpacingTolerance(getSpacingTolerance());
         }
-        pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
+        pdf2text.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
     }
 
     
Index: tika-parsers/src/test/java/org/apache/tika/TikaTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/TikaTest.java	(revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/TikaTest.java	(working copy)
@@ -16,9 +16,7 @@
  */
 package org.apache.tika;
 
-import org.apache.tika.extractor.EmbeddedResourceHandler;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
+
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -38,6 +36,9 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -102,6 +103,10 @@
     protected XMLResult getXML(String filePath) throws Exception {
         return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
     }
+    
+    protected String getText(String filePath) throws Exception {
+        return getText(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
+    }
 
     protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
       ParseContext context = new ParseContext();
Index: tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java	(revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java	(working copy)
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.pdf;
 
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.DocumentSelector;
@@ -910,7 +911,30 @@
 
     }
 
-
+   @Test
+    public void testLimitTextToParse() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        
+        new PDFParser().parse(
+            getResourceAsStream("/test-documents/testPDF.pdf"), 
+            handler, 
+            new Metadata(), 
+            new ParseContext()
+        );
+        
+        assertEquals(1067, handler.toString().length());
+        
+        handler = new BodyContentHandler(500);
+        
+        new PDFParser().parse(
+            getResourceAsStream("/test-documents/testPDF.pdf"), 
+            handler, 
+            new Metadata(), 
+            new ParseContext()
+        );
+        
+        assertEquals(500, handler.toString().length());
+    }
     @Test
     public void testInlineConfig() throws Exception {
         
Index: tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java	(revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java	(working copy)
@@ -91,7 +91,7 @@
 
     @Test
     public void testUmlautSpacesExtraction2() throws Exception {
-        String content = getText("testRTFUmlautSpaces2.rtf");
+        String content = getResultText("testRTFUmlautSpaces2.rtf");
         content = content.replaceAll("\\s+", "");
         assertEquals("\u00DCbersicht", content);
     }
@@ -98,7 +98,7 @@
 
     @Test
     public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
-        String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+        String content = getResultText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
 
         assertContains("\u5E74", content);
         assertContains("\u5ff5", content);
@@ -109,13 +109,13 @@
 
     @Test
     public void testHexEscapeInsideWord() throws Exception {
-        String content = getText("testRTFHexEscapeInsideWord.rtf");
+        String content = getResultText("testRTFHexEscapeInsideWord.rtf");
         assertContains("ESP\u00cdRITO", content);
     }
 
     @Test
     public void testWindowsCodepage1250() throws Exception {
-        String content = getText("testRTFWindowsCodepage1250.rtf");
+        String content = getResultText("testRTFWindowsCodepage1250.rtf");
         assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
         assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
     }
@@ -131,7 +131,7 @@
     
     @Test
     public void testTableCellSeparation2() throws Exception {
-        String content = getText("testRTFTableCellSeparation2.rtf");
+        String content = getResultText("testRTFTableCellSeparation2.rtf");
         // TODO: why do we insert extra whitespace...?
         content = content.replaceAll("\\s+"," ");
         assertContains("Station Fax", content);
@@ -175,7 +175,7 @@
 
     @Test
     public void testGothic() throws Exception {
-        String content = getText("testRTFUnicodeGothic.rtf");
+        String content = getResultText("testRTFUnicodeGothic.rtf");
         assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
     }
 
@@ -231,7 +231,7 @@
 
     @Test
     public void testTextWithCurlyBraces() throws Exception {
-        String content = getText("testRTFWithCurlyBraces.rtf");
+        String content = getResultText("testRTFWithCurlyBraces.rtf");
         assertContains("{ some text inside curly brackets }", content);
     }
 
@@ -597,7 +597,8 @@
         return new Result(content, metadata);
     }
 
-    private String getText(String filename) throws Exception {
+    private String getResultText(String filename) throws Exception {
         return getResult(filename).text;
     }
 }
+    
\ No newline at end of file

[PDFParser] - patch proposal

Reply via email to