This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 80a93bb  fix for TIKA-2996 contributed by fsonntag (#297)
80a93bb is described below

commit 80a93bbffa12e4d58a1b6a026d06e680f3132509
Author: Felix Sonntag <[email protected]>
AuthorDate: Fri Nov 22 19:51:09 2019 +0100

    fix for TIKA-2996 contributed by fsonntag (#297)
---
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 196c5ce..b5d54d0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -116,6 +116,9 @@ public class PDFParserConfig implements Serializable {
     //The space width-based tolerance value used to estimate where spaces in text should be added
     private Float spacingTolerance;
 
+    // The multiplication factor for line height to decide when a new paragraph starts; null means "not configured".
+    private Float dropThreshold;
+
     //If the PDF has an XFA element, process only that and skip extracting
     //content from elsewhere in the document.
     private boolean ifXFAExtractOnlyXFA = false;
@@ -254,6 +257,9 @@ public class PDFParserConfig implements Serializable {
         if (getSpacingTolerance() != null) {
             pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
         }
+        if (getDropThreshold() != null) {
+            pdf2XHTML.setDropThreshold(dropThreshold);
+        }
         pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
     }
 
@@ -469,6 +475,20 @@ public class PDFParserConfig implements Serializable {
         this.spacingTolerance = spacingTolerance;
     }
 
+    /**
+     * @return the drop threshold currently held by this config; see {@link #setDropThreshold(Float)}
+     */
+    public Float getDropThreshold() {
+        return dropThreshold;
+    }
+
+    /**
+     * Sets the line-height multiplication factor used to decide when a new paragraph starts; see {@link PDFTextStripper#setDropThreshold(float)}.
+     */
+    public void setDropThreshold(Float dropThreshold) {
+        this.dropThreshold = dropThreshold;
+    }
+
     public AccessChecker getAccessChecker() {
         return accessChecker;
     }
@@ -762,6 +782,7 @@ public class PDFParserConfig implements Serializable {
         if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false;
         if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
         if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+        if (!getDropThreshold().equals(config.getDropThreshold())) return false;
         if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
         if (getOcrImageType() != config.getOcrImageType()) return false;
         if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
@@ -782,6 +803,7 @@ public class PDFParserConfig implements Serializable {
         result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
         result = 31 * result + getAverageCharTolerance().hashCode();
         result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + getDropThreshold().hashCode();
         result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
         result = 31 * result + ocrStrategy.hashCode();
         result = 31 * result + getOcrDPI();
@@ -807,6 +829,7 @@ public class PDFParserConfig implements Serializable {
                 ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
                 ", averageCharTolerance=" + averageCharTolerance +
                 ", spacingTolerance=" + spacingTolerance +
+                ", dropThreshold=" + dropThreshold +
                 ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
                 ", ocrStrategy=" + ocrStrategy +
                 ", ocrDPI=" + ocrDPI +

Reply via email to