This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 80a93bb fix for TIKA-2996 contributed by fsonntag (#297)
80a93bb is described below
commit 80a93bbffa12e4d58a1b6a026d06e680f3132509
Author: Felix Sonntag <[email protected]>
AuthorDate: Fri Nov 22 19:51:09 2019 +0100
fix for TIKA-2996 contributed by fsonntag (#297)
---
.../apache/tika/parser/pdf/PDFParserConfig.java | 23 ++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 196c5ce..b5d54d0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -116,6 +116,9 @@ public class PDFParserConfig implements Serializable {
//The space width-based tolerance value used to estimate where spaces in text should be added
private Float spacingTolerance;
+ // The multiplication factor for line height to decide when a new paragraph starts.
+ private float dropThreshold;
+
//If the PDF has an XFA element, process only that and skip extracting
//content from elsewhere in the document.
private boolean ifXFAExtractOnlyXFA = false;
@@ -254,6 +257,9 @@ public class PDFParserConfig implements Serializable {
if (getSpacingTolerance() != null) {
pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
}
+ if (getDropThreshold() != null) {
+ pdf2XHTML.setDropThreshold(dropThreshold);
+ }
pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}
@@ -469,6 +475,20 @@ public class PDFParserConfig implements Serializable {
this.spacingTolerance = spacingTolerance;
}
+ /**
+ * Returns the drop threshold: the multiplication factor for line height
+ * that is used to decide when a new paragraph starts.
+ * <p>
+ * NOTE(review): the backing field is declared as a primitive {@code float},
+ * so the value is auto-boxed on return and this method never yields
+ * {@code null}; a caller that compares the result against {@code null}
+ * will always see a non-null value.
+ *
+ * @return the current drop threshold, boxed as a {@code Float}
+ * @see #setDropThreshold(Float)
+ */
+ public Float getDropThreshold() {
+ return dropThreshold;
+ }
+
+ /**
+ * Sets the drop threshold: the multiplication factor for line height that
+ * is used to decide when a new paragraph starts.
+ * See {@link PDFTextStripper#setDropThreshold(float)}
+ * <p>
+ * NOTE(review): the backing field is declared as a primitive {@code float},
+ * so the argument is unboxed on assignment; passing {@code null} fails
+ * with a {@link NullPointerException}.
+ *
+ * @param dropThreshold the new drop threshold; must not be {@code null}
+ */
+ public void setDropThreshold(Float dropThreshold) {
+ this.dropThreshold = dropThreshold;
+ }
+
public AccessChecker getAccessChecker() {
return accessChecker;
}
@@ -762,6 +782,7 @@ public class PDFParserConfig implements Serializable {
if (getCatchIntermediateIOExceptions() !=
config.getCatchIntermediateIOExceptions()) return false;
if
(!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return
false;
if (!getSpacingTolerance().equals(config.getSpacingTolerance()))
return false;
+ if (!getDropThreshold().equals(config.getDropThreshold())) return
false;
if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
if (getOcrImageType() != config.getOcrImageType()) return false;
if (!getOcrImageFormatName().equals(config.getOcrImageFormatName()))
return false;
@@ -782,6 +803,7 @@ public class PDFParserConfig implements Serializable {
result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
result = 31 * result + getAverageCharTolerance().hashCode();
result = 31 * result + getSpacingTolerance().hashCode();
+ result = 31 * result + getDropThreshold().hashCode();
result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
result = 31 * result + ocrStrategy.hashCode();
result = 31 * result + getOcrDPI();
@@ -807,6 +829,7 @@ public class PDFParserConfig implements Serializable {
", extractUniqueInlineImagesOnly=" +
extractUniqueInlineImagesOnly +
", averageCharTolerance=" + averageCharTolerance +
", spacingTolerance=" + spacingTolerance +
+ ", dropThreshold=" + dropThreshold +
", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
", ocrStrategy=" + ocrStrategy +
", ocrDPI=" + ocrDPI +