This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit f51ae0aef10a052308d6830fcefad77347bc5ebd Author: tallison <[email protected]> AuthorDate: Tue Apr 14 11:09:52 2020 -0400 TIKA-3091 prevent npe in PDFParserConfig by initializing three parameters with default values. # Conflicts: # tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java --- .../java/org/apache/tika/parser/pdf/PDFParser.java | 11 +++++++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 28 ++++++++++++++++++++-- .../org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 2e637e0..6d8b5b1 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -524,6 +524,17 @@ public class PDFParser extends AbstractParser implements Initializable { } @Field + void setAverageCharTolerance(float averageCharTolerance) { + defaultConfig.setAverageCharTolerance(averageCharTolerance); + } + + @Field + void setSpacingTolerance(float spacingTolerance) { + defaultConfig.setSpacingTolerance(spacingTolerance); + } + + + @Field void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) { defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index b5d6824..da8b309 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -114,10 +114,16 @@ public class PDFParserConfig implements Serializable { private boolean extractMarkedContent = false; //The character width-based tolerance value used to estimate where spaces in text should be added - private Float averageCharTolerance; + //Default taken from PDFBox. + private Float averageCharTolerance = 0.5f; //The space width-based tolerance value used to estimate where spaces in text should be added - private Float spacingTolerance; + //Default taken from PDFBox. + private Float spacingTolerance = 0.3f; + + // The multiplication factor for line height to decide when a new paragraph starts. + //Default taken from PDFBox. + private Float dropThreshold = 2.5f; //If the PDF has an XFA element, process only that and skip extracting //content from elsewhere in the document. @@ -238,6 +244,10 @@ public class PDFParserConfig implements Serializable { setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false)); + setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance)); + setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance)); + setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold)); + boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true); @@ -287,6 +297,9 @@ public class PDFParserConfig implements Serializable { if (getSpacingTolerance() != null) { pdf2XHTML.setSpacingTolerance(getSpacingTolerance()); } + if (getDropThreshold() != null) { + pdf2XHTML.setDropThreshold(dropThreshold); + } pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); } @@ -513,6 +526,14 @@ public class PDFParserConfig implements Serializable { this.spacingTolerance = spacingTolerance; } + public Float getDropThreshold() { + return dropThreshold; + } + + public void setDropThreshold(float dropThreshold) { + this.dropThreshold = dropThreshold; + } + public AccessChecker getAccessChecker() { return accessChecker; } @@ -824,6 +845,7 @@ public class PDFParserConfig implements Serializable { if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false; if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false; if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false; + if (!getDropThreshold().equals(config.getDropThreshold())) return false; if (!getOcrStrategy().equals(config.getOcrStrategy())) return false; if (getOcrImageType() != config.getOcrImageType()) return false; if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false; @@ -844,6 +866,7 @@ public class PDFParserConfig implements Serializable { result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0); result = 31 * result + getAverageCharTolerance().hashCode(); result = 31 * result + getSpacingTolerance().hashCode(); + result = 31 * result + getDropThreshold().hashCode(); result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0); result = 31 * result + ocrStrategy.hashCode(); result = 31 * result + getOcrDPI(); @@ -869,6 +892,7 @@ public class PDFParserConfig implements Serializable { ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly + ", averageCharTolerance=" + averageCharTolerance + ", spacingTolerance=" + spacingTolerance + + ", dropThreshold=" + dropThreshold + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", ocrStrategy=" + ocrStrategy + ", ocrDPI=" + ocrDPI + diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index edcd513..4e2e3c5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1528,6 +1528,16 @@ public class PDFParserTest extends TikaTest { } + @Test + public void testNPEInPDFParserConfig() { + //TIKA-3091 + PDFParserConfig config = new PDFParserConfig(); + //don't care about values; want to make sure no NPE is thrown + String txt = config.toString(); + config.hashCode(); + config.equals(new PDFParserConfig()); + } + @Test //TIKA-3041 @Ignore("turn back on if we add file from PDFBOX-52") public void testPDFBox52() throws Exception {
