This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 484a340 TIKA-3361 Make ocrStrategy=Auto more intelligent (#447) 484a340 is described below commit 484a340a4643ed2335413ba4feddbe8d64f4e9d8 Author: Peter Kronenberg <pakronenb...@gmail.com> AuthorDate: Fri Jul 16 15:42:38 2021 -0400 TIKA-3361 Make ocrStrategy=Auto more intelligent (#447) Co-authored-by: Peter Kronenberg <peter.kronenb...@torch.ai> --- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 17 +++-- .../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++ .../apache/tika/parser/pdf/PDFParserConfig.java | 81 ++++++++++++++++++++++ 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 79a4160..968e97c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -583,12 +583,19 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } } - if (config.getOcrStrategy() - .equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) { doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION); - } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) { - //TODO add more sophistication - if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) { + } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) { + boolean unmappedExceedsLimit = false; + if (totalCharsPerPage > 
config.getOcrStrategyAuto().getTotalCharsPerPage()) { + // There are enough characters to not have to do OCR. Check number of unmapped characters + final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage; + final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage(); + unmappedExceedsLimit = (unmappedCharacterLimit < 1) + ? percentUnmapped > unmappedCharacterLimit + : unmappedUnicodeCharsPerPage > unmappedCharacterLimit; + } + if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) { doOCROnCurrentPage(AUTO); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 0351da1..0300705 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -478,6 +478,11 @@ public class PDFParser extends AbstractParser implements Initializable { } @Field + public void setOcrStrategyAuto(String ocrStrategyAuto) { + defaultConfig.setOcrStrategyAuto(ocrStrategyAuto); + } + + @Field public void setOcrRenderingStrategy(String ocrRenderingStrategy) { defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index b665e4c..c74281b 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -22,6 +22,8 @@ import java.lang.reflect.Modifier; import java.util.HashSet; import java.util.Locale; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.text.PDFTextStripper; @@ -96,6 +98,13 @@ public class PDFParserConfig implements Serializable { private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO; + // If OCR_Strategy=AUTO, then this controls the algorithm used + private static final OCRStrategyAuto OCR_STRATEGY_AUTO_BETTER = new OCRStrategyAuto(10, 10); + private static final OCRStrategyAuto OCR_STRATEGY_AUTO_FASTER = new OCRStrategyAuto(.1f, 10); + private static final int OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE = 10; + + private OCRStrategyAuto ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER; + private OCR_RENDERING_STRATEGY ocrRenderingStrategy = OCR_RENDERING_STRATEGY.NO_TEXT; private int ocrDPI = 300; @@ -485,6 +494,13 @@ public class PDFParserConfig implements Serializable { } /** + * @return ocr auto strategy to use when ocr_strategy = Auto + */ + public OCRStrategyAuto getOcrStrategyAuto() { + return ocrStrategyAuto; + } + + /** * Which strategy to use for OCR * * @param ocrStrategy @@ -494,6 +510,41 @@ public class PDFParserConfig implements Serializable { userConfigured.add("ocrStrategy"); } + + public void setOcrStrategyAuto(String ocrStrategyAuto) { + final String regex = "^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(ocrStrategyAuto); + if (matcher.matches()) { + final String group1 = matcher.group(1); + + if ("better".equals(group1)) { + 
this.ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER; + } else if ("faster".equals(group1)) { + this.ocrStrategyAuto = OCR_STRATEGY_AUTO_FASTER; + } else { + float unmappedUnicodeCharsPerPage = Integer.parseInt(matcher.group(2)); + if (matcher.group(3) != null) { + // If we have the percent sign, then convert + if (unmappedUnicodeCharsPerPage > 100.0) { + throw new IllegalArgumentException + ("Error parsing OCRStrategyAuto - Percent cannot exceed 100%"); + } + unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage / 100f; + } + // The 2nd number is optional. Default to 10 chars per page + int totalCharsPerPage = matcher.group(4) == null + ? OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE + : Integer.parseInt(matcher.group(4)); + this.ocrStrategyAuto = new OCRStrategyAuto(unmappedUnicodeCharsPerPage, totalCharsPerPage); + } + userConfigured.add("ocrStrategyAuto"); + + } else { + throw new IllegalArgumentException("Error parsing OCRStrategyAuto - Must be in the form 'num[%], num'"); + } + } + /** * Which strategy to use for OCR * @@ -878,6 +929,36 @@ public class PDFParserConfig implements Serializable { } } + /** + * Encapsulate the numbers used to control OCR Strategy when set to auto + * <p> + * If the total characters on the page < this.totalCharsPerPage + * or + * total unmapped unicode characters on the page > this.unmappedUnicodeCharsPerPage + * then we will perform OCR on the page + * <p> + * If unmappedUnicodeCharsPerPage is an integer > 0, then we compare absolute number of characters. 
+ * If it is a float < 1, then we assume it is a percentage and we compare it to the + * percentage of unmappedCharactersPerPage/totalCharsPerPage + */ + public static class OCRStrategyAuto implements Serializable { + private final float unmappedUnicodeCharsPerPage; + private final int totalCharsPerPage; + + public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) { + this.totalCharsPerPage = totalCharsPerPage; + this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; + } + + public float getUnmappedUnicodeCharsPerPage() { + return unmappedUnicodeCharsPerPage; + } + + public int getTotalCharsPerPage() { + return totalCharsPerPage; + } + } + public enum OCR_RENDERING_STRATEGY { NO_TEXT, ALL; //AUTO? // Would TEXT_ONLY be useful in instances where the unicode mappings