This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4567 in repository https://gitbox.apache.org/repos/asf/tika.git
commit acad345d6209df34fcaa5deb45cf32f0e043b559 Author: tallison <[email protected]> AuthorDate: Thu Dec 11 10:06:25 2025 -0500 TIKA-4567 -- checkpoint commit --- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 18 +- .../org/apache/tika/parser/pdf/AccessChecker.java | 84 +++---- .../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 2 +- .../java/org/apache/tika/parser/pdf/OcrConfig.java | 190 +++++++++++++++ .../java/org/apache/tika/parser/pdf/PDFParser.java | 42 ++-- .../apache/tika/parser/pdf/PDFParserConfig.java | 257 ++++----------------- .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 2 +- .../apache/tika/parser/pdf/AccessCheckerTest.java | 35 +-- .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../apache/tika/parser/pdf/tika-inline-config.xml | 4 +- .../tika/config/TikaConfigSerializerTest.java | 4 +- .../apache/tika/parser/crypto/TSDParserTest.java | 3 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 42 ++-- 13 files changed, 361 insertions(+), 324 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 0d950ec0e0..a0ef06c96a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -16,10 +16,10 @@ */ package org.apache.tika.parser.pdf; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.AUTO; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_ONLY; +import static org.apache.tika.parser.pdf.OcrConfig.Strategy.AUTO; +import static org.apache.tika.parser.pdf.OcrConfig.Strategy.NO_OCR; +import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_AND_TEXT_EXTRACTION; +import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_ONLY; import java.awt.image.BufferedImage; import java.io.BufferedInputStream; @@ -530,7 +530,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } - void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy) + void doOCROnCurrentPage(PDPage pdPage, OcrConfig.Strategy ocrStrategy) throws IOException, TikaException, SAXException { if (ocrStrategy.equals(NO_OCR)) { //I don't think this is reachable? @@ -597,7 +597,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { Renderer thisRenderer = getPDFRenderer(renderer); //if there's a configured renderer and if the rendering strategy is "all" if (thisRenderer != null && - config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.ALL) { + config.getOcrRenderingStrategy() == OcrConfig.RenderingStrategy.ALL) { PageRangeRequest pageRangeRequest = new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo()); if (thisRenderer instanceof PDDocumentRenderer) { @@ -673,7 +673,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { try { BufferedImage image = - renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getImageType()); + renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getPdfBoxImageType()); //TODO -- get suffix based on OcrImageType tmpFile = tmpResources.createTempFile(); @@ -707,9 +707,9 @@ class AbstractPDF2XHTML extends PDFTextStripper { for (PDAnnotation annotation : page.getAnnotations()) { processPageAnnotation(annotation); } - if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) { + if (config.getOcrStrategy() == OCR_AND_TEXT_EXTRACTION) { doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION); - } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) { + } else if (config.getOcrStrategy() == AUTO) { boolean unmappedExceedsLimit = false; if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) { // There are enough characters to not have to do OCR. Check number of unmapped characters diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java index 4cf307a763..6c294ee5ca 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -28,64 +28,77 @@ import org.apache.tika.metadata.Metadata; */ public class AccessChecker implements Serializable { - private static final long serialVersionUID = 6492570218190936986L; + private static final long serialVersionUID = 6492570218190936987L; - private boolean needToCheck; - private boolean allowExtractionForAccessibility; + /** + * Mode for checking document access permissions. + */ + public enum AccessCheckMode { + /** + * Don't check extraction permissions. Content will always be extracted + * regardless of document permissions. This is the default for backwards + * compatibility with Tika's legacy behavior (<= v1.7). + */ + DONT_CHECK, + + /** + * Check permissions, but allow extraction for accessibility purposes. + * If general extraction is blocked but accessibility extraction is allowed, + * content will be extracted. + */ + ALLOW_EXTRACTION_FOR_ACCESSIBILITY, + + /** + * Enforce document permissions strictly. If extraction is blocked, + * an {@link AccessPermissionException} will be thrown. + */ + ENFORCE_PERMISSIONS + } + + private AccessCheckMode mode; /** - * This constructs an {@link AccessChecker} that - * will not perform any checking and will always return without + * Constructs an {@link AccessChecker} with {@link AccessCheckMode#DONT_CHECK}. + * This will not perform any checking and will always return without * throwing an exception. * <p/> * This constructor is available to allow for Tika's legacy (<= v1.7) behavior. */ public AccessChecker() { - needToCheck = false; - allowExtractionForAccessibility = true; + this.mode = AccessCheckMode.DONT_CHECK; } /** - * This constructs an {@link AccessChecker} that will check - * for whether or not content should be extracted from a document. + * Constructs an {@link AccessChecker} with the specified mode. * - * @param allowExtractionForAccessibility if general extraction is - * not allowed, is extraction for accessibility allowed + * @param mode the access check mode */ - public AccessChecker(boolean allowExtractionForAccessibility) { - needToCheck = true; - this.allowExtractionForAccessibility = allowExtractionForAccessibility; + public AccessChecker(AccessCheckMode mode) { + this.mode = mode; } - public boolean isNeedToCheck() { - return needToCheck; + public AccessCheckMode getMode() { + return mode; } - public void setNeedToCheck(boolean needToCheck) { - this.needToCheck = needToCheck; - } - - public boolean isAllowExtractionForAccessibility() { - return allowExtractionForAccessibility; - } - - public void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { - this.allowExtractionForAccessibility = allowExtractionForAccessibility; + public void setMode(AccessCheckMode mode) { + this.mode = mode; } /** * Checks to see if a document's content should be extracted based - * on metadata values and the value of {@link #allowExtractionForAccessibility} in the constructor. + * on metadata values and the configured {@link AccessCheckMode}. * - * @param metadata + * @param metadata the document metadata containing access permissions * @throws AccessPermissionException if access is not permitted */ public void check(Metadata metadata) throws AccessPermissionException { - if (!needToCheck) { + if (mode == AccessCheckMode.DONT_CHECK) { return; } + if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { - if (allowExtractionForAccessibility) { + if (mode == AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY) { if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { return; } @@ -106,18 +119,11 @@ public class AccessChecker implements Serializable { } AccessChecker checker = (AccessChecker) o; - - if (needToCheck != checker.needToCheck) { - return false; - } - return allowExtractionForAccessibility == checker.allowExtractionForAccessibility; - + return mode == checker.mode; } @Override public int hashCode() { - int result = (needToCheck ? 1 : 0); - result = 31 * result + (allowExtractionForAccessibility ? 1 : 0); - return result; + return mode != null ? mode.hashCode() : 0; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java index 26a9d33151..8eff5c597e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java @@ -94,7 +94,7 @@ class OCR2XHTML extends AbstractPDF2XHTML { public void processPage(PDPage pdPage) throws IOException { try { startPage(pdPage); - doOCROnCurrentPage(pdPage, PDFParserConfig.OCR_STRATEGY.OCR_ONLY); + doOCROnCurrentPage(pdPage, OcrConfig.Strategy.OCR_ONLY); endPage(pdPage); } catch (TikaException | SAXException e) { throw new IOException(e); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java new file mode 100644 index 0000000000..9101b2f6ab --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.Serializable; +import java.util.Locale; + +import org.apache.pdfbox.rendering.ImageType; + +/** + * Configuration for OCR processing in PDF parsing. + * Groups all OCR-related settings together. + */ +public class OcrConfig implements Serializable { + + private static final long serialVersionUID = 1L; + + public enum Strategy { + AUTO, + NO_OCR, + OCR_ONLY, + OCR_AND_TEXT_EXTRACTION + } + + public enum RenderingStrategy { + NO_TEXT, + TEXT_ONLY, + VECTOR_GRAPHICS_ONLY, + ALL + } + + public enum ImageFormat { + PNG, TIFF, JPEG; + + public String getFormatName() { + return name().toLowerCase(Locale.ROOT); + } + } + + public enum ImageType { + RGB(org.apache.pdfbox.rendering.ImageType.RGB), + GRAY(org.apache.pdfbox.rendering.ImageType.GRAY); + + private final org.apache.pdfbox.rendering.ImageType pdfBoxImageType; + + ImageType(org.apache.pdfbox.rendering.ImageType pdfBoxImageType) { + this.pdfBoxImageType = pdfBoxImageType; + } + + public org.apache.pdfbox.rendering.ImageType getPdfBoxImageType() { + return pdfBoxImageType; + } + } + + /** + * Configuration for AUTO strategy behavior. + * Controls when OCR is triggered based on character analysis. + */ + public static class StrategyAuto implements Serializable { + private static final long serialVersionUID = 1L; + + public static final StrategyAuto BETTER = new StrategyAuto(10, 10); + public static final StrategyAuto FASTER = new StrategyAuto(0.1f, 10); + + private float unmappedUnicodeCharsPerPage; + private int totalCharsPerPage; + + public StrategyAuto() { + this(10, 10); + } + + public StrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) { + this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; + this.totalCharsPerPage = totalCharsPerPage; + } + + public float getUnmappedUnicodeCharsPerPage() { + return unmappedUnicodeCharsPerPage; + } + + public void setUnmappedUnicodeCharsPerPage(float unmappedUnicodeCharsPerPage) { + this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; + } + + public int getTotalCharsPerPage() { + return totalCharsPerPage; + } + + public void setTotalCharsPerPage(int totalCharsPerPage) { + this.totalCharsPerPage = totalCharsPerPage; + } + + @Override + public String toString() { + String unmappedString; + if (unmappedUnicodeCharsPerPage < 1.0) { + unmappedString = String.format(Locale.US, "%.03f", + unmappedUnicodeCharsPerPage * 100) + "%"; + } else { + unmappedString = String.format(Locale.US, "%.0f", unmappedUnicodeCharsPerPage); + } + return unmappedString + "," + totalCharsPerPage; + } + } + + private Strategy strategy = Strategy.AUTO; + private StrategyAuto strategyAuto = StrategyAuto.BETTER; + private RenderingStrategy renderingStrategy = RenderingStrategy.ALL; + private int dpi = 300; + private ImageType imageType = ImageType.GRAY; + private ImageFormat imageFormat = ImageFormat.PNG; + private float imageQuality = 1.0f; + + public Strategy getStrategy() { + return strategy; + } + + public void setStrategy(Strategy strategy) { + this.strategy = strategy; + } + + public StrategyAuto getStrategyAuto() { + return strategyAuto; + } + + public void setStrategyAuto(StrategyAuto strategyAuto) { + this.strategyAuto = strategyAuto; + } + + public RenderingStrategy getRenderingStrategy() { + return renderingStrategy; + } + + public void setRenderingStrategy(RenderingStrategy renderingStrategy) { + this.renderingStrategy = renderingStrategy; + } + + public int getDpi() { + return dpi; + } + + public void setDpi(int dpi) { + this.dpi = dpi; + } + + public ImageType getImageType() { + return imageType; + } + + public void setImageType(ImageType imageType) { + this.imageType = imageType; + } + + public ImageFormat getImageFormat() { + return imageFormat; + } + + public void setImageFormat(ImageFormat imageFormat) { + this.imageFormat = imageFormat; + } + + /** + * @return lowercase format name for use with image writers + */ + public String getImageFormatName() { + return imageFormat.getFormatName(); + } + + public float getImageQuality() { + return imageQuality; + } + + public void setImageQuality(float imageQuality) { + this.imageQuality = imageQuality; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 81e5fa8fe6..99b2446dca 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -232,7 +232,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable { if (shouldHandleXFAOnly(hasXFA, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy() - .equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { + .equals(OcrConfig.Strategy.OCR_ONLY)) { OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig, renderer); } else if (hasMarkedContent && localConfig.isExtractMarkedContent()) { @@ -434,7 +434,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable { return true; } - if (localConfig.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.NO_OCR) { + if (localConfig.getOcrStrategy() == OcrConfig.Strategy.NO_OCR) { return false; } //TODO: test that this is not AUTO with no OCR parser installed @@ -842,38 +842,38 @@ public class PDFParser implements Parser, RenderingParser, Initializable { } @Field - public void setOcrStrategy(PDFParserConfig.OCR_STRATEGY ocrStrategy) { + public void setOcrStrategy(OcrConfig.Strategy ocrStrategy) { defaultConfig.setOcrStrategy(ocrStrategy); } - public PDFParserConfig.OCR_STRATEGY getOcrStrategy() { + public OcrConfig.Strategy getOcrStrategy() { return defaultConfig.getOcrStrategy(); } @Field - public void setOcrStrategyAuto(String ocrStrategyAuto) { - defaultConfig.setOcrStrategyAutoFromString(ocrStrategyAuto); + public void setOcrStrategyAuto(OcrConfig.StrategyAuto ocrStrategyAuto) { + defaultConfig.setOcrStrategyAuto(ocrStrategyAuto); } - public String getOcrStrategyAuto() { - return defaultConfig.getOcrStrategyAuto().toString(); + public OcrConfig.StrategyAuto getOcrStrategyAuto() { + return defaultConfig.getOcrStrategyAuto(); } @Field - public void setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY ocrRenderingStrategy) { + public void setOcrRenderingStrategy(OcrConfig.RenderingStrategy ocrRenderingStrategy) { defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy); } - public PDFParserConfig.OCR_RENDERING_STRATEGY getOcrRenderingStrategy() { + public OcrConfig.RenderingStrategy getOcrRenderingStrategy() { return defaultConfig.getOcrRenderingStrategy(); } @Field - public void setOcrImageType(PDFParserConfig.TikaImageType ocrImageType) { + public void setOcrImageType(OcrConfig.ImageType ocrImageType) { defaultConfig.setOcrImageType(ocrImageType); } - public PDFParserConfig.TikaImageType getOcrImageType() { + public OcrConfig.ImageType getOcrImageType() { return defaultConfig.getOcrImageType(); } @@ -895,8 +895,12 @@ public class PDFParser implements Parser, RenderingParser, Initializable { } @Field - public void setOcrImageFormatName(String formatName) { - defaultConfig.setOcrImageFormatName(formatName); + public void setOcrImageFormat(OcrConfig.ImageFormat imageFormat) { + defaultConfig.setOcrImageFormat(imageFormat); + } + + public OcrConfig.ImageFormat getOcrImageFormat() { + return defaultConfig.getOcrImageFormat(); } public String getOcrImageFormatName() { @@ -976,12 +980,12 @@ public class PDFParser implements Parser, RenderingParser, Initializable { return defaultConfig.isIfXFAExtractOnlyXFA(); } @Field - public void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { - defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility)); + public void setAccessCheckMode(AccessChecker.AccessCheckMode mode) { + defaultConfig.getAccessChecker().setMode(mode); } - public boolean isAllowExtractionForAccessibility() { - return defaultConfig.getAccessChecker().isAllowExtractionForAccessibility(); + public AccessChecker.AccessCheckMode getAccessCheckMode() { + return defaultConfig.getAccessChecker().getMode(); } @Field @@ -1146,7 +1150,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable { //set a default renderer if nothing was defined PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer(); pdfBoxRenderer.setDPI(config.getOcrDPI()); - pdfBoxRenderer.setImageType(config.getOcrImageType().getImageType()); + pdfBoxRenderer.setImageType(config.getOcrImageType().getPdfBoxImageType()); pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName()); this.renderer = pdfBoxRenderer; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index fa64ad9161..6d238d2c33 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -17,11 +17,7 @@ package org.apache.tika.parser.pdf; import java.io.Serializable; -import java.util.Locale; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory; @@ -38,19 +34,6 @@ import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory; */ public class PDFParserConfig implements Serializable { - public enum TikaImageType { - RGB(ImageType.RGB), - GRAY(ImageType.GRAY); - - private ImageType imageType; - TikaImageType(ImageType imageType) { - this.imageType = imageType; - } - public ImageType getImageType() { - return imageType; - } - } - private static final long serialVersionUID = 6492570218190936986L; // True if we let PDFBox "guess" where spaces should go: @@ -110,21 +93,7 @@ public class PDFParserConfig implements Serializable { //content from elsewhere in the document. private boolean ifXFAExtractOnlyXFA = false; - private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO; - - // If OCR_Strategy=AUTO, then this controls the algorithm used - private static final OCRStrategyAuto OCR_STRATEGY_AUTO_BETTER = new OCRStrategyAuto(10, 10); - private static final OCRStrategyAuto OCR_STRATEGY_AUTO_FASTER = new OCRStrategyAuto(.1f, 10); - private static final int OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE = 10; - - private OCRStrategyAuto ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER; - - private OCR_RENDERING_STRATEGY ocrRenderingStrategy = OCR_RENDERING_STRATEGY.ALL; - - private int ocrDPI = 300; - private TikaImageType ocrImageType = TikaImageType.GRAY; - private String ocrImageFormatName = "png"; - private float ocrImageQuality = 1.0f; + private OcrConfig ocr = new OcrConfig(); /** * Should the entire document be rendered? @@ -514,172 +483,108 @@ public class PDFParserConfig implements Serializable { } /** - * @return strategy to use for OCR + * @return the OCR configuration */ - public OCR_STRATEGY getOcrStrategy() { - return ocrStrategy; + public OcrConfig getOcr() { + return ocr; } /** - * @return ocr auto strategy to use when ocr_strategy = Auto + * @param ocr the OCR configuration */ - public OCRStrategyAuto getOcrStrategyAuto() { - return ocrStrategyAuto; + public void setOcr(OcrConfig ocr) { + this.ocr = ocr; } /** - * Which strategy to use for OCR - * - * @param ocrStrategy + * @return strategy to use for OCR */ - public void setOcrStrategy(OCR_STRATEGY ocrStrategy) { - this.ocrStrategy = ocrStrategy; + public OcrConfig.Strategy getOcrStrategy() { + return ocr.getStrategy(); } - /** - * Sets the OCR strategy auto configuration from an object. - * Used by Jackson deserialization. - * - * @param ocrStrategyAuto the OCR strategy auto configuration + * @return ocr auto strategy to use when ocr_strategy = Auto */ - public void setOcrStrategyAuto(OCRStrategyAuto ocrStrategyAuto) { - this.ocrStrategyAuto = ocrStrategyAuto; + public OcrConfig.StrategyAuto getOcrStrategyAuto() { + return ocr.getStrategyAuto(); } /** - * Sets the OCR strategy auto configuration from a string. - * Used for configuration parsing from XML/text via PDFParser's @Field annotation. - * Package-private to prevent Jackson from discovering it during bean introspection. - * - * @param ocrStrategyAuto string representation of OCR strategy - */ - void setOcrStrategyAutoFromString(String ocrStrategyAuto) { - final String regex = "^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$"; - Pattern pattern = Pattern.compile(regex); - Matcher matcher = pattern.matcher(ocrStrategyAuto); - if (matcher.matches()) { - final String group1 = matcher.group(1); - - if ("better".equals(group1)) { - this.ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER; - } else if ("faster".equals(group1)) { - this.ocrStrategyAuto = OCR_STRATEGY_AUTO_FASTER; - } else { - float unmappedUnicodeCharsPerPage = Integer.parseInt(matcher.group(2)); - if (matcher.group(3) != null) { - // If we have the percent sign, then convert - if (unmappedUnicodeCharsPerPage > 100.0) { - throw new IllegalArgumentException - ("Error parsing OCRStrategyAuto - Percent cannot exceed 100%"); - } - unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage / 100f; - } - // The 2nd number is optional. Default to 10 chars per page - int totalCharsPerPage = matcher.group(4) == null - ? OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE - : Integer.parseInt(matcher.group(4)); - this.ocrStrategyAuto = new OCRStrategyAuto(unmappedUnicodeCharsPerPage, totalCharsPerPage); - } + * Which strategy to use for OCR + */ + public void setOcrStrategy(OcrConfig.Strategy ocrStrategy) { + ocr.setStrategy(ocrStrategy); + } - } else { - throw new IllegalArgumentException("Error parsing OCRStrategyAuto - Must be in the form 'num[%], num'"); - } + /** + * Sets the OCR strategy auto configuration. + */ + public void setOcrStrategyAuto(OcrConfig.StrategyAuto ocrStrategyAuto) { + ocr.setStrategyAuto(ocrStrategyAuto); } - public OCR_RENDERING_STRATEGY getOcrRenderingStrategy() { - return ocrRenderingStrategy; + public OcrConfig.RenderingStrategy getOcrRenderingStrategy() { + return ocr.getRenderingStrategy(); } /** * When rendering the page for OCR, do you want to include the rendering of the electronic text, * ALL, or do you only want to run OCR on the images and vector graphics (NO_TEXT)? - * - * @param ocrRenderingStrategy */ - public void setOcrRenderingStrategy(OCR_RENDERING_STRATEGY ocrRenderingStrategy) { - this.ocrRenderingStrategy = ocrRenderingStrategy; + public void setOcrRenderingStrategy(OcrConfig.RenderingStrategy ocrRenderingStrategy) { + ocr.setRenderingStrategy(ocrRenderingStrategy); } /** - * String representation of the image format used to render - * the page image for OCR (examples: png, tiff, jpeg) - * - * @return + * @return lowercase format name (e.g., "png", "tiff", "jpeg") */ public String getOcrImageFormatName() { - return ocrImageFormatName; + return ocr.getImageFormatName(); } - /** - * @param ocrImageFormatName name of image format used to render - * page image - * @see #getOcrImageFormatName() - */ - public void setOcrImageFormatName(String ocrImageFormatName) { - if (!ocrImageFormatName.equals("png") && !ocrImageFormatName.equals("tiff") && - !ocrImageFormatName.equals("jpeg")) { - throw new IllegalArgumentException( - "Available options: png, tiff, jpeg. " + "I'm sorry, but I don't recognize: " + - ocrImageFormatName); - } - this.ocrImageFormatName = ocrImageFormatName; + public OcrConfig.ImageFormat getOcrImageFormat() { + return ocr.getImageFormat(); } - /** - * Image type used to render the page image for OCR. - * - * @return image type - * @see #setOcrImageType(TikaImageType) - */ - public TikaImageType getOcrImageType() { - return ocrImageType; + public void setOcrImageFormat(OcrConfig.ImageFormat ocrImageFormat) { + ocr.setImageFormat(ocrImageFormat); } - /** - * Image type used to render the page image for OCR. - * - * @param ocrImageType - */ - public void setOcrImageType(TikaImageType ocrImageType) { - this.ocrImageType = ocrImageType; + public OcrConfig.ImageType getOcrImageType() { + return ocr.getImageType(); + } + + public void setOcrImageType(OcrConfig.ImageType ocrImageType) { + ocr.setImageType(ocrImageType); } /** - * Dots per inch used to render the page image for OCR - * - * @return dots per inch + * @return dots per inch used to render the page image for OCR */ public int getOcrDPI() { - return ocrDPI; + return ocr.getDpi(); } /** * Dots per inch used to render the page image for OCR. - * This does not apply to all image formats. - * - * @param ocrDPI */ public void setOcrDPI(int ocrDPI) { - this.ocrDPI = ocrDPI; + ocr.setDpi(ocrDPI); } /** - * Image quality used to render the page image for OCR. - * This does not apply to all image formats - * - * @return + * @return image quality used to render the page image for OCR */ public float getOcrImageQuality() { - return ocrImageQuality; + return ocr.getImageQuality(); } /** * Image quality used to render the page image for OCR. - * This does not apply to all image formats */ public void setOcrImageQuality(float ocrImageQuality) { - this.ocrImageQuality = ocrImageQuality; + ocr.setImageQuality(ocrImageQuality); } /** @@ -808,78 +713,6 @@ public class PDFParserConfig implements Serializable { return throwOnEncryptedPayload; } - public enum OCR_STRATEGY { - AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION - } - - /** - * Encapsulate the numbers used to control OCR Strategy when set to auto - * <p> - * If the total characters on the page < this.totalCharsPerPage - * or - * total unmapped unicode characters on the page > this.unmappedUnicodeCharsPerPage - * then we will perform OCR on the page - * <p> - * If unamppedUnicodeCharsPerPage is an integer > 0, then we compare absolute number of characters. - * If it is a float < 1, then we assume it is a percentage and we compare it to the - * percentage of unmappedCharactersPerPage/totalCharsPerPage - */ - public static class OCRStrategyAuto implements Serializable { - private float unmappedUnicodeCharsPerPage; - private int totalCharsPerPage; - - /** - * No-arg constructor for Jackson deserialization. - * Uses default "better" strategy values. - */ - public OCRStrategyAuto() { - this(10, 10); - } - - public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) { - this.totalCharsPerPage = totalCharsPerPage; - this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; - } - - public float getUnmappedUnicodeCharsPerPage() { - return unmappedUnicodeCharsPerPage; - } - - public void setUnmappedUnicodeCharsPerPage(float unmappedUnicodeCharsPerPage) { - this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; - } - - public int getTotalCharsPerPage() { - return totalCharsPerPage; - } - - public void setTotalCharsPerPage(int totalCharsPerPage) { - this.totalCharsPerPage = totalCharsPerPage; - } - - @Override - public String toString() { - //TODO -- figure out if this is actual BEST or whatever - //and return that instead of the literal values - String unmappedString = null; - if (unmappedUnicodeCharsPerPage < 1.0) { - unmappedString = String.format(Locale.US, "%.03f", - unmappedUnicodeCharsPerPage * 100) + "%"; - } else { - unmappedString = String.format(Locale.US, "%.0f", unmappedUnicodeCharsPerPage); - } - return unmappedString + "," + totalCharsPerPage; - } - } - - public enum OCR_RENDERING_STRATEGY { - NO_TEXT, //includes vector graphics and image - TEXT_ONLY, //renders only glyphs - VECTOR_GRAPHICS_ONLY, //renders only vector graphics - ALL - //TODO: add AUTO? - } - public enum IMAGE_STRATEGY { NONE, /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java index c4d3a028f8..7e82263b97 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java @@ -228,7 +228,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable { if (pdfParserConfig == null) { return defaultImageType; } - return pdfParserConfig.getOcrImageType().getImageType(); + return pdfParserConfig.getOcrImageType().getPdfBoxImageType(); } protected String getImageFormatName(ParseContext parseContext) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java index 2335fc16f3..f4b7e75706 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java @@ -28,8 +28,8 @@ import org.apache.tika.metadata.PropertyTypeException; public class AccessCheckerTest { @Test - public void testLegacy() throws AccessPermissionException { - //test that there are no thrown exceptions + public void testDontCheck() throws AccessPermissionException { + //test that there are no thrown exceptions with DONT_CHECK mode Metadata m = getMetadata(false, false); //legacy behavior; don't bother checking AccessChecker checker = new AccessChecker(); @@ -40,14 +40,18 @@ public class AccessCheckerTest { m = getMetadata(true, true); checker.check(m); + + // Explicitly set DONT_CHECK mode + checker = new AccessChecker(AccessChecker.AccessCheckMode.DONT_CHECK); + m = getMetadata(false, false); + checker.check(m); } @Test - public void testNoExtraction() { - + public void testEnforcePermissions() { Metadata m = null; - //allow nothing - AccessChecker checker = new AccessChecker(false); + // ENFORCE_PERMISSIONS - no extraction allowed if blocked + AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ENFORCE_PERMISSIONS); boolean ex = false; try { m = getMetadata(false, false); @@ -62,17 +66,17 @@ public class AccessCheckerTest { m = getMetadata(false, true); checker.check(m); } catch (AccessPermissionException e) { - //but application is not an accessibility application + //but ENFORCE_PERMISSIONS mode doesn't allow it ex = true; } - assertTrue(ex, "correct exception with no extraction, no extract for accessibility"); + assertTrue(ex, "correct exception with no extraction, enforce permissions"); } @Test - public void testExtractOnlyForAccessibility() throws AccessPermissionException { + public void testAllowExtractionForAccessibility() throws AccessPermissionException { Metadata m = getMetadata(false, true); - //allow accessibility - AccessChecker checker = new AccessChecker(true); + // ALLOW_EXTRACTION_FOR_ACCESSIBILITY mode + AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); checker.check(m); assertTrue(true, "no exception"); boolean ex = false; @@ -88,18 +92,17 @@ public class AccessCheckerTest { @Test public void testIllogicalExtractNotForAccessibility() throws AccessPermissionException { Metadata m = getMetadata(true, false); - //allow accessibility - AccessChecker checker = new AccessChecker(true); + // ALLOW_EXTRACTION_FOR_ACCESSIBILITY mode + AccessChecker checker = new AccessChecker(AccessChecker.AccessCheckMode.ALLOW_EXTRACTION_FOR_ACCESSIBILITY); checker.check(m); assertTrue(true, "no exception"); - //don't extract for accessibility - checker = new AccessChecker(false); + // ENFORCE_PERMISSIONS mode + checker = new AccessChecker(AccessChecker.AccessCheckMode.ENFORCE_PERMISSIONS); //if extract content is allowed, the checker shouldn't //check the value of extract for accessibility checker.check(m); assertTrue(true, "no exception"); - } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index b4645386db..b617b77d79 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1132,7 +1132,7 @@ public class PDFParserTest extends TikaTest { //behavior config = new PDFParserConfig(); config.setOcrDPI(10000); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); pc.set(PDFParserConfig.class, config); text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc); text = text.replaceAll("\\s+", " "); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml index 9124c89244..bffe8be380 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml @@ -21,13 +21,13 @@ <parser class="org.apache.tika.parser.pdf.PDFParser"> <params> <param name="extractInlineImages" type="bool">true</param> - <param name="allowExtractionForAccessibility" type="bool">true</param> + <param name="accessCheckMode" type="string">ALLOW_EXTRACTION_FOR_ACCESSIBILITY</param> <param name="catchIntermediateExceptions" type="bool">false</param> <param name="extractUniqueInlineImagesOnly" type="bool">false</param> <param name="catchIntermediateExceptions" type="bool">false</param> <param name="ocrDPI" type="int">314</param> <param name="ocrImageQuality" type="float">2.1</param> - <param name="ocrImageFormatName" type="string">jpeg</param> + <param name="ocrImageFormat" type="string">JPEG</param> <param name="ocrImageScale" type="float">1.3</param> <param name="maxMainMemoryBytes" type="long">524288000</param> <!-- we really should throw an exception for this!! --> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java index 60172383a9..6b7eb3d0b7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -49,8 +49,8 @@ public class TikaConfigSerializerTest { assertContains(detectorNeedle, xml); String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" + - " <params> <param name=\"allowExtractionForAccessibility\" " + - "type=\"bool\">true</param>"; + " <params> <param name=\"accessCheckMode\" " + + "type=\"string\">DONT_CHECK</param>"; assertContains(parserNeedle, xml); //TODO This is still to be implemented -- we do not want to show the default renderer here diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java index 18c1314599..140a82d5f0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java @@ -27,6 +27,7 @@ import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pdf.OcrConfig; import org.apache.tika.parser.pdf.PDFParserConfig; public class TSDParserTest extends TikaTest { @@ -35,7 +36,7 @@ public class TSDParserTest extends TikaTest { public void testBrokenPdf() throws Exception { ParseContext parseContext = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); parseContext.set(PDFParserConfig.class, config); //make sure that embedded file appears in list //and make sure embedded exception is recorded diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index d0aa70519e..d89beadcfd 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -113,7 +113,7 @@ public class PDFParserTest extends TikaTest { private static ParseContext NO_OCR() { PDFParserConfig config = new PDFParserConfig(); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); return context; @@ -230,7 +230,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); context.set(org.apache.tika.parser.Parser.class, p); @@ -260,15 +260,15 @@ public class PDFParserTest extends TikaTest { public void testEmbeddedDocsWithOCROnly() throws Exception { assumeTrue(canRunOCR(), "can't run OCR"); //test default is "auto" - assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new PDFParserConfig().getOcrStrategy()); + assertEquals(OcrConfig.Strategy.AUTO, new PDFParserConfig().getOcrStrategy()); testStrategy(null); //now test other options - for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) { + for (OcrConfig.Strategy strategy : OcrConfig.Strategy.values()) { testStrategy(strategy); } } - private void testStrategy(PDFParserConfig.OCR_STRATEGY strategy) throws Exception { + private void testStrategy(OcrConfig.Strategy strategy) throws Exception { //make sure everything works with regular xml _and_ with recursive ParseContext context = new ParseContext(); if (strategy != null) { @@ -277,7 +277,7 @@ public class PDFParserTest extends TikaTest { context.set(PDFParserConfig.class, config); }; PDFParserConfig config = context.get(PDFParserConfig.class, new PDFParserConfig()); - config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL); + config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL); context.set(PDFParserConfig.class, config); XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context); @@ -289,7 +289,7 @@ public class PDFParserTest extends TikaTest { } assertContains("Haystack", xmlResult.xml); assertContains("Needle", xmlResult.xml); - if (strategy == null || strategy != PDFParserConfig.OCR_STRATEGY.NO_OCR) { + if (strategy == null || strategy != OcrConfig.Strategy.NO_OCR) { // Tesseract may see the t in haystack as a ! some times... //or it might see dehayslack... //TODO: figure out how to make this test less hacky @@ -328,7 +328,7 @@ public class PDFParserTest extends TikaTest { //TIKA-1990, test that an embedded jpeg is correctly decoded PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); @@ -349,7 +349,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); context.set(PDFParserConfig.class, config); @@ -376,7 +376,7 @@ public class PDFParserTest extends TikaTest { public void testJBIG2OCROnly() throws Exception { assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); + config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); //make sure everything works with regular xml _and_ with recursive @@ -388,7 +388,7 @@ public class PDFParserTest extends TikaTest { public void testJPEG2000() throws Exception { assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); + config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); //make sure everything works with regular xml _and_ with recursive @@ -404,13 +404,13 @@ public class PDFParserTest extends TikaTest { assertContains("Happy New Year", getXML("testOCR.pdf").xml); PDFParserConfig config = new PDFParserConfig(); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO); + config.setOcrStrategy(OcrConfig.Strategy.AUTO); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); XMLResult xmlResult = getXML("testOCR.pdf", context); assertContains("Happy New Year", xmlResult.xml); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + config.setOcrStrategy(OcrConfig.Strategy.NO_OCR); String txt = getText("testOCR.pdf", new Metadata(), context); assertEquals("", txt.trim()); } @@ -419,16 +419,16 @@ public class PDFParserTest extends TikaTest { public void testOCRNoText() throws Exception { assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); - config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); + config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL); + config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY); ParseContext parseContext = new ParseContext(); parseContext.set(PDFParserConfig.class, config); XMLResult xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext); assertContains("PARK", xmlResult.xml); assertContains("Applications", xmlResult.xml); - config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); + config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.NO_TEXT); + config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY); parseContext.set(PDFParserConfig.class, config); xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext); assertContains("NATIONAL", xmlResult.xml); @@ -583,7 +583,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); config.setSortByPosition(true); config.setExtractInlineImages(true); - config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO); + config.setOcrStrategy(OcrConfig.Strategy.AUTO); ParseContext parseContext = new ParseContext(); parseContext.set(PDFParserConfig.class, config); @@ -607,7 +607,7 @@ public class PDFParserTest extends TikaTest { "sortByPosition should be preserved"); assertTrue(deserializedConfig.isExtractInlineImages(), "extractInlineImages should be preserved"); - assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, deserializedConfig.getOcrStrategy(), + assertEquals(OcrConfig.Strategy.AUTO, deserializedConfig.getOcrStrategy(), "ocrStrategy should be preserved"); } @@ -661,9 +661,9 @@ public class PDFParserTest extends TikaTest { .get(MediaType.application("pdf")); assertEquals("org.apache.tika.parser.pdf.PDFParser", pdfParser.getClass().getName()); - assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, + assertEquals(OcrConfig.Strategy.OCR_ONLY, ((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy()); - assertEquals(PDFParserConfig.TikaImageType.RGB, + assertEquals(OcrConfig.ImageType.RGB, ((PDFParser) pdfParser).getPDFParserConfig().getOcrImageType()); }
