tballison commented on code in PR #2769: URL: https://github.com/apache/tika/pull/2769#discussion_r3096293143
########## tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java: ########## @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ocrencode; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.codec.binary.Base64InputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.ParentContentHandler; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractExternalProcessParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Parser that base64-encodes image content instead of performing OCR + * text extraction. This is useful when you need to preserve the original + * image data in the parsed output for downstream processing by an + * external service. + * <p> + * To configure this parser, pass an {@link EncodeOCRConfig} object + * through the ParseContext, or configure it via tika-config.xml/json. + */ +@TikaComponent(spi = false) +public class EncodeOCRParser + extends AbstractExternalProcessParser + implements Initializable { + + private static final String OCR = "ocr-"; + private static final Logger LOG = LoggerFactory.getLogger( + EncodeOCRParser.class); + private static final Object[] LOCK = new Object[0]; + private static final long serialVersionUID = -8167538283213097266L; + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + MediaType.image(OCR + "png"), + MediaType.image(OCR + "jpeg"), + MediaType.image(OCR + "tiff"), + MediaType.image(OCR + "bmp"), + MediaType.image(OCR + "gif"), + // these are not currently covered by other parsers + MediaType.image("jp2"), + MediaType.image("jpx"), + MediaType.image("x-portable-pixmap"), + // add the ocr- versions as well + MediaType.image(OCR + "jp2"), + MediaType.image(OCR + "jpx"), + MediaType.image(OCR + "x-portable-pixmap") + ))); + private static volatile boolean hasWarned = false; + + private EncodeOCRConfig defaultConfig = new EncodeOCRConfig(); + + public EncodeOCRParser() { + } + + public EncodeOCRParser(EncodeOCRConfig config) { + this.defaultConfig = config; + } + + /** + * Constructor for JSON configuration. + * Requires Jackson on the classpath. + * + * @param jsonConfig JSON configuration + */ + public EncodeOCRParser(JsonConfig jsonConfig) { + this(ConfigDeserializer.buildConfig( + jsonConfig, EncodeOCRConfig.class)); + } + + @Override + public void initialize() throws TikaConfigException { + //no-op + } + + public void checkInitialization() throws TikaConfigException { + //no-op + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + EncodeOCRConfig userConfig = context.get(EncodeOCRConfig.class); + EncodeOCRConfig config = defaultConfig; + if (userConfig != null) { + try { + config = defaultConfig.cloneAndUpdate(userConfig); + } catch (TikaException e) { + LOG.warn("Failed to merge config, using default", e); + } + } + if (config.isSkipOcr()) { + return Collections.emptySet(); + } + return SUPPORTED_TYPES; + } + + @Override + public void parse( + TikaInputStream tis, + ContentHandler handler, + Metadata metadata, + ParseContext parseContext + ) throws IOException, SAXException, TikaException { + normalizeOCRMimeMetadata(metadata); + + ParseContext workingContext = + parseContext != null ? parseContext : new ParseContext(); + + EncodeOCRConfig userConfig = workingContext.get( + EncodeOCRConfig.class); + EncodeOCRConfig config = defaultConfig; + if (userConfig != null) { + config = defaultConfig.cloneAndUpdate(userConfig); + } + + if (config != null && config.isSkipOcr()) { + return; + } + + try (TemporaryResources tmp = new TemporaryResources()) { + TikaInputStream tikaStream = TikaInputStream.get( Review Comment: Not sure you need to do this? Given that you're just reading once, right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
