This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3891 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9d3c171a8cc7b70c75438b380c739b646a511adb Author: tallison <[email protected]> AuthorDate: Fri Oct 21 14:14:59 2022 -0400 TIKA-3891 -- mvp for serializing parameters --- .../apache/tika/config/TikaConfigSerializer.java | 315 ++++++++++++++++++++- .../org/apache/tika/parser/mp3/ID3v2Frame.java | 4 + .../java/org/apache/tika/parser/mp3/Mp3Parser.java | 3 + .../apache/tika/parser/dwg/AbstractDWGParser.java | 14 +- .../org/apache/tika/parser/image/BPGParser.java | 4 + .../org/apache/tika/parser/image/PSDParser.java | 4 + .../parser/microsoft/AbstractOfficeParser.java | 26 +- .../tika/parser/microsoft/OfficeParserConfig.java | 9 + .../tika/parser/microsoft/rtf/RTFParser.java | 2 +- .../tika/parser/odf/FlatOpenDocumentParser.java | 3 + .../apache/tika/parser/odf/OpenDocumentParser.java | 4 + .../tika/parser/wordperfect/WordPerfectParser.java | 4 + .../apache/tika/parser/ocr/TesseractOCRParser.java | 68 +++++ .../org/apache/tika/parser/pdf/AccessChecker.java | 18 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 141 +++++++-- .../apache/tika/parser/pdf/PDFParserConfig.java | 14 + .../apache/tika/parser/pkg/CompressorParser.java | 4 + .../org/apache/tika/parser/pkg/PackageParser.java | 4 + .../tika/parser/txt/Icu4jEncodingDetector.java | 13 + .../detect/zip/DefaultZipContainerDetector.java | 4 + .../tika-parsers-standard-package/pom.xml | 14 + .../tika/config/TikaConfigSerializerTest.java | 92 ++++++ .../configs/tika-config-tesseract-arbitrary.xml | 30 ++ 23 files changed, 749 insertions(+), 45 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java index 12aa4bc70..ef565994b 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java @@ -17,18 +17,31 @@ package org.apache.tika.config; import java.io.Writer; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; import java.nio.charset.Charset; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ExecutorService; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -48,10 +61,30 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.multiple.AbstractMultipleParser; +import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; public class TikaConfigSerializer { + private static final Logger LOG = LoggerFactory.getLogger(TikaConfigSerializer.class); + private static Map<Class, String> PRIMITIVES = new HashMap<>(); + + static { + PRIMITIVES.put(Integer.class, "int"); + PRIMITIVES.put(int.class, "int"); + PRIMITIVES.put(String.class, "string"); + PRIMITIVES.put(Boolean.class, "bool"); + PRIMITIVES.put(boolean.class, "bool"); + PRIMITIVES.put(Float.class, "float"); + PRIMITIVES.put(float.class, "float"); + PRIMITIVES.put(Double.class, "double"); + PRIMITIVES.put(double.class, "double"); + PRIMITIVES.put(Long.class, "long"); + PRIMITIVES.put(long.class, "long"); + PRIMITIVES.put(Map.class, "map"); + PRIMITIVES.put(List.class, "list"); + } + /** * @param config config to serialize * @param mode serialization mode @@ -171,6 +204,8 @@ public class TikaConfigSerializer { for (EncodingDetector d : children) { Element encDetectorElement = doc.createElement("encodingDetector"); encDetectorElement.setAttribute("class", d.getClass().getCanonicalName()); + serializeParams(doc, encDetectorElement, d); + encDetectorsElement.appendChild(encDetectorElement); } } @@ -200,6 +235,7 @@ public class TikaConfigSerializer { for (Detector d : children) { Element detectorElement = doc.createElement("detector"); detectorElement.setAttribute("class", d.getClass().getCanonicalName()); + serializeParams(doc, detectorElement, d); detectorsElement.appendChild(detectorElement); } } @@ -287,16 +323,7 @@ public class TikaConfigSerializer { parserElement.setAttribute("class", className); rootElement.appendChild(parserElement); - // TODO Output configurable parameters in a genric way, see TIKA-1508 - if (parser instanceof AbstractMultipleParser) { - Element paramsElement = doc.createElement("params"); - Element paramElement = doc.createElement("param"); - paramElement.setAttribute("name", "metadataPolicy"); - paramElement.setAttribute("value", - ((AbstractMultipleParser) parser).getMetadataPolicy().toString()); - paramsElement.appendChild(paramElement); - parserElement.appendChild(paramsElement); - } + serializeParams(doc, parserElement, parser); for (MediaType type : addedTypes) { Element mimeElement = doc.createElement("mime"); @@ -312,6 +339,274 @@ public class TikaConfigSerializer { return parserElement; } + public static void serializeParams(Document doc, Element element, Object object) { + Matcher setterMatcher = Pattern.compile("\\Aset([A-Z].*)").matcher(""); + Matcher getterMatcher = Pattern.compile("\\A(?:get|is)([A-Z].+)\\Z").matcher(""); + + //TODO -- check code base for setters with lowercase initial letters?! + MethodTuples nonPrimitiveSetters = new MethodTuples(); + MethodTuples primitiveSetters = new MethodTuples(); + MethodTuples nonPrimitiveGetters = new MethodTuples(); + MethodTuples primitiveGetters = new MethodTuples(); + for (Method method : object.getClass().getDeclaredMethods()) { + Class[] parameterTypes = method.getParameterTypes(); + + if (setterMatcher.reset(method.getName()).find()) { + if (!Modifier.isPublic(method.getModifiers())) { + //we could just call getMethods, but this can be helpful debugging inf + LOG.trace("inaccessible setter: {} in {}", method.getName(), object.getClass()); + continue; + } + //require @Field on setters + if (method.getAnnotation(Field.class) == null) { + // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass()); + continue; + } + if (parameterTypes.length != 1) { + //TODO -- check code base for setX() zero parameters that set boolean to true + LOG.warn("setter with wrong number of params " + method.getName() + " " + parameterTypes.length); + continue; + } + String paramName = methodToParamName(setterMatcher.group(1)); + if (PRIMITIVES.containsKey(parameterTypes[0])) { + primitiveSetters.add(new MethodTuple(paramName, method, parameterTypes[0])); + } else { + nonPrimitiveSetters.add(new MethodTuple(paramName, method, parameterTypes[0])); + } + } else if (getterMatcher.reset(method.getName()).find()) { + if (parameterTypes.length != 0) { + //require 0 parameters for the getter + continue; + } + String paramName = methodToParamName(getterMatcher.group(1)); + if (PRIMITIVES.containsKey(method.getReturnType())) { + primitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType())); + } else { + nonPrimitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType())); + } + + } + } + + //TODO -- remove nonprimitive setters/getters that have a string equivalent + serializePrimitives(doc, element, object, primitiveSetters, primitiveGetters); + serializeNonPrimitives(doc, element, object, nonPrimitiveSetters, nonPrimitiveGetters); + + } + + private static String methodToParamName(String name) { + if (StringUtils.isBlank(name)) { + return name; + } + return name.substring(0, 1).toLowerCase(Locale.US) + name.substring(1); + + } + + private static void serializeNonPrimitives(Document doc, Element element, + Object object, + MethodTuples setterTuples, + MethodTuples getterTuples) { + + for (Map.Entry<String, Set<MethodTuple>> e : setterTuples.tuples.entrySet()) { + Set<MethodTuple> getters = getterTuples.tuples.get(e.getKey()); + processNonPrimitive(e.getKey(), e.getValue(), getters, doc, element, object); + if (!getterTuples.tuples.containsKey(e.getKey())) { + LOG.warn("no getter for setter non-primitive: {} in {}", e.getKey(), + object.getClass()); + continue; + } + } + } + + private static void processNonPrimitive(String name, Set<MethodTuple> setters, + Set<MethodTuple> getters, Document doc, Element element, + Object object) { + for (MethodTuple setter : setters) { + for (MethodTuple getter : getters) { + if (setter.singleParam.equals(getter.singleParam)) { + serializeObject(name, doc, element, setter, getter, object); + return; + } + } + } + } + + private static void serializeObject(String name, Document doc, Element element, + MethodTuple setter, + MethodTuple getter, Object object) { + + Object item = null; + try { + item = getter.method.invoke(object); + } catch (IllegalAccessException | InvocationTargetException e) { + LOG.warn("couldn't get " + name + " on " + object.getClass(), e); + return; + } + if (item == null) { + LOG.warn("Getter {} on {} returned null", getter.name, object.getClass()); + } + Element entry = doc.createElement(name); + entry.setAttribute("class", item.getClass().getCanonicalName()); + element.appendChild(entry); + serializeParams(doc, element, item); + } + + private static void serializePrimitives(Document doc, Element root, + Object object, + MethodTuples setterTuples, MethodTuples getterTuples) { + + Element params = null; + for (Map.Entry<String, Set<MethodTuple>> e : setterTuples.tuples.entrySet()) { + if (!getterTuples.tuples.containsKey(e.getKey())) { + LOG.info("no getter for setter: {} in {}", e.getKey(), object.getClass()); + continue; + } + Set<MethodTuple> getters = getterTuples.tuples.get(e.getKey()); + Set<MethodTuple> setters = e.getValue(); + MethodTuple getterTuple = null; + for (MethodTuple getterCandidate : getters) { + for (MethodTuple setter : setters) { + if (getterCandidate.singleParam.equals(setter.singleParam)) { + getterTuple = getterCandidate; + break; + } + } + } + + if (getterTuple == null) { + LOG.debug("Could not find getter to match setter for: {}", e.getKey()); + continue; + } + Object value = null; + try { + value = getterTuple.method.invoke(object); + } catch (IllegalAccessException ex) { + LOG.error("couldn't invoke " + getterTuple, ex); + continue; + } catch (InvocationTargetException ex) { + LOG.error("couldn't invoke " + getterTuple, ex); + continue; + } + if (value == null) { + LOG.debug("null value: {} in {}", getterTuple.name, object.getClass()); + } + String valString = (value == null) ? "" : value.toString(); + Element param = doc.createElement("param"); + param.setAttribute("name", getterTuple.name); + param.setAttribute("type", PRIMITIVES.get(getterTuple.singleParam)); + if (List.class.isAssignableFrom(getterTuple.singleParam)) { + //this outputs even empty list elements, which I think is good. + addList(param, doc, getterTuple, (List<String>) value); + } else if (Map.class.isAssignableFrom(getterTuple.singleParam)) { + //this outputs even empty lists, which I think is good. + addMap(param, doc, getterTuple, (Map<String, String>) value); + } else { + param.setTextContent(valString); + } + if (params == null) { + params = doc.createElement("params"); + root.appendChild(params); + } + params.appendChild(param); + } + } + + private static void addMap(Element param, Document doc, MethodTuple getterTuple, + Map<String, String> object) { + for (Map.Entry<String, String> e : new TreeMap<String, String>(object).entrySet()) { + Element element = doc.createElement("string"); + element.setAttribute("key", e.getKey()); + element.setAttribute("value", e.getValue()); + param.appendChild(element); + } + + } + + private static void addList(Element param, Document doc, MethodTuple getterTuple, + List<String> list) { + for (String s : list) { + Element element = doc.createElement("string"); + element.setTextContent(s); + param.appendChild(element); + } + } + + private static Method findGetter(MethodTuple setter, Object object) { + Matcher m = Pattern.compile("\\A(?:get|is)([A-Z].+)\\Z").matcher(""); + for (Method method : object.getClass().getMethods()) { + if (object.getClass().getName().contains("PDF")) { + System.out.println(method.getName()); + } + if (m.reset(method.getName()).find()) { + if (object.getClass().getName().contains("PDF")) { + System.out.println("2: " + method.getName()); + } + String paramName = m.group(1); + if (setter.name.equals(paramName)) { + Class returnType = method.getReturnType(); + if (setter.singleParam.equals(returnType)) { + return method; + } + } + } + } + return null; + } + + private static MethodTuple pickBestSetter(Set<MethodTuple> tuples) { + //TODO -- if both string and integer, which one do we pick? + //stub for now -- just pick the first + for (MethodTuple t : tuples) { + return t; + } + return null; + } + + private static class MethodTuples { + Map<String, Set<MethodTuple>> tuples = new TreeMap<>(); + + public void add(MethodTuple tuple) { + Set<MethodTuple> set = tuples.get(tuple.name); + if (set == null) { + set = new HashSet<>(); + tuples.put(tuple.name, set); + } + set.add(tuple); + } + + public int getSize() { + return tuples.size(); + } + } + private static class MethodTuple { + String name; + Method method; + Class singleParam; + + public MethodTuple(String name, Method method, Class singleParam) { + this.name = name; + this.method = method; + this.singleParam = singleParam; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + MethodTuple that = (MethodTuple) o; + return name.equals(that.name) && method.equals(that.method) && + singleParam.equals(that.singleParam); + } + + @Override + public int hashCode() { + return Objects.hash(name, method, singleParam); + } + } public enum Mode { /** * Minimal version of the config, defaults where possible diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java index 6f39a28fb..1dddd1410 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java @@ -312,6 +312,10 @@ public class ID3v2Frame implements MP3Frame { return new String(data, offset, length, ISO_8859_1); } + public static int getMaxRecordSize() { + return MAX_RECORD_SIZE; + } + public int getMajorVersion() { return majorVersion; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java index 826e74840..55e5ac69b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java @@ -254,6 +254,9 @@ public class Mp3Parser extends AbstractParser { ID3v2Frame.setMaxRecordSize(maxRecordSize); } + public int getMaxRecordSize() { + return ID3v2Frame.getMaxRecordSize(); + } protected static class ID3TagsAndAudio { private ID3Tags[] tags; private AudioFrame audio; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java index 934ec5cba..938ef9db0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java @@ -39,7 +39,7 @@ public abstract class AbstractDWGParser extends AbstractParser { } - String getDwgReadExecutable() { + public String getDwgReadExecutable() { return defaultDwgParserConfig.getDwgReadExecutable(); } @@ -48,7 +48,7 @@ public abstract class AbstractDWGParser extends AbstractParser { defaultDwgParserConfig.setDwgReadExecutable(dwgReadExecutable); } - boolean isCleanDwgReadOutput() { + public boolean isCleanDwgReadOutput() { return defaultDwgParserConfig.isCleanDwgReadOutput(); } @@ -56,8 +56,8 @@ public abstract class AbstractDWGParser extends AbstractParser { public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) { defaultDwgParserConfig.setCleanDwgReadOutput(cleanDwgReadOutput); } - - int getCleanDwgReadOutputBatchSize() { + + public int getCleanDwgReadOutputBatchSize() { return defaultDwgParserConfig.getCleanDwgReadOutputBatchSize(); } @@ -65,7 +65,7 @@ public abstract class AbstractDWGParser extends AbstractParser { public void setCleanDwgReadOutputBatchSize(int cleanDwgReadOutputBatchSize) { defaultDwgParserConfig.setCleanDwgReadOutputBatchSize(cleanDwgReadOutputBatchSize); } - String getCleanDwgReadRegexToReplace() { + public String getCleanDwgReadRegexToReplace() { return defaultDwgParserConfig.getCleanDwgReadRegexToReplace(); } @@ -73,7 +73,7 @@ public abstract class AbstractDWGParser extends AbstractParser { public void setCleanDwgReadRegexToReplace(String cleanDwgReadRegexToReplace) { defaultDwgParserConfig.setCleanDwgReadRegexToReplace(cleanDwgReadRegexToReplace); } - String getCleanDwgReadReplaceWith() { + public String getCleanDwgReadReplaceWith() { return defaultDwgParserConfig.getCleanDwgReadReplaceWith(); } @@ -81,7 +81,7 @@ public abstract class AbstractDWGParser extends AbstractParser { public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) { defaultDwgParserConfig.setCleanDwgReadReplaceWith(cleanDwgReadReplaceWith); } - long getDwgReadTimeout() { + public long getDwgReadTimeout() { return defaultDwgParserConfig.getDwgReadTimeout(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java index a15479671..1b7bb96d8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java @@ -182,6 +182,10 @@ public class BPGParser extends AbstractImageParser { this.maxRecordLength = maxRecordLength; } + public int getMaxRecordLength() { + return this.maxRecordLength; + } + protected void handleXMP(InputStream stream, int xmpLength, ImageMetadataExtractor extractor) throws IOException, TikaException, SAXException { if (xmpLength < 0) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java index a342f303d..b7222239e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java @@ -165,6 +165,10 @@ public class PSDParser extends AbstractParser { this.maxDataLengthBytes = maxDataLengthBytes; } + public int getMaxDataLengthBytes() { + return maxDataLengthBytes; + } + private static class ResourceBlock { private static final long SIGNATURE = 0x3842494d; // 8BIM private static final int ID_CAPTION = 0x03F0; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java index 461346f0f..254a1a051 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java @@ -99,18 +99,26 @@ public abstract class AbstractOfficeParser extends AbstractParser { defaultOfficeParserConfig.setIncludeShapeBasedContent(includeShapeBasedContent); } + public boolean isIncludeShapeBasedContent() { + return defaultOfficeParserConfig.isIncludeShapeBasedContent(); + } + @Field public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) { defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor); } + public boolean isUseSAXPptxExtractor() { + return defaultOfficeParserConfig.isUseSAXPptxExtractor(); + } + @Field public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) { defaultOfficeParserConfig.setConcatenatePhoneticRuns(concatenatePhoneticRuns); } - void getConcatenatePhoneticRuns() { - defaultOfficeParserConfig.isConcatenatePhoneticRuns(); + public boolean isConcatenatePhoneticRuns() { + return defaultOfficeParserConfig.isConcatenatePhoneticRuns(); } public boolean isExtractAllAlternativesFromMSG() { @@ -143,6 +151,12 @@ public abstract class AbstractOfficeParser extends AbstractParser { @Field public void setByteArrayMaxOverride(int maxOverride) { IOUtils.setByteArrayMaxOverride(maxOverride); + //required for serialization + defaultOfficeParserConfig.setMaxOverride(maxOverride); + } + + public int getByteArrayMaxOverride() { + return defaultOfficeParserConfig.getMaxOverride(); } @Field @@ -150,8 +164,16 @@ public abstract class AbstractOfficeParser extends AbstractParser { defaultOfficeParserConfig.setDateOverrideFormat(format); } + public String getDateFormatOverride() { + return defaultOfficeParserConfig.getDateFormatOverride(); + } + @Field public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) { defaultOfficeParserConfig.setIncludeHeadersAndFooters(includeHeadersAndFooters); } + + public boolean isIncludeHeadersAndFooters() { + return defaultOfficeParserConfig.isIncludeHeadersAndFooters(); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index b81c46c13..fccebba46 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -37,6 +37,7 @@ public class OfficeParserConfig implements Serializable { private boolean extractAllAlternativesFromMSG; private String dateOverrideFormat = null; + private int maxOverride; /** * @return whether or not to extract macros @@ -267,6 +268,14 @@ public class OfficeParserConfig implements Serializable { public void setDateOverrideFormat(String format) { this.dateOverrideFormat = format; } + + public void setMaxOverride(int maxOverride) { + this.maxOverride = maxOverride; + } + + public int getMaxOverride() { + return this.maxOverride; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java index 315ea5392..50c77ca91 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java @@ -103,7 +103,7 @@ public class RTFParser extends AbstractParser { } } - private int getMemoryLimitInKb() { + public int getMemoryLimitInKb() { //there's a race condition here, but it shouldn't matter. if (USE_STATIC) { if (EMB_OBJ_MAX_BYTES < 0) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java index bab318233..e943bc9a9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java @@ -89,6 +89,9 @@ public class FlatOpenDocumentParser extends AbstractParser { this.extractMacros = extractMacros; } + public boolean isExtractMacros() { + return extractMacros; + } private ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { return new FlatOpenDocumentParserHandler(handler, metadata, context, extractMacros); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java index e5d51a80c..3886af599 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -191,6 +191,10 @@ public class OpenDocumentParser extends AbstractParser { this.extractMacros = extractMacros; } + public boolean isExtractMacros() { + return extractMacros; + } + private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler, EmbeddedDocumentUtil embeddedDocumentUtil) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java index e1633013d..6d20ce811 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WordPerfectParser.java @@ -151,4 +151,8 @@ public class WordPerfectParser extends AbstractParser { public void setIncludeDeletedContent(boolean includeDeletedContent) { this.includeDeletedContent = includeDeletedContent; } + + public boolean isIncludeDeletedContent() { + return includeDeletedContent; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index cb64e39b5..f0b21420a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -42,6 +42,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -663,31 +664,59 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements } } + public List<String> getOtherTesseractSettings() { + List<String> settings = new ArrayList<>(); + Map<String, String> sorted = new TreeMap<>(defaultConfig.getOtherTesseractConfig()); + for (Map.Entry<String, String> e :sorted.entrySet()) { + settings.add(e.getKey() + " " + e.getValue()); + } + return settings; + } + @Field public void setSkipOCR(boolean skipOCR) { defaultConfig.setSkipOcr(skipOCR); } + public boolean isSkipOCR() { + return defaultConfig.isSkipOcr(); + } + @Field public void setLanguage(String language) { defaultConfig.setLanguage(language); } + public String getLanguage() { + return defaultConfig.getLanguage(); + } + @Field public void setPageSegMode(String pageSegMode) { defaultConfig.setPageSegMode(pageSegMode); } + public String getPageSegMode() { + return defaultConfig.getPageSegMode(); + } @Field public void setMaxFileSizeToOcr(long maxFileSizeToOcr) { defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr); } + public long getMaxFileSizeToOcr() { + return defaultConfig.getMaxFileSizeToOcr(); + } + @Field public void setMinFileSizeToOcr(long minFileSizeToOcr) { defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr); } + public long getMinFileSizeToOcr() { + return defaultConfig.getMinFileSizeToOcr(); + } + /** * Set default timeout in seconds. This can be overridden per parse * with {@link TikaTaskTimeout} sent in via the {@link ParseContext} @@ -700,51 +729,87 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements defaultConfig.setTimeoutSeconds(timeout); } + public int getTimeout() { + return defaultConfig.getTimeoutSeconds(); + } + @Field public void setOutputType(String outputType) { defaultConfig.setOutputType(outputType); } + public String getOutputType() { + return defaultConfig.getOutputType().name(); + } + @Field public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) { defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing); } + public boolean isPreserveInterwordSpacing() { + return defaultConfig.isPreserveInterwordSpacing(); + } + @Field public void setEnableImagePreprocessing(boolean enableImagePreprocessing) { defaultConfig.setEnableImagePreprocessing(enableImagePreprocessing); } + public boolean isEnableImagePreprocessing() { + return defaultConfig.isEnableImagePreprocessing(); + } @Field public void setDensity(int density) { defaultConfig.setDensity(density); } + public int getDensity() { + return defaultConfig.getDensity(); + } + @Field public void setDepth(int depth) { defaultConfig.setDepth(depth); } + public int getDepth() { + return defaultConfig.getDepth(); + } @Field public void setColorspace(String colorspace) { defaultConfig.setColorspace(colorspace); } + public String getColorspace() { + return defaultConfig.getColorspace(); + } @Field public void setFilter(String filter) { defaultConfig.setFilter(filter); } + public String getFilter() { + return defaultConfig.getFilter(); + } + @Field public void setResize(int resize) { defaultConfig.setResize(resize); } + public int getResize() { + return defaultConfig.getResize(); + } + @Field public void setApplyRotation(boolean applyRotation) { defaultConfig.setApplyRotation(applyRotation); } + public boolean isApplyRotation() { + return defaultConfig.isApplyRotation(); + } /** * If set to <code>true</code> and if tesseract is found, this will load the * langs that result from --list-langs. At parse time, the @@ -763,6 +828,9 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements this.preloadLangs = preloadLangs; } + public boolean isPreloadLangs() { + return this.preloadLangs; + } public TesseractOCRConfig getDefaultConfig() { return defaultConfig; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java index 7abe6af06..2527f7388 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -32,7 +32,7 @@ public class AccessChecker implements Serializable { private static final long serialVersionUID = 6492570218190936986L; private final boolean needToCheck; - private final boolean allowAccessibility; + private final boolean allowExtractionForAccessibility; /** * This constructs an {@link AccessChecker} that @@ -43,7 +43,7 @@ public class AccessChecker implements Serializable { */ public AccessChecker() { needToCheck = false; - allowAccessibility = true; + allowExtractionForAccessibility = true; } /** @@ -55,12 +55,16 @@ public class AccessChecker implements Serializable { */ public AccessChecker(boolean allowExtractionForAccessibility) { needToCheck = true; - this.allowAccessibility = allowExtractionForAccessibility; + this.allowExtractionForAccessibility = allowExtractionForAccessibility; + } + + public boolean isAllowExtractionForAccessibility() { + return allowExtractionForAccessibility; } /** * Checks to see if a document's content should be extracted based - * on metadata values and the value of {@link #allowAccessibility} in the constructor. + * on metadata values and the value of {@link #allowExtractionForAccessibility} in the constructor. * * @param metadata * @throws AccessPermissionException if access is not permitted @@ -70,7 +74,7 @@ public class AccessChecker implements Serializable { return; } if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { - if (allowAccessibility) { + if (allowExtractionForAccessibility) { if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { return; } @@ -95,14 +99,14 @@ public class AccessChecker implements Serializable { if (needToCheck != checker.needToCheck) { return false; } - return allowAccessibility == checker.allowAccessibility; + return allowExtractionForAccessibility == checker.allowExtractionForAccessibility; } @Override public int hashCode() { int result = (needToCheck ? 1 : 0); - result = 31 * result + (allowAccessibility ? 1 : 0); + result = 31 * result + (allowExtractionForAccessibility ? 1 : 0); return result; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index f5d11a1a3..ae5c03aaf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -632,122 +632,214 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia defaultConfig.setOcrStrategy(ocrStrategyString); } + public String getOcrStrategy() { + return defaultConfig.getOcrStrategy().name(); + } + @Field public void setOcrStrategyAuto(String ocrStrategyAuto) { defaultConfig.setOcrStrategyAuto(ocrStrategyAuto); } + public String getOcrStrategyAuto() { + return defaultConfig.getOcrStrategyAuto().toString(); + } + @Field public void setOcrRenderingStrategy(String ocrRenderingStrategy) { defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy); } + public String getOcrRenderingStrategy() { + return defaultConfig.getOcrRenderingStrategy().name(); + } + @Field public void setOcrImageType(String imageType) { defaultConfig.setOcrImageType(imageType); } + public String getOcrImageType() { + return defaultConfig.getOcrImageType().name(); + } + @Field - void setOcrDPI(int dpi) { + public void setOcrDPI(int dpi) { defaultConfig.setOcrDPI(dpi); } + public int getOcrDPI() { + return defaultConfig.getOcrDPI(); + } @Field - void setOcrImageQuality(float imageQuality) { + public void setOcrImageQuality(float imageQuality) { defaultConfig.setOcrImageQuality(imageQuality); } + public float getOcrImageQuality() { + return defaultConfig.getOcrImageQuality(); + } + @Field - void setOcrImageFormatName(String formatName) { + public void setOcrImageFormatName(String formatName) { defaultConfig.setOcrImageFormatName(formatName); } + public String getOcrImageFormatName() { + return defaultConfig.getOcrImageFormatName(); + } + @Field - void setExtractBookmarksText(boolean extractBookmarksText) { + public void setExtractBookmarksText(boolean extractBookmarksText) { defaultConfig.setExtractBookmarksText(extractBookmarksText); } + public boolean isExtractBookmarksText() { + return defaultConfig.isExtractBookmarksText(); + } + @Field - void setExtractInlineImages(boolean extractInlineImages) { + public void setExtractInlineImages(boolean extractInlineImages) { defaultConfig.setExtractInlineImages(extractInlineImages); } + public boolean isExtractInlineImages() { + return defaultConfig.isExtractInlineImages(); + } + @Field - void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) { + public void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) { defaultConfig.setExtractInlineImageMetadataOnly(extractInlineImageMetadataOnly); } + public boolean isExtractInlineImageMetadataOnly() { + return defaultConfig.isExtractInlineImageMetadataOnly(); + } + @Field - void setAverageCharTolerance(float averageCharTolerance) { + public void setAverageCharTolerance(float averageCharTolerance) { defaultConfig.setAverageCharTolerance(averageCharTolerance); } + public float getAverageCharTolerance() { + return defaultConfig.getAverageCharTolerance(); + } + @Field - void setSpacingTolerance(float spacingTolerance) { + public void setSpacingTolerance(float spacingTolerance) { defaultConfig.setSpacingTolerance(spacingTolerance); } + public float getSpacingTolerance() { + return defaultConfig.getSpacingTolerance(); + } + @Field - void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) { + public void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) { defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions); } + public boolean isCatchIntermediateExceptions() { + return defaultConfig.isCatchIntermediateIOExceptions(); + } + @Field - void setExtractAcroFormContent(boolean extractAcroFormContent) { + public void setExtractAcroFormContent(boolean extractAcroFormContent) { defaultConfig.setExtractAcroFormContent(extractAcroFormContent); } + public boolean isExtractAcroFormContent() { + return defaultConfig.isExtractAcroFormContent(); + }; + @Field - void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { + public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { defaultConfig.setIfXFAExtractOnlyXFA(ifXFAExtractOnlyXFA); } + public boolean isIfXFAExtractOnlyXFA() { + return defaultConfig.isIfXFAExtractOnlyXFA(); + } @Field - void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { + public void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility)); } + public boolean isAllowExtractionForAccessibility() { + return defaultConfig.getAccessChecker().isAllowExtractionForAccessibility(); + } + @Field - void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { + public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly); } + public boolean isExtractUniqueInlineImagesOnly() { + return defaultConfig.isExtractUniqueInlineImagesOnly(); + } + @Field - void setExtractActions(boolean extractActions) { + public void setExtractActions(boolean extractActions) { defaultConfig.setExtractActions(extractActions); } + public boolean isExtractActions() { + return defaultConfig.isExtractActions(); + } + @Field - void setExtractFontNames(boolean extractFontNames) { + public void setExtractFontNames(boolean extractFontNames) { defaultConfig.setExtractFontNames(extractFontNames); } + public boolean isExtractFontNames() { + return defaultConfig.isExtractFontNames(); + } + @Field - void setSetKCMS(boolean setKCMS) { + public void setSetKCMS(boolean setKCMS) { defaultConfig.setSetKCMS(setKCMS); } + public boolean isSetKCMS() { + return defaultConfig.isSetKCMS(); + } @Field - void setDetectAngles(boolean detectAngles) { + public void setDetectAngles(boolean detectAngles) { defaultConfig.setDetectAngles(detectAngles); } + public boolean isDetectAngles() { + return defaultConfig.isDetectAngles(); + } @Field - void setExtractMarkedContent(boolean extractMarkedContent) { + public void setExtractMarkedContent(boolean extractMarkedContent) { defaultConfig.setExtractMarkedContent(extractMarkedContent); } + public boolean isExtractMarkedContent() { + return defaultConfig.isExtractMarkedContent(); + } + @Field public void setDropThreshold(float dropThreshold) { defaultConfig.setDropThreshold(dropThreshold); } + public float getDropThreshold() { + return defaultConfig.getDropThreshold(); + } + @Field public void setMaxMainMemoryBytes(long maxMainMemoryBytes) { defaultConfig.setMaxMainMemoryBytes(maxMainMemoryBytes); } + public long getMaxMainMemoryBytes() { + return defaultConfig.getMaxMainMemoryBytes(); + } + /** * This is a no-op. There is no need to initialize multiple fields. * The regular field loading should happen without this. @@ -780,21 +872,34 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia config.setRenderer(pdfBoxRenderer); } + //TODO -- figure out how to deserialize this in TikaConfigSerializer @Override public void setRenderer(Renderer renderer) { defaultConfig.setRenderer(renderer); } + public Renderer getRenderer() { + return defaultConfig.getRenderer(); + } + @Field public void setImageGraphicsEngineFactory(ImageGraphicsEngineFactory imageGraphicsEngineFactory) { defaultConfig.setImageGraphicsEngineFactory(imageGraphicsEngineFactory); } + public ImageGraphicsEngineFactory getImageGraphicsEngineFactory() { + return defaultConfig.getImageGraphicsEngineFactory(); + } + @Field public void setImageStrategy(String imageStrategy) { defaultConfig.setImageStrategy(imageStrategy); } + public String getImageStrategy() { + return defaultConfig.getImageStrategy().name(); + } + /** * Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan * widgets, which we don't need. diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index acd57e47f..54776648b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -946,6 +946,20 @@ public class PDFParserConfig implements Serializable { public int getTotalCharsPerPage() { return totalCharsPerPage; } + + @Override + public String toString() { + //TODO -- figure out if this is actual BEST or whatever + //and return that instead of the literal values + String unmappedString = null; + if (unmappedUnicodeCharsPerPage < 1.0) { + unmappedString = String.format(Locale.US, "%.03f", + unmappedUnicodeCharsPerPage * 100) + "%"; + } else { + unmappedString = String.format(Locale.US, "%.0f", unmappedUnicodeCharsPerPage); + } + return unmappedString + "," + totalCharsPerPage; + } } public enum OCR_RENDERING_STRATEGY { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index b95588a74..d01959bfc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -252,4 +252,8 @@ public class CompressorParser extends AbstractParser { this.memoryLimitInKb = memoryLimitInKb; } + public int getMemoryLimitInKb() { + return this.memoryLimitInKb; + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 25a91f74d..7fedfd3ba 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -527,4 +527,8 @@ public class PackageParser extends AbstractEncodingDetectorParser { public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) { this.detectCharsetsInEntryNames = detectCharsetsInEntryNames; } + + public boolean isDetectCharsetsInEntryNames() { + return detectCharsetsInEntryNames; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java index f89b27c12..15189988e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java @@ -19,6 +19,8 @@ package org.apache.tika.parser.txt; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -121,8 +123,19 @@ public class Icu4jEncodingDetector implements EncodingDetector { this.markLimit = markLimit; } + public int getMarkLimt() { + return this.markLimit; + } + @Field public void setIgnoreCharsets(List<String> charsetsToIgnore) { this.ignoreCharsets.addAll(charsetsToIgnore); } + + public List<String> getIgnoreCharsets() { + List<String> ret = new ArrayList<>(); + ret.addAll(ignoreCharsets); + Collections.sort(ret); + return ret; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 3782c69fd..d2f28e0a0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -151,6 +151,10 @@ public class DefaultZipContainerDetector implements Detector { this.markLimit = markLimit; } + public int getMarkLimit() { + return markLimit; + } + @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml index 0f6767668..e034afad8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml @@ -325,6 +325,20 @@ <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>${log4j2.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <version>${log4j2.version}</version> + <scope>test</scope> + </dependency> + + </dependencies> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java new file mode 100644 index 000000000..0c354e474 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import static org.apache.tika.TikaTest.assertContains; +import static org.apache.tika.TikaTest.assertNotContained; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.StringWriter; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.junit.jupiter.api.Test; + +public class TikaConfigSerializerTest { + + @Test + public void testBasicParams() throws Exception { + TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + StringWriter writer = new StringWriter(); + + TikaConfigSerializer.serialize(tikaConfig, TikaConfigSerializer.Mode.STATIC_FULL, + writer, StandardCharsets.UTF_8); + String xml = writer.toString().replaceAll("\\s+", " "); + String encodingNeedle = "<encodingDetector class=\"org.apache.tika.parser.txt" + + ".Icu4jEncodingDetector\">" + + " <params> <param name=\"ignoreCharsets\" type=\"list\"/>"; + assertContains(encodingNeedle, xml); + + String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" + + " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>"; + assertContains(detectorNeedle, xml); + + String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" + + " <params> <param name=\"allowExtractionForAccessibility\" " + + "type=\"bool\">true</param>"; + + assertContains(parserNeedle, xml); + //TODO This is still to be implemented -- we do not want to show the default renderer here + assertNotContained("<renderer class=\"org.apache.tika.renderer.CompositeRenderer\"/>", xml); + + //For now, make sure that deserialization basically works; + //add many more unit tests! + try (InputStream is = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))) { + TikaConfig deserialized = new TikaConfig(is); + } + } + + @Test + public void testTesseractList() throws Exception { + TikaConfig tikaConfig = new TikaConfig(getPath("tika-config-tesseract-arbitrary.xml")); + StringWriter writer = new StringWriter(); + + TikaConfigSerializer.serialize(tikaConfig, TikaConfigSerializer.Mode.STATIC, + writer, StandardCharsets.UTF_8); + String xml = writer.toString().replaceAll("\\s+", " "); + String needle = "<param name=\"otherTesseractSettings\" type=\"list\"> " + + "<string>textord_initialx_ile 0.75</string> <string>textord_noise_hfract 0.15625</string> </param>"; + assertContains(needle, xml); + //For now, make sure that deserialization basically works; + //add many more unit tests! + try (InputStream is = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))) { + TikaConfig deserialized = new TikaConfig(is); + } + } + + private Path getPath(String config) { + try { + return Paths.get(TikaConfigSerializerTest.class.getResource("/configs/" + config) + .toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-tesseract-arbitrary.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-tesseract-arbitrary.xml new file mode 100644 index 000000000..93544797b --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-tesseract-arbitrary.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"> + <params> + <!-- space delimited key-value pairs --> + <param name="otherTesseractSettings" type="list"> + <string>textord_initialx_ile 0.75</string> + <string>textord_noise_hfract 0.15625</string> + </param> + </params> + </parser> + </parsers> +</properties>
