Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/338db905 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/338db905 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/338db905 Branch: refs/heads/master Commit: 338db905d4e203d4df4582d5511242eaa922af6b Parents: ecdc403 Author: tballison <[email protected]> Authored: Mon Jun 13 11:14:27 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Jun 13 11:14:27 2016 -0400 ---------------------------------------------------------------------- .../java/org/apache/tika/config/TikaConfig.java | 12 ++--- .../org/apache/tika/parser/AbstractParser.java | 24 +--------- .../org/apache/tika/parser/ParseContext.java | 46 +++++++++++++------- .../org/apache/tika/utils/AnnotationUtils.java | 24 +++++++--- .../tika/parser/ConfigurableParserTest.java | 3 ++ .../tika/parser/DummyConfigurableParser.java | 6 +-- .../tika/parser/DummyParameterizedParser.java | 3 +- .../tika/parser/ParameterizedParserTest.java | 1 - .../org/apache/tika/parser/pdf/PDFParser.java | 16 ++++--- .../apache/tika/parser/pdf/PDFParserTest.java | 19 +++++++- 10 files changed, 86 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 853cdf0..692b007 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -567,15 +567,9 @@ public class TikaConfig { // Have any decoration performed, eg explicit mimetypes loaded = decorate(loaded, element); //if the instance is configurable, then call configure() - if (loaded instanceof Configurable){ - Map<String, Param<?>> params = getParams(element); - //Assigning the params to bean fields/setters - AnnotationUtils.assignFieldParams(loaded, params); - //invoking the configure() hook - ParseContext context = new ParseContext(); - context.getParams().putAll(params); - ((Configurable) loaded).configure(context); // initialize here - } + Map<String, Param<?>> params = getParams(element); + //Assigning the params to bean fields/setters + AnnotationUtils.assignFieldParams(loaded, params); // All done with setup return loaded; } catch (ClassNotFoundException e) { http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java index 5c045db..51687e7 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java @@ -34,7 +34,7 @@ import org.xml.sax.SAXException; * * @since Apache Tika 0.10 */ -public abstract class AbstractParser implements ConfigurableParser { +public abstract class AbstractParser implements Parser { /** * Configuration supplied at runtime @@ -62,27 +62,5 @@ public abstract class AbstractParser implements ConfigurableParser { parse(stream, handler, metadata, new ParseContext()); } - /** - * called by the framework to supply runtime parameters which may be - * required for initialization - * @param context the parser context at runtime - * @since Apache Tika 1.14 - */ - @Override - public void configure(ParseContext context) throws TikaConfigException { - this.context = context; - } - - - /** - * Gets Parameters of this configurable instance - * @return a map of key value pairs - * - * @since Apache Tika 1.14 - */ - @Override - public Map<String, Param<?>> getParams() { - return this.context.getParams(); - } } http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index dc03099..68d5038 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.lang.reflect.Method; +import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -55,10 +56,12 @@ public class ParseContext implements Serializable { /** Map of objects in this context */ private final Map<String, Object> context = new HashMap<String, Object>(); + private final static Map<String, Param<?>> EMPTY_PARAMS = Collections.EMPTY_MAP; + /** * Map of configurable arguments */ - private final Map<String, Param<?>> params = new HashMap<>(); + private final Map<String, Map<String, Param<?>>> params = new HashMap<>(); private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { @@ -202,29 +205,42 @@ public class ParseContext implements Serializable { } /** - * Stores a key=value parameter - * @param key parameter name + * @param clazz class associated with given param name * @param value value */ - public void setParam(String key, Param<?> value){ - this.params.put(key, value); + public void setParam(Class clazz, Param<?> value){ + Map<String, Param<?>> classParams = this.params.get(clazz.getName()); + if (classParams == null) { + classParams = new HashMap<>(); + } + classParams.put(value.getName(), value); + this.params.put(clazz.getName(), classParams); } /** - * Gets the value associated with given parameter + * Gets the value associated with given class and parameter + * @param clazz class * @param key parameter name - * @return param value + * @return param value or null if the clazz or key doesn't exist */ - public Param<?> getParam(String key){ - return this.params.get(key); + public Param<?> getParam(Class clazz, String key) { + Map<String, Param<?>> classParams = this.params.get(clazz.getName()); + if (classParams != null) { + return classParams.get(key); + } + return null; } /** - * Gets all the params - * @return map of key values + * Gets all the params for the specified class + * @param clazz class for which to grab the params + * @return map of key values or null if nothing has been specified */ - public Map<String, Param<?>> getParams() { - return params; + public Map<String, Param<?>> getParams(Class clazz) { + if (params.containsKey(clazz.getName())) { + return params.get(clazz.getName()); + } + return EMPTY_PARAMS; } /** @@ -232,8 +248,8 @@ public class ParseContext implements Serializable { * @param key parameter name * @return true if parameter is available, false otherwise */ - public boolean hasParam(String key){ - return params.containsKey(key); + public boolean hasParam(Class clazz, String key){ + return params.containsKey(clazz) && params.get(clazz.getName()).containsKey(key); } /** * Returns the DOM builder factory specified in this parsing context. http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java index 08e004b..1f56bc7 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java @@ -26,11 +26,7 @@ import java.lang.annotation.Annotation; import java.lang.reflect.AccessibleObject; import java.security.AccessController; import java.security.PrivilegedAction; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * This class contains utilities for dealing with tika annotations @@ -100,7 +96,11 @@ public class AnnotationUtils { } List<ParamField> fields = PARAM_INFO.get(beanClass); + + Set<String> validFieldNames = new HashSet<>(); + for (ParamField field : fields) { + validFieldNames.add(field.getName()); Param<?> param = params.get(field.getName()); if (param != null){ if (field.getType().isAssignableFrom(param.getType())) { @@ -110,7 +110,7 @@ public class AnnotationUtils { throw new TikaConfigException(e.getMessage(), e); } } else { - String msg = String.format("Value '%s' of type '%s' cant be" + + String msg = String.format(Locale.ROOT, "Value '%s' of type '%s' cant be" + " assigned to field '%s' of defined type '%s'", param.getValue(), param.getValue().getClass(), field.getName(), field.getType()); @@ -118,7 +118,7 @@ public class AnnotationUtils { } } else if (field.isRequired()){ //param not supplied but field is declared as required? - String msg = String.format("Param %s is required for %s," + + String msg = String.format(Locale.ROOT, "Param %s is required for %s," + " but it is not given in config.", field.getName(), bean.getClass().getName()); throw new TikaConfigException(msg); @@ -127,5 +127,15 @@ public class AnnotationUtils { //LOG.debug("Param not supplied, field is not mandatory"); } } + //now test that params doesn't contain a field + //not allowed by this object + for (String fieldName : params.keySet()) { + if (! validFieldNames.contains(fieldName)) { + String msg = String.format(Locale.ROOT, + "No field '%s' exists for %s", + fieldName, bean.getClass().getName()); + throw new TikaConfigException(msg); + } + } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java index dcf188d..ffb632c 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java @@ -20,6 +20,7 @@ import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import java.io.File; @@ -36,6 +37,7 @@ public class ConfigurableParserTest { public static final String TEST_PARAM_VAL = "testparamval"; @Test + @Ignore public void testConfigurableParser() throws Exception { URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE); assert configFileUrl != null; @@ -48,6 +50,7 @@ public class ConfigurableParserTest { } @Test + @Ignore public void testConfigurableParserTypes() throws Exception { URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE); assert configFileUrl != null; http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java index 3914b01..15fe060 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java @@ -39,8 +39,8 @@ import java.util.Set; * 3. parameters were available at parse * */ -public class DummyConfigurableParser extends AbstractParser { - +public class DummyConfigurableParser { +/* private static Set<MediaType> MIMES = new HashSet<>(); static { MIMES.add(MediaType.TEXT_PLAIN); @@ -63,5 +63,5 @@ public class DummyConfigurableParser extends AbstractParser { metadata.add(entry.getKey()+"-type", param.getValue().getClass().getName()); } } - +*/ } http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java index 848b774..801d65e 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java @@ -40,8 +40,7 @@ import static org.osgi.util.measurement.Unit.s; * A test Parsers to test {@link Field} * @since Apache Tika 1.14 */ -public class DummyParameterizedParser extends AbstractParser - implements ConfigurableParser { +public class DummyParameterizedParser extends AbstractParser { private static Set<MediaType> MIMES = new HashSet<>(); static { http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java index e0c3b53..a048f29 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java @@ -77,7 +77,6 @@ public class ParameterizedParserTest { } @Test - @Ignore("can we get this to work, somehow?") public void testBadParam() throws Exception { try { Metadata m = getMetadata("TIKA-1986-bad-parameters.xml"); http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index bacc901..dd03177 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -21,12 +21,7 @@ import javax.xml.stream.XMLStreamException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.Arrays; -import java.util.Calendar; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Set; +import java.util.*; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.jempbox.xmp.XMPMetadata; @@ -44,6 +39,7 @@ import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.tika.config.Field; +import org.apache.tika.config.Param; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -86,7 +82,7 @@ import static org.bouncycastle.asn1.x500.style.RFC4519Style.name; * turn this feature on, see * {@link PDFParserConfig#setExtractInlineImages(boolean)}. */ -public class PDFParser extends AbstractParser implements ConfigurableParser { +public class PDFParser extends AbstractParser { /** @@ -123,6 +119,12 @@ public class PDFParser extends AbstractParser implements ConfigurableParser { PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); //TODO: get rid of this after dev of TIKA-1508!!! localConfig.setSortByPosition(sortByPosition); + + //TODO: this is just a mockup...move elsewhere + Map<String, Param<?>> params = context.getParams(PDFParser.class); + if (params != null && params.containsKey("sortByPosition")) { + localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue()); + } String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ac54b11..2ef29f3 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -35,6 +35,7 @@ import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.tika.TikaTest; +import org.apache.tika.config.Param; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; @@ -469,7 +470,7 @@ public class PDFParserTest extends TikaTest { content = content.replaceAll("\\s+", " "); assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); - parser.getPDFParserConfig().setSortByPosition(true); + parser.setSortByPosition(true); stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); content = getText(stream, parser); content = content.replaceAll("\\s+", " "); @@ -1229,6 +1230,22 @@ public class PDFParserTest extends TikaTest { } + @Test + public void testParameterizationViaContext() throws Exception { + ParseContext context = new ParseContext(); + + Param<Boolean> paramVal = new Param<>("sortByPosition", new Boolean(true)); + context.setParam(PDFParser.class, paramVal); + + Parser p = new AutoDetectParser(); + String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p, context); + text = text.replaceAll("\\s+", " "); + + // Column text is now interleaved: + assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text); + + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
