Author: nick Date: Thu Mar 24 15:35:24 2011 New Revision: 1085003 URL: http://svn.apache.org/viewvc?rev=1085003&view=rev Log: TIKA-620 - Have CompositeParser always use the canonical mimetype internally, via suitable calls to registry.normalize, rather than trying to handle the aliases individually
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1085003&r1=1085002&r2=1085003&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Thu Mar 24 15:35:24 2011 @@ -25,7 +25,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedSet; +import java.util.logging.Level; +import java.util.logging.Logger; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TaggedInputStream; @@ -79,7 +80,16 @@ public class CompositeParser implements Map<MediaType, Parser> map = new HashMap<MediaType, Parser>(); for (Parser parser : parsers) { for (MediaType type : parser.getSupportedTypes(context)) { - map.put(type, parser); + MediaType canonicalType = registry.normalize(type); + if (map.containsKey(canonicalType)) { + if (map.get(canonicalType) != parser) { + Logger.getLogger(getClass().getName()).log( + Level.INFO, "Duplicate parser definition for " + type + + " (" + canonicalType + "), using " + parser + ); + } + } + map.put(canonicalType, parser); } } return map; @@ -165,6 +175,11 @@ public class CompositeParser implements protected Parser getParser(Metadata metadata, ParseContext context) { Map<MediaType, Parser> map = getParsers(context); MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); + if (type != null) { + // We always work on the normalised, canonical 
form + type = registry.normalize(type); + } + while (type != null) { // Try finding a parser for the type Parser parser = map.get(type); @@ -172,15 +187,6 @@ public class CompositeParser implements return parser; } - // Next up, look for one for its aliases - SortedSet<MediaType> aliases = registry.getAliases(type); - for (MediaType alias : aliases) { - parser = map.get(alias); - if (parser != null) { - return parser; - } - } - // Failing that, try for the parent of the type type = registry.getSupertype(type); } Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java?rev=1085003&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java Thu Mar 24 15:35:24 2011 @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser; + +import junit.framework.TestCase; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +public class CompositeParserTest extends TestCase { + public void testDefaultParser() throws Exception { + TikaConfig config = TikaConfig.getDefaultConfig(); + + CompositeParser parser = (CompositeParser)config.getParser(); + + // Check it has the full registry + assertEquals(config.getMediaTypeRegistry(), parser.getMediaTypeRegistry()); + } + + public void testMimeTypeAliases() throws Exception { + MediaType bmpCanonical = MediaType.image("x-ms-bmp"); + Map<String,String> bmpCanonicalMetadata = new HashMap<String, String>(); + bmpCanonicalMetadata.put("BMP", "True"); + bmpCanonicalMetadata.put("Canonical", "True"); + Parser bmpCanonicalParser = new DummyParser( + new HashSet<MediaType>(Arrays.asList(bmpCanonical)), + bmpCanonicalMetadata, null + ); + + MediaType bmpAlias = MediaType.image("bmp"); + Map<String,String> bmpAliasMetadata = new HashMap<String, String>(); + bmpAliasMetadata.put("BMP", "True"); + bmpAliasMetadata.put("Alias", "True"); + Parser bmpAliasParser = new DummyParser( + new HashSet<MediaType>(Arrays.asList(bmpAlias)), + bmpAliasMetadata, null + ); + + TikaConfig config = TikaConfig.getDefaultConfig(); + CompositeParser canonical = new CompositeParser( + config.getMediaTypeRegistry(), bmpCanonicalParser + ); + CompositeParser alias = new CompositeParser( + config.getMediaTypeRegistry(), bmpAliasParser + ); + 
CompositeParser both = new CompositeParser( + config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser + ); + + ContentHandler handler = new BodyContentHandler(); + Metadata metadata; + + // Canonical and Canonical + metadata = new Metadata(); + metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); + canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("True", metadata.get("BMP")); + assertEquals("True", metadata.get("Canonical")); + + + // Alias and Alias + metadata = new Metadata(); + metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); + alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("True", metadata.get("BMP")); + assertEquals("True", metadata.get("Alias")); + + + // Alias type and Canonical parser + metadata = new Metadata(); + metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); + canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("True", metadata.get("BMP")); + assertEquals("True", metadata.get("Canonical")); + + + // Canonical type and Alias parser + metadata = new Metadata(); + metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); + alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("True", metadata.get("BMP")); + assertEquals("True", metadata.get("Alias")); + + + // And when both are there, will go for the last one + // to be registered (which is the alias one) + metadata = new Metadata(); + metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); + both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); + assertEquals("True", metadata.get("BMP")); + assertEquals("True", metadata.get("Alias")); + } +} Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1085003&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java Thu Mar 24 15:35:24 2011 @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A Dummy Parser for use with unit tests. 
+ */ +public class DummyParser implements Parser { + private Set<MediaType> types; + private Map<String,String> metadata; + private String xmlText; + + public DummyParser(Set<MediaType> types, Map<String, String> metadata, + String xmlText) { + this.types = types; + this.metadata = metadata; + this.xmlText = xmlText; + } + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return types; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + for (Entry<String,String> m : this.metadata.entrySet()) { + metadata.add(m.getKey(), m.getValue()); + } + + handler.startDocument(); + if (xmlText != null) { + handler.characters(xmlText.toCharArray(), 0, xmlText.length()); + } + handler.endDocument(); + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata) throws IOException, SAXException, TikaException { + parse(stream, handler, metadata, new ParseContext()); + } +}