Author: nick
Date: Thu Mar 24 15:35:24 2011
New Revision: 1085003

URL: http://svn.apache.org/viewvc?rev=1085003&view=rev
Log:
TIKA-620 - Have CompositeParser always use the canonical mimetype internally, 
via suitable calls to registry.normalize, rather than trying to handle the 
aliases individually

Added:
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java
Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1085003&r1=1085002&r2=1085003&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java 
Thu Mar 24 15:35:24 2011
@@ -25,7 +25,8 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.SortedSet;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TaggedInputStream;
@@ -79,7 +80,16 @@ public class CompositeParser implements 
         Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
         for (Parser parser : parsers) {
             for (MediaType type : parser.getSupportedTypes(context)) {
-                map.put(type, parser);
+                MediaType canonicalType = registry.normalize(type);
+                if (map.containsKey(canonicalType)) {
+                   if (map.get(canonicalType) != parser) {
+                      Logger.getLogger(getClass().getName()).log(
+                            Level.INFO, "Duplicate parser definition for " + 
type + 
+                            " (" + canonicalType + "), using " + parser
+                      );
+                   }
+                }
+                map.put(canonicalType, parser);
             }
         }
         return map;
@@ -165,6 +175,11 @@ public class CompositeParser implements 
     protected Parser getParser(Metadata metadata, ParseContext context) {
         Map<MediaType, Parser> map = getParsers(context);
         MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+        if (type != null) {
+           // We always work on the normalised, canonical form
+           type = registry.normalize(type);
+        }
+        
         while (type != null) {
             // Try finding a parser for the type
             Parser parser = map.get(type);
@@ -172,15 +187,6 @@ public class CompositeParser implements 
                 return parser;
             }
             
-            // Next up, look for one for its aliases
-            SortedSet<MediaType> aliases = registry.getAliases(type);
-            for (MediaType alias : aliases) {
-               parser = map.get(alias);
-               if (parser != null) {
-                   return parser;
-               }
-            }
-         
             // Failing that, try for the parent of the type
             type = registry.getSupertype(type);
         }

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java?rev=1085003&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/CompositeParserTest.java
 Thu Mar 24 15:35:24 2011
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+public class CompositeParserTest extends TestCase {
+   public void testDefaultParser() throws Exception {
+      TikaConfig config = TikaConfig.getDefaultConfig();
+      
+      CompositeParser parser = (CompositeParser)config.getParser(); // cast fails the test if the default parser is not a CompositeParser
+      
+      // Check the parser reports the same media type registry as the config
+      assertEquals(config.getMediaTypeRegistry(), 
parser.getMediaTypeRegistry());
+   }
+   
+   public void testMimeTypeAliases() throws Exception {
+      MediaType bmpCanonical = MediaType.image("x-ms-bmp"); // the form this test treats as canonical
+      Map<String,String> bmpCanonicalMetadata = new HashMap<String, String>();
+      bmpCanonicalMetadata.put("BMP", "True");
+      bmpCanonicalMetadata.put("Canonical", "True"); // marker asserted below to identify which parser ran
+      Parser bmpCanonicalParser = new DummyParser(
+            new HashSet<MediaType>(Arrays.asList(bmpCanonical)),
+            bmpCanonicalMetadata, null
+      );
+      
+      MediaType bmpAlias = MediaType.image("bmp"); // the form this test treats as an alias of image/x-ms-bmp
+      Map<String,String> bmpAliasMetadata = new HashMap<String, String>();
+      bmpAliasMetadata.put("BMP", "True");
+      bmpAliasMetadata.put("Alias", "True"); // marker asserted below to identify which parser ran
+      Parser bmpAliasParser = new DummyParser(
+            new HashSet<MediaType>(Arrays.asList(bmpAlias)),
+            bmpAliasMetadata, null
+      );
+      
+      TikaConfig config = TikaConfig.getDefaultConfig();
+      CompositeParser canonical = new CompositeParser(
+            config.getMediaTypeRegistry(), bmpCanonicalParser
+      );
+      CompositeParser alias = new CompositeParser(
+            config.getMediaTypeRegistry(), bmpAliasParser
+      );
+      CompositeParser both = new CompositeParser(
+            config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser
+      );
+      
+      ContentHandler handler = new BodyContentHandler(); // one handler reused for every parse call below
+      Metadata metadata; // re-created before each scenario so markers cannot leak between them
+      
+      // Canonical type and Canonical parser
+      metadata = new Metadata();
+      metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
+      canonical.parse(new ByteArrayInputStream(new byte[0]), handler, 
metadata, new ParseContext());
+      assertEquals("True", metadata.get("BMP"));
+      assertEquals("True", metadata.get("Canonical"));
+      
+      
+      // Alias type and Alias parser
+      metadata = new Metadata();
+      metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
+      alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, 
new ParseContext());
+      assertEquals("True", metadata.get("BMP"));
+      assertEquals("True", metadata.get("Alias"));
+      
+      
+      // Alias type and Canonical parser
+      metadata = new Metadata();
+      metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
+      canonical.parse(new ByteArrayInputStream(new byte[0]), handler, 
metadata, new ParseContext());
+      assertEquals("True", metadata.get("BMP"));
+      assertEquals("True", metadata.get("Canonical"));
+      
+      
+      // Canonical type and Alias parser
+      metadata = new Metadata();
+      metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
+      alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, 
new ParseContext());
+      assertEquals("True", metadata.get("BMP"));
+      assertEquals("True", metadata.get("Alias"));
+      
+      
+      // When both parsers are registered, the last one registered wins
+      //  (here the alias parser, which is passed second to the constructor)
+      metadata = new Metadata();
+      metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
+      both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new 
ParseContext());
+      assertEquals("True", metadata.get("BMP"));
+      assertEquals("True", metadata.get("Alias"));
+   }
+}

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1085003&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java 
(added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DummyParser.java 
Thu Mar 24 15:35:24 2011
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A Dummy Parser for use with unit tests: adds fixed metadata and optional text, and never reads the stream.
+ */
+public class DummyParser implements Parser {
+   private Set<MediaType> types; // returned unchanged by getSupportedTypes
+   private Map<String,String> metadata; // entries added to the caller's Metadata on every parse
+   private String xmlText; // optional text sent to the handler; null means no text
+
+   public DummyParser(Set<MediaType> types, Map<String, String> metadata,
+         String xmlText) {
+      this.types = types; // arguments are stored by reference, no defensive copies
+      this.metadata = metadata;
+      this.xmlText = xmlText;
+   }
+
+   public Set<MediaType> getSupportedTypes(ParseContext context) { // context is ignored
+      return types;
+   }
+
+   public void parse(InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext context) throws IOException,
+         SAXException, TikaException {
+      for (Entry<String,String> m : this.metadata.entrySet()) { // this.metadata is the fixed map; the parameter is the output
+         metadata.add(m.getKey(), m.getValue());
+      }
+      
+      handler.startDocument(); // stream and context are not used
+      if (xmlText != null) {
+         handler.characters(xmlText.toCharArray(), 0, xmlText.length());
+      }
+      handler.endDocument();
+   }
+
+   public void parse(InputStream stream, ContentHandler handler,
+         Metadata metadata) throws IOException, SAXException, TikaException {
+      parse(stream, handler, metadata, new ParseContext()); // delegates with a fresh, empty ParseContext
+   }
+}


Reply via email to