Author: nick
Date: Mon Feb 4 16:36:42 2013
New Revision: 1442168

URL: http://svn.apache.org/viewvc?rev=1442168&view=rev
Log:
Support tika:link and tika:uti mimetype extensions, along with unit tests. Modified version of the patch from TIKA-1012

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=1442168&r1=1442167&r2=1442168&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Mon Feb 4 16:36:42 2013 @@ -17,7 +17,9 @@ package org.apache.tika.mime; import java.io.Serializable; +import java.net.URI; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -75,6 +77,21 @@ public final class MimeType implements C private final MediaType type; /** + * The MimeType acronym + */ + private String acronym = ""; + + /** + * The http://en.wikipedia.org/wiki/Uniform_Type_Identifier + */ + private String uti = ""; + + /** + * Documentation Links + */ + private List<URI> links = Collections.emptyList(); + + /** * Description of this media type. */ private String description = ""; @@ -148,6 +165,75 @@ public final class MimeType implements C } this.description = description; } + + + /** + * Returns an acronym for this mime type. 
+ * + * @return mime type acronym + */ + public String getAcronym() { + return acronym; + } + + /** + * Set an acronym for the mime type + * + * @param acronym + */ + void setAcronym(String v) { + if (v == null) { + throw new IllegalArgumentException("Acronym is missing"); + } + acronym = v; + } + + /** + * Get the UTI for this mime type. + * + * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier + * + * @return The Uniform Type Identifier + */ + public String getUniformTypeIdentifier() { + return uti; + } + + /** + * Set The Uniform Type Identifier + * + * @param uti + */ + void setUniformTypeIdentifier(String v) { + if (v == null) { + throw new IllegalArgumentException("Uniform Type Identifier is missing"); + } + uti = v; + } + + /** + * Get a list of links to help document this mime type + * + * @return an array of links (will never be null) + */ + public List<URI> getLinks() { + return links; // this is already unmodifiable + } + + /** + * Add a link to this mime type + * @param link + */ + void addLink(URI link) { + if(link==null) { + throw new IllegalArgumentException("Missing Link"); + } + List<URI> copy = new ArrayList<URI>(links.size()+1); + copy.addAll(links); + copy.add(link); + links = Collections.unmodifiableList(copy); + } + /** * Add some rootXML info to this mime-type Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1442168&r1=1442167&r2=1442168&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Mon Feb 4 16:36:42 2013 @@ -19,9 +19,10 @@ package org.apache.tika.mime; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URI; 
+import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collections; -import java.util.LinkedList; import java.util.List; import javax.xml.parsers.ParserConfigurationException; @@ -89,6 +90,11 @@ import org.xml.sax.helpers.DefaultHandle * type CDATA #REQUIRED> * ]> * </pre> + * + * In addition to the standard fields, this will also read two Tika specific fields: + * - link + * - uti + * * * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec */ @@ -154,7 +160,10 @@ class MimeTypesReader extends DefaultHan } else if (SUB_CLASS_OF_TAG.equals(qName)) { String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR); types.setSuperType(type, MediaType.parse(parent)); - } else if (COMMENT_TAG.equals(qName)) { + } else if (ACRONYM_TAG.equals(qName)|| + COMMENT_TAG.equals(qName)|| + TIKA_LINK_TAG.equals(qName)|| + TIKA_UTI_TAG.equals(qName)) { characters = new StringBuilder(); } else if (GLOB_TAG.equals(qName)) { String pattern = attributes.getValue(PATTERN_ATTR); @@ -199,6 +208,20 @@ class MimeTypesReader extends DefaultHan } else if (COMMENT_TAG.equals(qName)) { type.setDescription(characters.toString().trim()); characters = null; + } else if (ACRONYM_TAG.equals(qName)) { + type.setAcronym(characters.toString().trim()); + characters = null; + } else if (TIKA_UTI_TAG.equals(qName)) { + type.setUniformTypeIdentifier(characters.toString().trim()); + characters = null; + } else if (TIKA_LINK_TAG.equals(qName)) { + try { + type.addLink(new URI(characters.toString().trim())); + } + catch (URISyntaxException e) { + throw new IllegalArgumentException("unable to parse link: "+characters, e); + } + characters = null; } else if (MATCH_TAG.equals(qName)) { current.stop(); } else if (MAGIC_TAG.equals(qName)) { Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java?rev=1442168&r1=1442167&r2=1442168&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java Mon Feb 4 16:36:42 2013 @@ -27,6 +27,8 @@ public interface MimeTypesReaderMetKeys String MIME_TYPE_TYPE_ATTR = "type"; + String ACRONYM_TAG = "acronym"; + String COMMENT_TAG = "_comment"; String GLOB_TAG = "glob"; @@ -63,4 +65,7 @@ public interface MimeTypesReaderMetKeys String LOCAL_NAME_ATTR = "localName"; + String TIKA_LINK_TAG = "tika:link"; + + String TIKA_UTI_TAG = "tika:uti"; } Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1442168&r1=1442167&r2=1442168&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Feb 4 16:36:42 2013 @@ -3752,6 +3752,8 @@ <alias type="image/bmp"/> <acronym>BMP</acronym> <_comment>Windows bitmap</_comment> + <tika:link>http://en.wikipedia.org/wiki/BMP_file_format</tika:link> + <tika:uti>com.microsoft.bmp</tika:uti> <magic priority="50"> <match value="BM" type="string" offset="0"> <match value="0x0100" type="string" offset="26"> Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java?rev=1442168&r1=1442167&r2=1442168&view=diff 
============================================================================== --- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java Mon Feb 4 16:36:42 2013 @@ -119,6 +119,17 @@ public class MimeTypesReaderTest extends } /** + * @since TIKA-1012 + */ + public void testReadExtendedMetadata() throws Exception { + MimeType bmp = this.mimeTypes.forName("image/x-ms-bmp"); + assertEquals("BMP", bmp.getAcronym()); + assertEquals("com.microsoft.bmp", bmp.getUniformTypeIdentifier()); + assertEquals("http://en.wikipedia.org/wiki/BMP_file_format", + bmp.getLinks().get(0).toString()); + } + + /** * TIKA-746 Ensures that the custom mimetype maps were also * loaded and used */ Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java?rev=1442168&r1=1442167&r2=1442168&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java Mon Feb 4 16:36:42 2013 @@ -63,13 +63,34 @@ public class MimeTypeTest extends TestCa } /** Test MimeType setDescription() */ - public void testSetDescription() { + public void testSetEmptyValues() { try { text.setDescription(null); fail("Expected IllegalArgumentException"); } catch (IllegalArgumentException e) { // expected result } + + try { + text.setAcronym(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + + try { + text.addLink(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + + try { + text.setUniformTypeIdentifier(null); + fail("Expected 
IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } } }