Author: kbennett
Date: Sun Oct 28 10:42:42 2007
New Revision: 589385
URL: http://svn.apache.org/viewvc?rev=589385&view=rev
Log:
TIKA-88: Moved all nonredundant functionality from MimeUtils to MimeTypes.
Moved test code from MimeUtilsTest to MimeTypesTest accordingly.
Deleted MimeUtils class and its test class.
Modified URL for MIME type config file in default tika-config.xml to have
a leading "/".
Created MimeTypesFactory class as a public factory and adapter to package
protected MimeTypesReader.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java?rev=589385&r1=589384&r2=589385&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
Sun Oct 28 10:42:42 2007
@@ -17,6 +17,8 @@
package org.apache.tika.config;
//JDK imports
+
+
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -24,13 +26,9 @@
import java.util.HashMap;
import java.util.Map;
-//TIKA imports
import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.mime.MimeUtils;
+import org.apache.tika.mime.MimeTypesFactory;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.Utils;
-
-//JDOM imports
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
@@ -47,7 +45,7 @@
private final Map<String, Parser> parsers = new HashMap<String, Parser>();
- private static MimeUtils mimeTypeRepo;
+ private static MimeTypes mimeTypes;
public TikaConfig(String file) throws JDOMException, IOException {
this(new File(file));
@@ -65,14 +63,14 @@
this(new SAXBuilder().build(stream));
}
- public TikaConfig(Document document) throws JDOMException {
+ public TikaConfig(Document document) throws JDOMException, IOException {
this(document.getRootElement());
}
- public TikaConfig(Element element) throws JDOMException {
+ public TikaConfig(Element element) throws JDOMException, IOException {
Element mtr = element.getChild("mimeTypeRepository");
String mimeTypeRepoResource = mtr.getAttributeValue("resource");
- mimeTypeRepo = new MimeUtils(mimeTypeRepoResource);
+ mimeTypes = MimeTypesFactory.create(mimeTypeRepoResource);
for (Object node : XPath.selectNodes(element, "//parser")) {
String className = ((Element) node).getAttributeValue("class");
@@ -101,7 +99,7 @@
}
public MimeTypes getMimeRepository(){
- return mimeTypeRepo.getRepository();
+ return mimeTypes;
}
/**
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=589385&r1=589384&r2=589385&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Sun
Oct 28 10:42:42 2007
@@ -209,6 +209,52 @@
return shorter;
}
+ /**
+ * Resolves a MIME type name from a declared type name, a URL (or file
+ * name) and the leading bytes of the content.
+ *
+ * The declared name is passed through MimeType.clean and looked up with
+ * forName. If that yields no type, or the type does not match the url,
+ * a type is looked up from the url instead. Finally getMimeType(data) is
+ * always consulted; when it returns a type, that type's name replaces
+ * whatever was found before.
+ *
+ * @param typeName declared MIME type name; getType(URL) passes null
+ * @param url URL or name of the resource, used for pattern matching
+ * @param data leading bytes of the content, passed to getMimeType(byte[])
+ * @return the name of the last type found, or the (possibly cleaned)
+ * typeName when no lookup produced a type
+ */
+ public String getType(String typeName, String url, byte[] data) {
+ MimeType type = null;
+ try {
+ typeName = MimeType.clean(typeName);
+ type = typeName == null ? null : forName(typeName);
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name...
+ // Ignored on purpose: type stays null, so the url lookup below runs.
+ }
+
+ if (typeName == null || type == null || !type.matches(url)) {
+ // If no mime-type header, or cannot find a corresponding registered
+ // mime-type, or the one found doesn't match the url pattern
+ // it should be, then guess a mime-type from the url pattern
+ type = getMimeType(url);
+ typeName = type == null ? typeName : type.getName();
+ }
+ // NOTE(review): the guard below is commented out, so the magic-byte
+ // lookup runs unconditionally and overrides the result above whenever
+ // getMimeType(data) returns a type. Confirm this is intended.
+ // if (typeName == null || type == null ||
+ // (this.magic && type.hasMagic() && !type.matches(data))) {
+ // If no mime-type already found, or the one found doesn't match
+ // the magic bytes it should be, then, guess a mime-type from the
+ // document content (magic bytes)
+ type = getMimeType(data);
+ typeName = type == null ? typeName : type.getName();
+ // }
+ return typeName;
+ }
+
+ /**
+ * Determines the MIME type of the resource pointed to by the specified URL.
+ * Opens the URL, obtains its header bytes via readMagicHeader and delegates
+ * to getType(String, String, byte[]) with a null type name, so the result
+ * is derived from the header bytes and from the URL string
+ * (e.g. a "pdf" extension). The stream is closed before returning.
+ *
+ * @param url location of the resource to examine
+ * @return the MIME type name produced by getType(String, String, byte[])
+ * @throws IOException if opening or closing the stream fails; presumably
+ * also if reading the header fails (readMagicHeader is not shown here)
+ */
+ public String getType(URL url) throws IOException {
+ InputStream stream = url.openStream();
+ try {
+ return getType(null, url.toString(), readMagicHeader(stream));
+ } finally {
+ stream.close();
+ }
+ }
+
/**
* Find the Mime Content Type of a document from its name and its content.
* The policy used to guess the Mime Content Type is:
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesFactory.java?rev=589385&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
Sun Oct 28 10:42:42 2007
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.io.InputStream;
+import java.io.IOException;
+import java.net.URL;
+
+import org.w3c.dom.Document;
+
+
+/**
+ * Creates instances of MimeTypes.
+ *
+ * Every factory method that takes a source builds a new MimeTypes and
+ * lets a MimeTypesReader populate it from that source.
+ */
+public class MimeTypesFactory {
+
+    /**
+     * Creates an empty instance; same as calling new MimeTypes().
+     *
+     * @return an empty instance
+     */
+    public static MimeTypes create() {
+        return new MimeTypes();
+    }
+
+    /**
+     * Creates and returns a MimeTypes instance from the specified document.
+     *
+     * @param document DOM document handed to MimeTypesReader.read
+     * @return a new MimeTypes instance populated by the reader
+     */
+    public static MimeTypes create(Document document) {
+        MimeTypes mimeTypes = new MimeTypes();
+        new MimeTypesReader(mimeTypes).read(document);
+        return mimeTypes;
+    }
+
+    /**
+     * Creates and returns a MimeTypes instance from the specified input
+     * stream. Does not close the input stream.
+     *
+     * @param inputStream stream handed to MimeTypesReader.read
+     * @return a new MimeTypes instance populated by the reader
+     */
+    public static MimeTypes create(InputStream inputStream) {
+        MimeTypes mimeTypes = new MimeTypes();
+        new MimeTypesReader(mimeTypes).read(inputStream);
+        return mimeTypes;
+    }
+
+    /**
+     * Creates and returns a MimeTypes instance from the resource
+     * at the location specified by the URL. Opens and closes the
+     * InputStream from the URL.
+     *
+     * @param url location of the MIME type definitions
+     * @return a new MimeTypes instance populated from the URL's content
+     * @throws IOException if the stream cannot be opened or closed
+     */
+    public static MimeTypes create(URL url) throws IOException {
+        InputStream inputStream = null;
+
+        try {
+            inputStream = url.openStream();
+            return create(inputStream);
+        } finally {
+            if (inputStream != null) {
+                inputStream.close();
+            }
+        }
+    }
+
+    /**
+     * Creates and returns a MimeTypes instance from the specified file path,
+     * as interpreted by the class loader in getResource(). The path is
+     * resolved through MimeTypesReader.class, so a path without a leading
+     * "/" is taken relative to that class's package.
+     *
+     * @param filePath resource path of the MIME type definitions
+     * @return a new MimeTypes instance populated from the resource
+     * @throws IOException if the resource cannot be found or read
+     */
+    public static MimeTypes create(String filePath) throws IOException {
+        URL url = MimeTypesReader.class.getResource(filePath);
+        if (url == null) {
+            // getResource() reports a missing resource by returning null;
+            // fail here with the path instead of a bare NullPointerException
+            // from url.openStream() in create(URL).
+            throw new IOException(
+                    "MIME types resource not found: " + filePath);
+        }
+        return create(url);
+    }
+}
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=589385&r1=589384&r2=589385&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Oct 28 10:42:42
2007
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
- <mimeTypeRepository resource="org/apache/tika/mime/tika-mimetypes.xml"
magic="false"/>
+ <mimeTypeRepository resource="/org/apache/tika/mime/tika-mimetypes.xml"
magic="false"/>
<parsers>
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=589385&r1=589384&r2=589385&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Sun Oct 28 10:42:42 2007
@@ -18,6 +18,11 @@
package org.apache.tika.mime;
// Junit imports
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.io.File;
+import java.io.IOException;
+
import junit.framework.TestCase;
// Tika imports
@@ -32,6 +37,20 @@
private MimeTypes repo;
+ // Sample URL whose path ends in ".pdf" and which carries a query string;
+ // passed to repo.getMimeType(URL) in testGuessMimeTypes.
+ private static URL u;
+
+ static {
+ try {
+ u = new URL("http://mydomain.com/x.pdf?x=y");
+ } catch (MalformedURLException e) {
+ // NOTE(review): fail() here would run during class initialization
+ // rather than inside a test method; presumably unreachable because
+ // the URL literal above is well-formed.
+ fail(e.getMessage());
+ }
+ }
+
+ // Sample file path ending in ".pdf"; passed to repo.getMimeType(File)
+ // in testGuessMimeTypes.
+ private static final File f = new File("/a/b/c/x.pdf");
+
+
+
public TestMimeTypes() {
try {
repo = TikaConfig.getDefaultConfig().getMimeRepository();
@@ -48,5 +67,75 @@
assertEquals(repo.getMimeType("test.PdF"), type);
assertEquals(repo.getMimeType("test.pdF"), type);
}
+
+ /**
+ * Checks that the repository returns a non-null type from forName for
+ * "application/octet-stream" and "text/x-tex".
+ */
+ public void testLoadMimeTypes() {
+ assertNotNull(repo.forName("application/octet-stream"));
+ assertNotNull(repo.forName("text/x-tex"));
+ }
+
+ /**
+ * Tests MIME type determination based solely on the URL's extension.
+ * Exercises getMimeType with a plain name, with the URL fixture u and
+ * with the File fixture f, and expects "application/octet-stream" for
+ * the unknown ".xyz" extension.
+ */
+ public void testGuessMimeTypes() {
+
+ assertEquals("application/pdf", repo.getMimeType("x.pdf").getName());
+ assertEquals("application/pdf", repo.getMimeType(u).getName());
+ assertEquals("application/pdf", repo.getMimeType(f).getName());
+ assertEquals("text/plain", repo.getMimeType("x.txt").getName());
+ assertEquals("text/html", repo.getMimeType("x.htm").getName());
+ assertEquals("text/html", repo.getMimeType("x.html").getName());
+ assertEquals("application/xhtml+xml",
+ repo.getMimeType("x.xhtml").getName());
+ assertEquals("application/xml", repo.getMimeType("x.xml").getName());
+ assertEquals("application/msword",
repo.getMimeType("x.doc").getName());
+ assertEquals("application/vnd.ms-powerpoint",
+ repo.getMimeType("x.ppt").getName());
+ assertEquals("application/vnd.ms-excel",
+ repo.getMimeType("x.xls").getName());
+ assertEquals("application/zip", repo.getMimeType("x.zip").getName());
+ assertEquals("application/vnd.oasis.opendocument.text",
+ repo.getMimeType("x.odt").getName());
+ // Unknown extension: the expected name is application/octet-stream.
+ assertEquals("application/octet-stream",
+ repo.getMimeType("x.xyz").getName());
+ }
+
+ /**
+ * Tests MimeTypes.getType(URL) against the documents under
+ * /test-documents/, going through the private getMimeType(String)
+ * helper of this test class, which resolves each file name to a
+ * classpath URL.
+ */
+ public void testMimeDeterminationForTestDocuments() {
+
+ assertEquals("text/html", getMimeType("testHTML.html"));
+ assertEquals("application/zip", getMimeType("test-documents.zip"));
+ assertEquals("application/vnd.ms-excel", getMimeType("testEXCEL.xls"));
+ assertEquals("text/html", getMimeType("testHTML_utf8.html"));
+ assertEquals("application/vnd.oasis.opendocument.text",
+ getMimeType("testOpenOffice2.odt"));
+ assertEquals("application/pdf", getMimeType("testPDF.pdf"));
+ assertEquals("application/vnd.ms-powerpoint",
+ getMimeType("testPPT.ppt"));
+ assertEquals("application/rtf", getMimeType("testRTF.rtf"));
+ assertEquals("text/plain", getMimeType("testTXT.txt"));
+ assertEquals("application/msword", getMimeType("testWORD.doc"));
+ assertEquals("application/xml", getMimeType("testXML.xml"));
+ }
+
+
+ /**
+  * Resolves a test document on the classpath and returns the MIME type
+  * name that repo.getType(URL) reports for it.
+  *
+  * @param filename name of a file under /test-documents/
+  * @return the MIME type name reported by the repository
+  */
+ private String getMimeType(String filename) {
+
+     URL url = getClass().getResource("/test-documents/" + filename);
+     // getResource() returns null for a missing resource; fail with the
+     // file name instead of a NullPointerException inside getType(URL).
+     assertNotNull("Test document not found: " + filename, url);
+
+     String type = null;
+     try {
+         type = repo.getType(url);
+     } catch (IOException e) {
+         // Also covers MalformedURLException, which is an IOException.
+         fail(e.getMessage());
+     }
+
+     return type;
+ }
+
}