Author: mattmann
Date: Fri Nov 13 22:35:27 2009
New Revision: 836035

URL: http://svn.apache.org/viewvc?rev=836035&view=rev
Log:
- fix for TIKA-309: Mime type application/rdf+xml not correctly detected

Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Nov 13 22:35:27 2009
@@ -5,6 +5,8 @@
 ------------------------
 
 The most notable changes in Tika 0.5 over the previous release are:
+ * Improved RDF/OWL mime detection using both MIME magic as well as
+   pattern matching (TIKA-309)
 
  * An org.apache.tika.Tika facade class has been added to simplify common
    text extraction and type detection use cases. (TIKA-269)

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java 
(original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java 
Fri Nov 13 22:35:27 2009
@@ -255,7 +255,7 @@
         RootXML xml = null;
         String content = new String(data);
         for (int i = 0; i < rootXML.size(); i++) {
-            xml = rootXML.get(i);
+            xml = rootXML.get(i);            
             if (xml.matches(content)) {
                 return true;
             }
@@ -340,7 +340,7 @@
             }
             String regex = null;
             if (isEmpty(namespaceURI)) {
-                regex = ".*<" + localName + "[^<>]*>.*";
+                regex = ".*<" + localName + "[^<>]*.*";
             } else if (isEmpty(localName)) {
                 regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
                         + "[\"\']?[^<>]*>.*";

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 Fri Nov 13 22:35:27 2009
@@ -366,12 +366,17 @@
   <mime-type type="application/qsig"/>
 
   <mime-type type="application/rdf+xml">
+    <root-XML localName="rdf:RDF"/>
+    <root-XML localName="RDF"
+              namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+    <sub-class-of type="application/xml"/>
     <acronym>RDF/XML</acronym>
     <comment>XML syntax for RDF graphs</comment>
     <glob pattern="*.owl"/>
     <glob pattern="*.rdf"/>
-    <root-XML localName="RDF"
-              namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+    <glob pattern="*.owl"/>
+    <glob pattern="^rdf$" isregex="true"/>
+    <glob pattern="^owl$" isregex="true"/>
   </mime-type>
 
   <mime-type type="application/reginfo+xml">
@@ -3548,7 +3553,6 @@
       <match value="&lt;body" type="string" offset="0"/>
       <match value="&lt;TITLE" type="string" offset="0"/>
       <match value="&lt;title" type="string" offset="0"/>
-      <match value="&lt;!--" type="string" offset="0"/>
       <match value="&lt;h1" type="string" offset="0"/>
       <match value="&lt;H1" type="string" offset="0"/>
       <match value="&lt;!doctype HTML" type="string" offset="0"/>

Modified: 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
 Fri Nov 13 22:35:27 2009
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 
 import junit.framework.TestCase;
 
@@ -47,6 +48,8 @@
         testFile("application/xml", "test-utf16be.xml");
         testFile("application/xml", "test-long-comment.xml");
         testFile("application/xslt+xml", "stylesheet.xsl");
+        testUrl("application/rdf+xml", new URL("http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl"));
+        testUrl("application/rdf+xml", new URL("http://www.w3.org/2002/07/owl#"));
     }
     
     public void testAutosetSupertype() throws MimeTypeException {
@@ -57,25 +60,34 @@
        type = types.forName("text/something");
        assertEquals("text/plain", type.getSuperType().getName());
     }
+    
+    private void testUrl(String expected, URL url) throws IOException{
+        InputStream in = url.openStream();
+        testStream(expected, url.toString(), in);        
+    }
 
     private void testFile(String expected, String filename) throws IOException {
         InputStream in = getClass().getResourceAsStream(filename);
-        assertNotNull("Test file not found: " + filename, in);
+        testStream(expected, filename, in);
+    }
+    
+    private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException{
+        assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
         if (!in.markSupported()) {
             in = new java.io.BufferedInputStream(in);
         }
         try {
             Metadata metadata = new Metadata();
             String mime = this.mimeTypes.detect(in, metadata).toString();
-            assertEquals(filename + " is not properly detected.", expected, mime);
+            assertEquals(urlOrFileName + " is not properly detected.", expected, mime);
 
             //Add resource name and test again
-            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
             mime = this.mimeTypes.detect(in, metadata).toString();
-            assertEquals(filename + " is not properly detected.", expected, mime);
+            assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
         } finally {
             in.close();
-        }
+        }        
     }
 
 }


Reply via email to