Author: nick
Date: Thu Jul 29 16:59:14 2010
New Revision: 980508

URL: http://svn.apache.org/viewvc?rev=980508&view=rev
Log:
Make mime type detection a little bit more stable (TIKA-391)
Make the Magic comparison also order by media type, and pass the real media 
type to each MagicDetector (instead of always text/plain) to help debugging 
and sorting. Also add tests to show that we can detect the same file multiple 
times, and get the same answer each time.

Added:
    
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
    tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.xls   
(with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=980508&r1=980507&r2=980508&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java 
Thu Jul 29 16:59:14 2010
@@ -69,6 +69,8 @@ public class MagicDetector implements De
      * starts at this offset.
      */
     private final int offsetRangeEnd;
+    
+    private final String asString;
 
     /**
      * Creates a detector for input documents that have the exact given byte
@@ -134,6 +136,13 @@ public class MagicDetector implements De
 
         this.offsetRangeBegin = offsetRangeBegin;
         this.offsetRangeEnd = offsetRangeEnd;
+        
+        // Build the string representation. Needs to be unique, as
+        //  these get compared. Compute now as may get compared a lot!
+        this.asString = "Magic Detection for " + type.toString() +
+          " looking for " + pattern.length + 
+          " bytes = " + this.pattern + 
+          " mask = " + this.mask;
     }
 
     /**
@@ -196,4 +205,12 @@ public class MagicDetector implements De
         }
     }
 
+    /**
+     * Returns a string representation of the Detection Rule.
+     * Should sort nicely by type and details, as we sometimes
+     *  compare these.
+     */
+    public String toString() {
+       return asString;
+    }
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java?rev=980508&r1=980507&r2=980508&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java Thu Jul 
29 16:59:14 2010
@@ -70,6 +70,9 @@ class Magic implements Clause, Comparabl
             diff = o.size() - size();
         }
         if (diff == 0) {
+            diff = o.type.compareTo(type);
+        }
+        if (diff == 0) {
             diff = o.toString().compareTo(toString());
         }
         return diff;

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=980508&r1=980507&r2=980508&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java 
Thu Jul 29 16:59:14 2010
@@ -183,7 +183,7 @@ final class MimeTypesReader implements M
             priority = Integer.parseInt(value);
         }
 
-        for (Clause clause : readMatches(element)) {
+        for (Clause clause : readMatches(element, mimeType.getType())) {
             Magic magic = new Magic(mimeType);
             magic.setPriority(priority);
             magic.setClause(clause);
@@ -191,7 +191,7 @@ final class MimeTypesReader implements M
         }
     }
 
-    private List<Clause> readMatches(Element element) throws MimeTypeException 
{
+    private List<Clause> readMatches(Element element, MediaType mediaType) 
throws MimeTypeException {
         List<Clause> clauses = new ArrayList<Clause>();
         NodeList nodes = element.getChildNodes();
         for (int i = 0; i < nodes.getLength(); i++) {
@@ -199,7 +199,7 @@ final class MimeTypesReader implements M
             if (node.getNodeType() == Node.ELEMENT_NODE) {
                 Element nodeElement = (Element) node;
                 if (nodeElement.getTagName().equals(MATCH_TAG)) {
-                    clauses.add(readMatch(nodeElement));
+                    clauses.add(readMatch(nodeElement, mediaType));
                 }
             }
         }
@@ -207,7 +207,7 @@ final class MimeTypesReader implements M
     }
 
     /** Read Element named match. */
-    private Clause readMatch(Element element) throws MimeTypeException {
+    private Clause readMatch(Element element, MediaType mediaType) throws 
MimeTypeException {
         String type = "string";
         int start = 0;
         int end = 0;
@@ -252,10 +252,10 @@ final class MimeTypesReader implements M
         }
 
         MagicDetector detector = new MagicDetector(
-                MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
+                mediaType, patternBytes, maskBytes, start, end);
         Clause clause = new MagicMatch(detector, length);
 
-        List<Clause> subClauses = readMatches(element);
+        List<Clause> subClauses = readMatches(element, mediaType);
         if (subClauses.size() == 0) {
             return clause;
         } else if (subClauses.size() == 1) {

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=980508&r1=980507&r2=980508&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java 
Thu Jul 29 16:59:14 2010
@@ -138,5 +138,16 @@ public class MimeDetectionTest extends T
             in.close();
         }        
     }
+    
+    /**
+     * Tests that when we repeatedly test the detection of a document
+     *  that can be detected with Mime Magic, that we consistently
+     *  detect it correctly. See TIKA-391 for more details.
+     */
+    public void testMimeMagicStability() throws IOException {
+       for(int i=0; i<100; i++) {
+          testFile("application/vnd.ms-excel", "test.xls");
+       }
+    }
 
 }

Added: 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java?rev=980508&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
 (added)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
 Thu Jul 29 16:59:14 2010
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.SortedSet;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * These tests try to ensure that the MimeTypesReader
+ *  has correctly processed the mime-types.xml file.
+ * To do this, it tests that various aspects of the
+ *  mime-types.xml file have ended up correctly as
+ *  globs, matches, magics etc.
+ *  
+ * If you make updates to mime-types.xml, then the
+ *  checks in this test may no longer hold true.
+ * As such, if tests here start failing after your
+ *  changes, please review the test details, and
+ *  update it to match the new state of the file! 
+ */
+public class MimeTypesReaderTest extends TestCase {
+
+    private MimeTypes mimeTypes;
+    private SortedSet<Magic> magics;
+    private SortedSet<MimeType> xmls;
+
+    @Override
+    @SuppressWarnings("unchecked")
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
+        
+        Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
+        magicsField.setAccessible(true);
+        magics = (SortedSet<Magic>)magicsField.get(mimeTypes);
+        
+        Field xmlsField = mimeTypes.getClass().getDeclaredField("xmls");
+        xmlsField.setAccessible(true);
+        xmls = (SortedSet<MimeType>)xmlsField.get(mimeTypes);
+    }
+    
+    public void testHtmlMatches() throws Exception {
+       int minMatches = 10;
+       
+       // Check on the type
+       MimeType html = mimeTypes.forName("text/html");
+       assertTrue(html.hasMagic());
+       assertTrue(
+             "There should be at least "+minMatches+" HTML matches, found " + 
html.getMagics().length,
+             html.getMagics().length >= minMatches
+       );
+       
+       
+       // Check on the overall magics
+       List<Magic> htmlMagics = new ArrayList<Magic>();
+       for(Magic magic : magics) {
+          if(magic.getType().toString().equals("text/html")) {
+             htmlMagics.add(magic);
+          }
+       }
+       
+       assertTrue(
+             "There should be at least "+minMatches+" HTML matches, found " + 
htmlMagics.size(),
+             htmlMagics.size() >= minMatches
+       );
+    }
+    
+    public void testExcelMatches() throws Exception {
+       int minMatches = 4;
+       
+       // Check on the type
+       MimeType excel = mimeTypes.forName("application/vnd.ms-excel");
+       assertTrue(excel.hasMagic());
+       assertTrue(
+             "There should be at least "+minMatches+" Excel matches, found " + 
excel.getMagics().length,
+             excel.getMagics().length >= minMatches
+       );
+       
+       
+       // Check on the overall magics
+       List<Magic> excelMagics = new ArrayList<Magic>();
+       for(Magic magic : magics) {
+          if(magic.getType().toString().equals("application/vnd.ms-excel")) {
+             excelMagics.add(magic);
+             System.out.println(magic);
+          }
+       }
+       
+       assertTrue(
+             "There should be at least "+minMatches+" Excel matches, found " + 
excelMagics.size(),
+             excelMagics.size() >= minMatches
+       );
+    }
+}

Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.xls
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.xls?rev=980508&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to