Author: nick
Date: Mon Nov 21 12:55:49 2011
New Revision: 1204476

URL: http://svn.apache.org/viewvc?rev=1204476&view=rev
Log:
TIKA-786 Control the ordering of detectors in DefaultDetector, so that
user-supplied detectors come first, then Tika ones, and finally MimeTypes.
This ensures that more specific detectors get to try first.

Modified:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java 
Mon Nov 21 12:55:49 2011
@@ -17,6 +17,8 @@
 package org.apache.tika.detect;
 
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 
 import javax.imageio.spi.ServiceRegistry;
@@ -27,6 +29,12 @@ import org.apache.tika.mime.MimeTypes;
 /**
  * A composite detector based on all the {@link Detector} implementations
  * available through the {@link ServiceRegistry service provider mechanism}.
+ * <p>
+ * Detectors are loaded and returned in a fixed order: user-supplied
+ *  detectors first, then Tika's own detectors, and finally {@link MimeTypes}
+ *  as a fallback. If you need a different order, you should instead
+ *  construct your own {@link CompositeDetector} and pass in the list
+ *  of Detectors in the required order.
  *
  * @since Apache Tika 0.9
  */
@@ -37,9 +45,35 @@ public class DefaultDetector extends Com
 
     private static List<Detector> getDefaultDetectors(
             MimeTypes types, ServiceLoader loader) {
-        List<Detector> detectors = new ArrayList<Detector>();
+        // Find all the detectors available as services
+        List<Detector> svcDetectors = 
loader.loadServiceProviders(Detector.class);
+        List<Detector> detectors = new 
ArrayList<Detector>(svcDetectors.size()+1);
+        
+        // Sort the list by classname, rather than discovery order 
+        Collections.sort(svcDetectors, new Comparator<Detector>() {
+            public int compare(Detector d1, Detector d2) {
+               return d1.getClass().getName().compareTo(
+                     d2.getClass().getName());
+            }
+        });
+        
+        // Add the non-Tika (user-supplied) detectors first
+        for (Detector d : svcDetectors) {
+           if (! d.getClass().getName().startsWith("org.apache.tika")) {
+              detectors.add(d);
+           }
+        }
+        
+        // Add the Tika detectors next
+        for (Detector d : svcDetectors) {
+           if (d.getClass().getName().startsWith("org.apache.tika")) {
+              detectors.add(d);
+           }
+        }
+        
+        // Finally add the Tika MimeTypes as a fallback
         detectors.add(types);
-        detectors.addAll(loader.loadServiceProviders(Detector.class));
+        
         return detectors;
     }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Mon Nov 21 12:55:49 2011
@@ -77,10 +77,9 @@ public class TestContainerAwareDetector 
         assertTypeByNameAndData("testPPT.ppt", 
"application/vnd.ms-powerpoint");
         
         // With the wrong filename supplied, data will trump filename
-        // TODO Fix this! (TIKA-786)
-//        assertTypeByNameAndData("testEXCEL.xls", "notWord.doc",  
"application/vnd.ms-excel");
-//        assertTypeByNameAndData("testWORD.doc",  "notExcel.xls", 
"application/msword");
-//        assertTypeByNameAndData("testPPT.ppt",   "notWord.doc",  
"application/vnd.ms-powerpoint");
+        assertTypeByNameAndData("testEXCEL.xls", "notWord.doc",  
"application/vnd.ms-excel");
+        assertTypeByNameAndData("testWORD.doc",  "notExcel.xls", 
"application/msword");
+        assertTypeByNameAndData("testPPT.ppt",   "notWord.doc",  
"application/vnd.ms-powerpoint");
         
         // With a filename of a totally different type, data will trump 
filename
         assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf",  
"application/vnd.ms-excel");
@@ -127,10 +126,9 @@ public class TestContainerAwareDetector 
         assertTypeByNameAndData("testPPT.pptx", 
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
         
         // With the wrong filename supplied, data will trump filename
-        // TODO Fix this! (TIKA-786)
-//        assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-//        assertTypeByNameAndData("testWORD.docx",  "notExcel.xlsx", 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-//        assertTypeByNameAndData("testPPT.pptx",   "notWord.docx", 
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
+        assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        assertTypeByNameAndData("testWORD.docx",  "notExcel.xlsx", 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+        assertTypeByNameAndData("testPPT.pptx",   "notWord.docx", 
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
         
         // With an incorrect filename of a different container type, data 
trumps filename
         assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");


Reply via email to