Author: nick Date: Mon Nov 21 12:55:49 2011 New Revision: 1204476 URL: http://svn.apache.org/viewvc?rev=1204476&view=rev Log: TIKA-786 Control the ordering of detectors in DefaultDetector, so that user-supplied detectors come first, then Tika ones, and finally MimeTypes. This ensures that more specific detectors get to try first.
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java Mon Nov 21 12:55:49 2011 @@ -17,6 +17,8 @@ package org.apache.tika.detect; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import javax.imageio.spi.ServiceRegistry; @@ -27,6 +29,12 @@ import org.apache.tika.mime.MimeTypes; /** * A composite detector based on all the {@link Detector} implementations * available through the {@link ServiceRegistry service provider mechanism}. + * + * Detectors are loaded and returned in a specified order, of user supplied + * followed by non-MimeType Tika, followed by the Tika MimeType class. + * If you need to control the order of the Detectors, you should instead + * construct your own {@link CompositeDetector} and pass in the list + * of Detectors in the required order. 
* * @since Apache Tika 0.9 */ @@ -37,9 +45,35 @@ public class DefaultDetector extends Com private static List<Detector> getDefaultDetectors( MimeTypes types, ServiceLoader loader) { - List<Detector> detectors = new ArrayList<Detector>(); + // Find all the detectors available as services + List<Detector> svcDetectors = loader.loadServiceProviders(Detector.class); + List<Detector> detectors = new ArrayList<Detector>(svcDetectors.size()+1); + + // Sort the list by classname, rather than discovery order + Collections.sort(svcDetectors, new Comparator<Detector>() { + public int compare(Detector d1, Detector d2) { + return d1.getClass().getName().compareTo( + d2.getClass().getName()); + } + }); + + // Add the non-Tika (user supplied) detectors First + for (Detector d : svcDetectors) { + if (! d.getClass().getName().startsWith("org.apache.tika")) { + detectors.add(d); + } + } + + // Add the Tika detectors next + for (Detector d : svcDetectors) { + if (d.getClass().getName().startsWith("org.apache.tika")) { + detectors.add(d); + } + } + + // Finally add the Tika MimeTypes as a fallback detectors.add(types); - detectors.addAll(loader.loadServiceProviders(Detector.class)); + return detectors; } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Nov 21 12:55:49 2011 @@ -77,10 +77,9 @@ public class TestContainerAwareDetector assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint"); // With the wrong filename supplied, data will trump filename - // TODO Fix 
this! (TIKA-786) -// assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"); -// assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword"); -// assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"); + assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"); + assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword"); + assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"); // With a filename of a totally different type, data will trump filename assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel"); @@ -127,10 +126,9 @@ public class TestContainerAwareDetector assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); // With the wrong filename supplied, data will trump filename - // TODO Fix this! (TIKA-786) -// assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); -// assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); -// assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); // With an incorrect filename of a different container type, data trumps filename assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");