Author: nick
Date: Sun Aug 16 18:00:57 2015
New Revision: 1696159
URL: http://svn.apache.org/r1696159
Log:
Outlook detection with custom config tests, based on work by Justin Palmer
TIKA-1708
Added:
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
tika/trunk/tika-example/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
Sun Aug 16 18:00:57 2015
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertNot
import java.net.URL;
+import org.apache.tika.TikaTest;
import org.apache.tika.parser.ParseContext;
import org.junit.After;
@@ -29,7 +30,7 @@ import org.junit.After;
* that {@link TikaConfigTest} can't, do due to a need for the
* full set of "real" classes of parsers / detectors
*/
-public abstract class AbstractTikaConfigTest {
+public abstract class AbstractTikaConfigTest extends TikaTest {
protected static ParseContext context = new ParseContext();
protected static String getConfigPath(String config) throws Exception {
Modified: tika/trunk/tika-example/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1696159&r1=1696158&r2=1696159&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Sun Aug 16 18:00:57 2015
@@ -64,6 +64,13 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
Sun Aug 16 18:00:57 2015
@@ -46,14 +46,13 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
- * @author Tran Nam Quang
- * @author hong-thai.nguyen
+ * Parser for MS Outlook PST email storage files
*/
public class OutlookPSTParser extends AbstractParser {
private static final long serialVersionUID = 620998217748364063L;
- private static final MediaType MS_OUTLOOK_PST_MIMETYPE =
MediaType.application("vnd.ms-outlook-pst");
+ public static final MediaType MS_OUTLOOK_PST_MIMETYPE =
MediaType.application("vnd.ms-outlook-pst");
private static final Set<MediaType> SUPPORTED_TYPES =
singleton(MS_OUTLOOK_PST_MIMETYPE);
private static AttributesImpl createAttribute(String attName, String
attValue) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java?rev=1696159&r1=1696158&r2=1696159&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
Sun Aug 16 18:00:57 2015
@@ -25,8 +25,12 @@ import org.apache.tika.detect.CompositeD
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EmptyDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.mbox.OutlookPSTParser;
import org.apache.tika.parser.microsoft.POIFSContainerDetector;
import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.junit.Ignore;
import org.junit.Test;
/**
@@ -52,33 +56,83 @@ public class TikaDetectorConfigTest exte
// Get the DefaultDetector from the config
- DefaultDetector confDetecotor =
(DefaultDetector)detector.getDetectors().get(0);
+ DefaultDetector confDetector =
(DefaultDetector)detector.getDetectors().get(0);
// Get a fresh "default" DefaultParser
DefaultDetector normDetector = new
DefaultDetector(config.getMimeRepository());
// The default one will offer the Zip and POIFS detectors
+ assertDetectors(normDetector, true, true);
+
+
+ // The one from the config won't, as we excluded those
+ assertDetectors(confDetector, false, false);
+ }
+
+    /**
+     * TIKA-1708 - If the Zip detector is disabled, either explicitly,
+     *  or via giving a list of detectors that it isn't part of, ensure
+     *  that detection of PST files still works
+     */
+    @Test
+    @Ignore // Currently broken as per bug report
+    public void testPSTDetectionWithoutZipDetector() throws Exception {
+        // Check the one with an exclude
+        TikaConfig config = getConfig("TIKA-1708-detector-default.xml");
+        assertNotNull(config.getParser());
+        assertNotNull(config.getDetector());
+        CompositeDetector detectorWX = (CompositeDetector)config.getDetector();
+
+        // Check it has the POIFS one, but not the zip one
+        assertDetectors(detectorWX, true, false);
+
+        // Check the one with an explicit list
+        config = getConfig("TIKA-1708-detector-composite.xml");
+        assertNotNull(config.getParser());
+        assertNotNull(config.getDetector());
+        CompositeDetector detectorCL = (CompositeDetector)config.getDetector();
+        assertEquals(2, detectorCL.getDetectors().size());
+
+        // Check it also has the POIFS one, but not the zip one
+        assertDetectors(detectorCL, true, false);
+
+        // Now check they detect PST files correctly
+        TikaInputStream stream = TikaInputStream.get(
+                getResourceAsFile("/test-documents/testPST.pst"));
+        // Compare MediaType with MediaType: the String form of the expected
+        //  type can never be equal to the object returned by detect()
+        try {
+            assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
+                    detectorWX.detect(stream, new Metadata()));
+            assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
+                    detectorCL.detect(stream, new Metadata()));
+        } finally {
+            stream.close();
+        }
+    }
+
+ private void assertDetectors(CompositeDetector detector, boolean
shouldHavePOIFS,
+ boolean shouldHaveZip) {
boolean hasZip = false;
boolean hasPOIFS = false;
- for (Detector d : normDetector.getDetectors()) {
+ for (Detector d : detector.getDetectors()) {
if (d instanceof ZipContainerDetector) {
- hasZip = true;
+ if (shouldHaveZip) {
+ hasZip = true;
+ } else {
+ fail("Shouldn't have the ZipContainerDetector from
config");
+ }
}
if (d instanceof POIFSContainerDetector) {
- hasPOIFS = true;
+ if (shouldHavePOIFS) {
+ hasPOIFS = true;
+ } else {
+ fail("Shouldn't have the POIFSContainerDetector from
config");
+ }
}
}
- assertTrue(hasZip);
- assertTrue(hasPOIFS);
-
-
- // The one from the config won't, as we excluded those
- for (Detector d : confDetecotor.getDetectors()) {
- if (d instanceof ZipContainerDetector)
- fail("Shouldn't have the ZipContainerDetector from config");
- if (d instanceof POIFSContainerDetector)
- fail("Shouldn't have the POIFSContainerDetector from config");
- }
+ if (shouldHavePOIFS) assertTrue("Should have the
POIFSContainerDetector", hasPOIFS);
+ if (shouldHaveZip) assertTrue("Should have the
ZipContainerDetector", hasZip);
}
}
Added:
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml?rev=1696159&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
(added)
+++
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
Sun Aug 16 18:00:57 2015
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers/>
+ <detectors>
+ <detector class="org.apache.tika.parser.microsoft.POIFSContainerDetector"/>
+ <detector class="org.apache.tika.mime.MimeTypes"/>
+ </detectors>
+ <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+</properties>
Added:
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml?rev=1696159&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml
(added)
+++
tika/trunk/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1708-detector-default.xml
Sun Aug 16 18:00:57 2015
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers/>
+ <detectors>
+ <detector class="org.apache.tika.detect.DefaultDetector">
+ <detector-exclude
class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+ </detector>
+ </detectors>
+ <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+</properties>