This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5a1ee00e64ec812574ba7be8e48f637e01fa018c
Author: tallison <[email protected]>
AuthorDate: Mon May 4 21:21:44 2020 -0400

    TIKA-3094 add ignored unit test that runs the bundle against all of the test files.
---
 tika-bundle/pom.xml                                |  3 +-
 .../test/java/org/apache/tika/bundle/BundleIT.java | 57 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 07ef3bd..5a35e32 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -178,7 +178,6 @@
  xmlbeans|
  jackcess|
  jackcess-encrypt|
- commons-lang|
  commons-lang3|
  tagsoup|
  asm|
@@ -192,6 +191,7 @@
  boilerpipe|
  rome|
  rome-utils|
+ jdom2|
  sentiment-analysis-parser|
  opennlp-tools|
  geoapi|
@@ -372,6 +372,7 @@
  org.jaxen.dom4j;resolution:=optional,
  org.jaxen.pattern;resolution:=optional,
  org.jaxen.saxpath;resolution:=optional,
+ org.jaxen.util;resolution:=optional,
  org.jdom;resolution:=optional,
  org.jdom.input;resolution:=optional,
  org.jdom.output;resolution:=optional,
diff --git a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
index 54f10ae..12804ca 100644
--- a/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
+++ b/tika-bundle/src/test/java/org/apache/tika/bundle/BundleIT.java
@@ -45,6 +45,8 @@ import javax.inject.Inject;
 import org.apache.tika.Tika;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -57,6 +59,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.internal.Activator;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.ops4j.pax.exam.Configuration;
@@ -68,6 +71,7 @@ import org.osgi.framework.Bundle;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.ServiceReference;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 @RunWith(PaxExam.class)
 @ExamReactorStrategy(PerMethod.class)
@@ -302,4 +306,57 @@ public class BundleIT {
         String content = handler.toString();
         assertTrue(content.contains("Attachment Test"));
     }
+
+    @Test
+    @Ignore
+    public void testAll() throws Exception {
+        Tika tika = new Tika();
+
+        // Package extraction
+        ContentHandler handler = new BodyContentHandler();
+
+        Parser parser = tika.getParser();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        Metadata metadata = new Metadata();
+        Set<String> needToFix = new HashSet<>();
+        needToFix.add("testAccess2_encrypted.accdb");
+
+        Set<String> unknownProblem = new HashSet<>();
+        //these all trigger org.apache.tika.metadata.PropertyTypeException
+        //which for some reason we can't catch (?!)
+        //We don't see problems with these files in tika-parsers?!
+/*      unknownProblem.add("testPPT_embedded_two_slides.pptx");
+        unknownProblem.add("testWORD_multi_authors.docx");
+        unknownProblem.add("testEXCEL_embeded.xlsx");
+        unknownProblem.add("testVORBIS.ogg");
+        unknownProblem.add("testWORD_2006ml.docx");
+        unknownProblem.add("testRTFEmbeddedLink.rtf");*/
+        System.out.println(getTestDir());
+        for (File f : getTestDir().listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            if (needToFix.contains(f.getName()) || unknownProblem.contains(f.getName())) {
+                continue;
+            }
+            System.out.println("about to parse "+f);
+            try (InputStream is = TikaInputStream.get(f)) {
+                parser.parse(is, handler, metadata, context);
+            } catch (EncryptedDocumentException e) {
+                //swallow
+            } catch (SAXException e) {
+                //
+            } catch (TikaException e) {
+                System.err.println("tika Exception "+f.getName());
+                e.printStackTrace();
+            }
+        }
+    }
+
+    private File getTestDir() {
+        return new File("../tika-parsers/src/test/resources/test-documents");
+    }
+
+
 }
