You da man, Nick ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Chris Mattmann, Ph.D. Chief Architect Instrument Software and Science Data Systems Section (398) NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA Office: 168-519, Mailstop: 168-527 Email: [email protected] WWW: http://sunset.usc.edu/~mattmann/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Adjunct Associate Professor, Computer Science Department University of Southern California, Los Angeles, CA 90089 USA ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-----Original Message----- From: "[email protected]" <[email protected]> Reply-To: "[email protected]" <[email protected]> Date: Monday, February 9, 2015 at 8:25 AM To: "[email protected]" <[email protected]> Subject: svn commit: r1658449 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/parser/ParserDecorator.java test/java/org/apache/tika/parser/DummyParser.java test/java/org/apache/tika/parser/ParserDecoratorTest.java >Author: nick >Date: Mon Feb 9 16:25:09 2015 >New Revision: 1658449 > >URL: http://svn.apache.org/r1658449 >Log: >TIKA-1509 Provide a possible "parser with fallback" implementation, with >lots of questions! > >Modified: > >tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator. >java > >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java > >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT >est.java > >Modified: >tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator. >java >URL: >http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache >/tika/parser/ParserDecorator.java?rev=1658449&r1=1658448&r2=1658449&view=d >iff >========================================================================== >==== >--- >tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator. >java (original) >+++ >tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator. 
>java Mon Feb 9 16:25:09 2015 >@@ -18,10 +18,12 @@ package org.apache.tika.parser; > > import java.io.IOException; > import java.io.InputStream; >+import java.util.Collection; > import java.util.HashSet; > import java.util.Set; > > import org.apache.tika.exception.TikaException; >+import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.mime.MediaType; > import org.xml.sax.ContentHandler; >@@ -81,6 +83,50 @@ public class ParserDecorator extends Abs > } > }; > } >+ >+ /** >+ * Decorates the given parsers into a virtual parser, where they'll >+ * be tried in preference order until one works without error. >+ * TODO Is this the right name? >+ * TODO Is this the right place to put this? Should it be in >CompositeParser? Elsewhere? >+ * TODO Should we reset the Metadata if we try another parser? >+ * TODO Should we reset the ContentHandler if we try another parser? >+ * TODO Should we log/report failures anywhere? >+ * @deprecated Do not use until the TODOs are resolved, see TIKA-1509 >+ */ >+ public static final Parser withFallbacks( >+ final Collection<? 
extends Parser> parsers, final >Set<MediaType> types) { >+ Parser parser = EmptyParser.INSTANCE; >+ if (!parsers.isEmpty()) parser = parsers.iterator().next(); >+ >+ return new ParserDecorator(parser) { >+ private static final long serialVersionUID = >1625187131782069683L; >+ @Override >+ public Set<MediaType> getSupportedTypes(ParseContext >context) { >+ return types; >+ } >+ @Override >+ public void parse(InputStream stream, ContentHandler handler, >+ Metadata metadata, ParseContext context) >+ throws IOException, SAXException, TikaException { >+ // Must have a TikaInputStream, so we can re-use it if >parsing fails >+ TikaInputStream tstream = TikaInputStream.get(stream); >+ tstream.getFile(); >+ // Try each parser in turn >+ for (Parser p : parsers) { >+ tstream.mark(-1); >+ try { >+ p.parse(tstream, handler, metadata, context); >+ return; >+ } catch (Exception e) { >+ // TODO How to log / record this failure? >+ } >+ // Prepare for the next parser, if present >+ tstream.reset(); >+ } >+ } >+ }; >+ } > > /** > * The decorated parser instance. 
> >Modified: >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java >URL: >http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache >/tika/parser/DummyParser.java?rev=1658449&r1=1658448&r2=1658449&view=diff >========================================================================== >==== >--- >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java > (original) >+++ >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java > Mon Feb 9 16:25:09 2015 >@@ -25,6 +25,7 @@ import java.util.Map.Entry; > import org.apache.tika.exception.TikaException; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.mime.MediaType; >+import org.apache.tika.sax.XHTMLContentHandler; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > >@@ -54,11 +55,12 @@ public class DummyParser extends Abstrac > metadata.add(m.getKey(), m.getValue()); > } > >- handler.startDocument(); >+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, >metadata); >+ xhtml.startDocument(); > if (xmlText != null) { >- handler.characters(xmlText.toCharArray(), 0, xmlText.length()); >+ xhtml.characters(xmlText.toCharArray(), 0, xmlText.length()); > } >- handler.endDocument(); >+ xhtml.endDocument(); > } > > } > >Modified: >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT >est.java >URL: >http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache >/tika/parser/ParserDecoratorTest.java?rev=1658449&r1=1658448&r2=1658449&vi >ew=diff >========================================================================== >==== >--- >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT >est.java (original) >+++ >tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT >est.java Mon Feb 9 16:25:09 2015 >@@ -18,12 +18,16 @@ package org.apache.tika.parser; > > import static org.junit.Assert.assertEquals; > >+import 
java.io.ByteArrayInputStream; >+import java.util.Arrays; > import java.util.Collections; > import java.util.HashMap; > import java.util.HashSet; > import java.util.Set; > >+import org.apache.tika.metadata.Metadata; > import org.apache.tika.mime.MediaType; >+import org.apache.tika.sax.BodyContentHandler; > import org.junit.Test; > > public class ParserDecoratorTest { >@@ -71,4 +75,46 @@ public class ParserDecoratorTest { > assertEquals(1, types.size()); > assertEquals(types.toString(), true, >types.contains(MediaType.OCTET_STREAM)); > } >+ >+ /** >+ * Testing one proposed implementation for TIKA-1509 >+ */ >+ @Test >+ public void withFallback() throws Exception { >+ Set<MediaType> onlyOct = >Collections.singleton(MediaType.OCTET_STREAM); >+ Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList( >+ MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); >+ >+ ParseContext context = new ParseContext(); >+ BodyContentHandler handler; >+ Metadata metadata; >+ >+ ErrorParser pFail = new ErrorParser(); >+ DummyParser pWork = new DummyParser(onlyOct, new >HashMap<String,String>(), "Fell back!"); >+ EmptyParser pNothing = new EmptyParser(); >+ >+ // Create a combination which will fail first >+ @SuppressWarnings("deprecation") >+ Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, >pWork), octAndText); >+ >+ // Will claim to support the types given, not those on the child >parsers >+ Set<MediaType> types = p.getSupportedTypes(context); >+ assertEquals(2, types.size()); >+ assertEquals(types.toString(), true, >types.contains(MediaType.TEXT_PLAIN)); >+ assertEquals(types.toString(), true, >types.contains(MediaType.OCTET_STREAM)); >+ >+ // Parsing will make it to the second one >+ metadata = new Metadata(); >+ handler = new BodyContentHandler(); >+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), >handler, metadata, context); >+ assertEquals("Fell back!", handler.toString()); >+ >+ >+ // With a parser that will work with no output, will get nothing >+ p = 
ParserDecorator.withFallbacks(Arrays.asList(pNothing, >pWork), octAndText); >+ metadata = new Metadata(); >+ handler = new BodyContentHandler(); >+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), >handler, metadata, context); >+ assertEquals("", handler.toString()); >+ } > } > >
