This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch multiple-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 348bfb9be46036833bbfda38c1912c9bf9eeb06e Author: Nick Burch <n...@gagravarr.org> AuthorDate: Tue Mar 13 18:15:14 2018 +0000 More metadata handling between parsers, start on unit testing --- .../parser/multiple/AbstractMultipleParser.java | 19 ++-- .../tika/parser/multiple/MultipleParserTest.java | 111 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index 9781f49..0aded0c 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -175,11 +175,11 @@ public abstract class AbstractMultipleParser extends AbstractParser { */ public void parse( InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) + Metadata originalMetadata, ParseContext context) throws IOException, SAXException, TikaException { // Track the metadata between parsers, so we can apply our policy - Metadata originalMetadata = cloneMetadata(metadata); - Metadata lastMetadata = originalMetadata; + Metadata lastMetadata = cloneMetadata(originalMetadata); + Metadata metadata = lastMetadata; // Start tracking resources, so we can clean up when done TemporaryResources tmp = new TemporaryResources(); @@ -203,7 +203,7 @@ public abstract class AbstractMultipleParser extends AbstractParser { taggedStream.mark(-1); // Record that we used this parser - recordParserDetails(p, metadata); + recordParserDetails(p, originalMetadata); // Prepare an near-empty Metadata, will merge after metadata = cloneMetadata(originalMetadata); @@ -220,6 +220,9 @@ public abstract class AbstractMultipleParser extends AbstractParser { // Notify the implementation how it went boolean tryNext = parserCompleted(p, metadata, handler, failure); + // Handle metadata merging / clashes + metadata = mergeMetadata(metadata, lastMetadata, policy); + // Abort if requested, with the exception if there was one if (!tryNext) { if (failure != null) { @@ -232,9 +235,6 @@ public abstract class AbstractMultipleParser extends AbstractParser { break; } - // Handle metadata merging / clashes - metadata = mergeMetadata(metadata, lastMetadata, policy); - // Prepare for the next parser, if present lastMetadata = cloneMetadata(metadata); taggedStream.reset(); @@ -242,6 +242,11 @@ public abstract class AbstractMultipleParser extends AbstractParser { } finally { tmp.dispose(); } + + // Finally, copy the latest metadata back onto their supplied object + for (String n : metadata.names()) { + originalMetadata.set(n, metadata.get(n)); + } } // TODO Provide a method that takes an InputStreamSource as well, diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java new file mode 100644 index 0000000..b3166eb --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.multiple; + +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayInputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.parser.DummyParser; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ErrorParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; + +public class MultipleParserTest { + /** + * Tests how {@link AbstractMultipleParser} works out which + * mime types to offer, based on the types of the parsers + */ + @Test + public void testMimeTypeSupported() { + // TODO + } + + /** + * Test {@link FallbackParser} + */ + @Test + public void testFallback() throws Exception { + ParseContext context = new ParseContext(); + BodyContentHandler handler; + Metadata metadata; + Parser p; + String[] usedParsers; + + // Some media types + Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM); + Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList( + MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); + + // Some parsers + ErrorParser pFail = new ErrorParser(); + DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(), + "Fell back!"); + EmptyParser pNothing = new EmptyParser(); + + + // With only one parser defined, works as normal + p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent); + + metadata = new Metadata(); + handler = new BodyContentHandler(); + p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); + assertEquals("Fell back!", handler.toString()); + + usedParsers = metadata.getValues("X-Parsed-By"); + assertEquals(1, usedParsers.length); + assertEquals(DummyParser.class.getName(), usedParsers[0]); + + + // With a failing parser, will go to the working one + p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent); + + metadata = new Metadata(); + handler = new BodyContentHandler(); + p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); + assertEquals("Fell back!", handler.toString()); + + usedParsers = metadata.getValues("X-Parsed-By"); + assertEquals(2, usedParsers.length); + assertEquals(DummyParser.class.getName(), usedParsers[0]); + + // TODO Check we got an exception + + + // Won't go past the working one + // TODO + } + + /** + * Test for {@link SupplementingParser} + */ + @Test + public void testSupplemental() throws Exception { + // TODO + } +} -- To stop receiving notification emails like this one, please contact n...@apache.org.