Author: tallison
Date: Mon Mar 30 13:57:06 2015
New Revision: 1670095

URL: http://svn.apache.org/r1670095
Log:
TIKA-1584: fixed regression in Tika 1.7 that prevents processing of embedded docs with /tika service

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java?rev=1670095&r1=1670094&r2=1670095&view=diff ============================================================================== --- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java (original) +++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java Mon Mar 30 13:57:06 2015 @@ -128,7 +128,8 @@ public class MetadataResource { final ParseContext context = new ParseContext(); AutoDetectParser parser = TikaResource.createParser(tikaConfig); TikaResource.fillMetadata(parser, metadata, context, httpHeaders); - TikaResource.fillParseContext(context, httpHeaders); + //no need to pass parser for embedded document parsing + TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(logger, info, metadata); TikaResource.parse(parser, logger, info.getPath(), is, new DefaultHandler(), metadata, context); return metadata; Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java?rev=1670095&r1=1670094&r2=1670095&view=diff ============================================================================== --- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java (original) +++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java Mon Mar 30 13:57:06 2015 @@ -78,7 +78,8 @@ public class RecursiveMetadataResource { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(type, -1)); TikaResource.fillMetadata(parser, metadata, context, httpHeaders); - TikaResource.fillParseContext(context, httpHeaders); + //no need to add parser to parse recursively + TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(logger, info, metadata); TikaResource.parse(wrapper, logger, info.getPath(), is, new DefaultHandler(), metadata, context); return new MetadataList(wrapper.getMetadata()); Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java?rev=1670095&r1=1670094&r2=1670095&view=diff ============================================================================== --- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java (original) +++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java Mon Mar 30 13:57:06 2015 @@ -138,7 +138,8 @@ public class TikaResource { return httpHeaders.getFirst("File-Name"); } - public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders) { + public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders, + Parser embeddedParser) { TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); PDFParserConfig pdfParserConfig = new PDFParserConfig(); for (String key : httpHeaders.keySet()) { @@ -150,6 +151,9 @@ public class TikaResource { } parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(PDFParserConfig.class, pdfParserConfig); + if (embeddedParser != null) { + 
parseContext.set(Parser.class, embeddedParser); + } } /** @@ -295,7 +299,7 @@ public class TikaResource { final ParseContext context = new ParseContext(); fillMetadata(parser, metadata, context, httpHeaders); - fillParseContext(context, httpHeaders); + fillParseContext(context, httpHeaders, parser); logRequest(logger, info, metadata); @@ -353,7 +357,7 @@ public class TikaResource { final ParseContext context = new ParseContext(); fillMetadata(parser, metadata, context, httpHeaders); - fillParseContext(context, httpHeaders); + fillParseContext(context, httpHeaders, parser); logRequest(logger, info, metadata); Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1670095&r1=1670094&r2=1670095&view=diff ============================================================================== --- tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (original) +++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Mon Mar 30 13:57:06 2015 @@ -37,6 +37,8 @@ public class TikaResourceTest extends CX public static final String TEST_DOC = "test.doc"; public static final String TEST_XLSX = "16637.xlsx"; public static final String TEST_PASSWORD_PROTECTED = "password.xls"; + private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx"; + private static final String TIKA_PATH = "/tika"; private static final int UNPROCESSEABLE = 422; @@ -149,4 +151,23 @@ public class TikaResourceTest extends CX assertTrue(responseMsg.contains("test")); } + @Test + public void testEmbedded() throws Exception { + //first try text + Response response = WebClient.create(endPoint + TIKA_PATH) + .accept("text/plain") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + String responseMsg = getStringFromInputStream((InputStream) response + .getEntity()); + 
assertTrue(responseMsg.contains("Course of human events")); + + //now go for xml -- different call than text + response = WebClient.create(endPoint + TIKA_PATH) + .accept("text/xml") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + responseMsg = getStringFromInputStream((InputStream) response + .getEntity()); + assertTrue(responseMsg.contains("Course of human events")); + } + }