This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4055 in repository https://gitbox.apache.org/repos/asf/tika.git
commit a61e785a81078687d3593b6e00c89f1a9b0e07c4 Author: tballison <talli...@apache.org> AuthorDate: Fri May 26 12:09:52 2023 -0400 TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta --- CHANGES.txt | 6 +++ .../apache/tika/parser/RecursiveParserWrapper.java | 2 + .../tika/parser/RecursiveParserWrapperTest.java | 61 +++++++++++++++++++--- .../src/test/resources/log4j.properties | 2 +- .../core/resource/RecursiveMetadataResource.java | 4 +- .../standard/RecursiveMetadataResourceTest.java | 14 ++--- 6 files changed, 73 insertions(+), 16 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b3ac0be3b..5526b5f86 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,9 @@ +Release 2.8.1 - ??? + + * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055). + + * Add mime detection for many files (TIKA-3992). + Release 2.8.0 - 5/11/2023 * Enable counting and/or parsing of incremental updates in PDFs. This diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 483181b0a..e8f029770 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -372,6 +372,7 @@ public class RecursiveParserWrapper extends ParserDecorator { } int availableLength = Math.min(totalWriteLimit - totalChars, length); super.characters(ch, start, availableLength); + totalChars += availableLength; if (availableLength < length) { handleWriteLimitReached(); } @@ -389,6 +390,7 @@ public class RecursiveParserWrapper extends ParserDecorator { } int availableLength = Math.min(totalWriteLimit - totalChars, length); super.ignorableWhitespace(ch, start, availableLength); + totalChars += availableLength; if (availableLength < length) { handleWriteLimitReached(); } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 24800926a..61eeab14d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -93,14 +93,15 @@ public class RecursiveParserWrapperTest extends TikaTest { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70)); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + 70)); try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) { wrapper.parse(stream, handler, metadata, context); } List<Metadata> list = handler.getMetadataList(); - assertEquals(5, list.size()); + assertEquals(2, list.size()); int wlr = 0; for (Metadata m : list) { @@ -112,15 +113,31 @@ public class RecursiveParserWrapperTest extends TikaTest { assertEquals(2, wlr); } + @Test + public void testOne() throws Exception { + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + int writeLimit = 100; + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + writeLimit, false, context)); + try (InputStream stream = getResourceAsStream( + "/test-documents/test_recursive_embedded" + ".docx")) { + wrapper.parse(stream, handler, metadata, context); + } + 
List<Metadata> list = handler.getMetadataList(); + assertEquals(12, list.size()); + } @Test public void testCharLimitNoThrowOnWriteLimit() throws Exception { ParseContext context = new ParseContext(); Metadata metadata = new Metadata(); - + int writeLimit = 500; RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500, - false, context)); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + writeLimit, false, context)); try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" + ".docx")) { wrapper.parse(stream, handler, metadata, context); @@ -131,11 +148,41 @@ public class RecursiveParserWrapperTest extends TikaTest { assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); - assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); - assertNotContained("unalienable Rights", + assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); + assertNotContained("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testSpecificLimit() throws Exception { + int writeLimit = 60; + + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + writeLimit, false, context)); + try (InputStream stream = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) { + wrapper.parse(stream, handler, metadata, context); + } + List<Metadata> list = handler.getMetadataList(); + assertTrue(writeLimit >= getContentLength(list), + "writeLimit=" + 
writeLimit + " contentLength=" + getContentLength(list)); + } + + private int getContentLength(List<Metadata> metadataList) { + int sz = 0; + for (Metadata metadata : metadataList) { + String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); + if (content != null) { + sz += content.length(); + } + } + return sz; + } + @Test public void testMaxEmbedded() throws Exception { int maxEmbedded = 4; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties index 8c106427a..bd6faa7a8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties @@ -15,7 +15,7 @@ # limitations under the License. #info,debug, error,fatal ... -log4j.rootLogger=info,stdout +log4j.rootLogger=error,stdout #console log4j.appender.stdout=org.apache.log4j.ConsoleAppender diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index 76e24b926..ac4837110 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -69,9 +69,11 @@ public class RecursiveMetadataResource { fillMetadata(parser, metadata, httpHeaders); fillParseContext(httpHeaders, metadata, context); TikaResource.logRequest(LOG, "/rmeta", metadata); + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(type, 
handlerConfig.getWriteLimit()), + new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), + handlerConfig.isThrowOnWriteLimitReached(), context), handlerConfig.getMaxEmbeddedResources(), TikaResource.getConfig().getMetadataFilter()); try { diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java index 3de5c0e65..691554edb 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java @@ -345,8 +345,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { assertEquals(1, metadataList.size()); assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); - //now try with a write limit of 200 - writeLimit = 200; + //now try with a write limit of 550 + writeLimit = 550; response = WebClient.create(endPoint + META_PATH).accept("application/json") .header("writeLimit", Integer.toString(writeLimit)) .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); @@ -390,11 +390,11 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { // Check results Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); - assertEquals(10, metadataList.size()); + assertEquals(12, metadataList.size()); assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); - //now try with a write limit of 200 - writeLimit = 200; + //now try with a write limit of 550 + writeLimit = 550; response = WebClient.create(endPoint + META_PATH).accept("application/json") .header("writeLimit", Integer.toString(writeLimit)) .header("throwOnWriteLimitReached", "false") 
@@ -404,8 +404,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { // Check results reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); - assertEquals(10, metadataList.size()); - assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); + assertEquals(12, metadataList.size()); + assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED)); assertContains("When in the Course of human events it becomes necessary for one people", metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT)); TikaTest.assertNotContained("We hold these truths",