This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4637 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 60f90c242590afc66f1dbc6bf99a10db38e6699c Merge: a6e4fdb542 8e6949312e Author: tallison <[email protected]> AuthorDate: Sat Jan 31 19:30:31 2026 -0500 Merge origin/main into TIKA-4637 - resolve conflicts in UnpackerResource tests docs/modules/ROOT/nav.adoc | 1 + .../ROOT/pages/using-tika/server/index.adoc | 6 +- docs/modules/ROOT/pages/using-tika/server/tls.adoc | 651 +++++++++++++++++++++ .../apache/tika/server/core/TikaServerProcess.java | 25 + .../org/apache/tika/server/core/TlsConfig.java | 229 +++++++- .../org/apache/tika/server/core/CXFTestBase.java | 14 +- .../org/apache/tika/server/core/TikaPipesTest.java | 83 +-- .../apache/tika/server/standard/TikaPipesTest.java | 72 ++- .../tika/server/standard/UnpackerResourceTest.java | 52 +- .../standard/UnpackerResourceWithConfigTest.java | 4 +- 10 files changed, 1018 insertions(+), 119 deletions(-) diff --cc tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index 48085d3a1c,6d2b71e40b..d11d21984d --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@@ -65,7 -66,9 +66,8 @@@ import org.apache.tika.pipes.core.Pipes import org.apache.tika.pipes.core.PipesParser; import org.apache.tika.server.core.resource.PipesParsingHelper; import org.apache.tika.server.core.resource.TikaResource; -import org.apache.tika.server.core.resource.UnpackerResource; + @TestInstance(TestInstance.Lifecycle.PER_CLASS) public abstract class CXFTestBase { private static final Logger LOG = LoggerFactory.getLogger(CXFTestBase.class); diff --cc tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java index 78482a7735,0e5fe9c21e..78cb8e07c1 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java @@@ -261,36 -201,45 +261,50 @@@ public class UnpackerResourceTest exten } @Test + @org.junit.jupiter.api.Disabled("TAR output is no longer supported in pipes-based implementation") public void testTarDocPicture() throws Exception { - Response response = WebClient - .create(endPoint + UNPACKER_PATH) - .type(APPLICATION_MSWORD) - .accept("application/x-tar") - .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV)); - - Map<String, String> data = readArchiveFromStream(new TarArchiveInputStream((InputStream) response.getEntity())); - - assertEquals(JPG_MD5, data.get(JPG_NAME)); + // TAR output was removed in Tika 4.0. The new UnpackerResource only produces ZIP format. } + @Test + public void testText() throws Exception { + Response response = WebClient + .create(endPoint + ALL_PATH) + .header(CONTENT_TYPE, APPLICATION_XML) + .accept("application/zip") + .put(ClassLoader.getSystemResourceAsStream("test-documents/test.doc")); + + String responseMsg = readArchiveMetadataAndText((InputStream) response.getEntity()); + assertNotNull(responseMsg); + assertContains("test", responseMsg); + assertContains("dc:creator,Maxim Valyanskiy", responseMsg); + } + @Test - public void testMaxBytes() throws Exception { + public void testMetadataJsonIncluded() throws Exception { + // Test that /unpack/all includes metadata JSON files Response response = WebClient - .create(CXFTestBase.endPoint + ALL_PATH) + .create(endPoint + ALL_PATH) - .header(CONTENT_TYPE, APPLICATION_XML) - .header(UnpackerResource.UNPACK_MAX_BYTES_KEY, 100) + .type(APPLICATION_MSWORD) .accept("application/zip") - .put(ClassLoader.getSystemResourceAsStream("test-documents/pic.xls")); - assertEquals(422, response.getStatus()); + .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV)); + + Map<String, byte[]> data = readZipArchiveBytes((InputStream) response.getEntity()); + + // Should have metadata JSON files + List<String> metadataFiles = data.keySet().stream() + .filter(k -> k.endsWith(".metadata.json")) + .toList(); + assertFalse(metadataFiles.isEmpty(), "Should have metadata JSON files"); + + // Verify the JSON contains expected metadata fields + String metadataJson = new String(data.get(metadataFiles.get(0)), StandardCharsets.UTF_8); + assertTrue(metadataJson.contains("Content-Type"), "Metadata JSON should contain Content-Type"); } @Test public void testPDFImages() throws Exception { - // POST with multipart config - URL is now just /unpack (not /unpack/config) ++ // POST with multipart config String configJson = """ { "pdf-parser": { @@@ -305,7 -254,7 +319,7 @@@ new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) - .create(endPoint + UNPACKER_PATH + "/config") ++ .create(endPoint + UNPACKER_PATH) .type("multipart/form-data") .accept("application/zip") .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); @@@ -323,7 -272,6 +337,7 @@@ public void testPDFRenderOCR() throws Exception { assumeTrue(new TesseractOCRParser().hasTesseract()); - // POST with multipart config - URL is now /unpack/all (not /unpack/all/config) ++ // POST with multipart config String configJson = """ { "pdf-parser": { @@@ -338,7 -286,7 +352,7 @@@ new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); Response response = WebClient - .create(CXFTestBase.endPoint + ALL_PATH) - .create(endPoint + ALL_PATH + "/config") ++ .create(endPoint + ALL_PATH) .type("multipart/form-data") .accept("application/zip") .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); @@@ -355,7 -296,6 +369,7 @@@ @Test public void testPDFPerPageRenderColor() throws Exception { - // POST with multipart config - URL is now /unpack/all (not /unpack/all/config) ++ // POST with multipart config String configJson = """ { "pdf-parser": { @@@ -371,7 -311,7 +385,7 @@@ new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); Response response = WebClient - .create(CXFTestBase.endPoint + ALL_PATH) - .create(endPoint + ALL_PATH + "/config") ++ .create(endPoint + ALL_PATH) .type("multipart/form-data") .accept("application/zip") .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); @@@ -422,180 -357,4 +436,180 @@@ assertTrue(averageColor.getBlue() > 250); } } + + /** + * Tests embedded-limits configuration via JSON config. + * Replaces the old testMaxBytes() which used the removed unpackMaxBytes header. + */ + @Test + public void testEmbeddedLimits() throws Exception { + // Configure maxCount=1 to only extract first embedded document + String configJson = """ + { + "embedded-limits": { + "maxCount": 1, + "throwOnMaxCount": false + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) ++ .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + // With maxCount=1, should only have 1 embedded document + assertEquals(1, data.size(), "Should have exactly 1 embedded document with maxCount=1"); + } + + /** + * Tests non-default naming strategy with zeroPadName. + * TODO: TIKA-XXXX - Investigate unpack-config resolution in multipart config + */ + @Test + @org.junit.jupiter.api.Disabled("unpack-config resolution needs investigation") + public void testZeroPadNaming() throws Exception { + String configJson = """ + { + "unpack-config": { + "zeroPadName": 4 + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"2pic.docx\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOCX_IMAGE), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) ++ .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + // With zeroPadName=4, file names should be like 0000.jpeg, 0001.jpeg + boolean hasZeroPaddedName = data.keySet().stream() + .anyMatch(k -> k.matches("\\d{4}\\..*")); + assertTrue(hasZeroPaddedName, "Should have zero-padded file names (e.g., 0000.jpeg)"); + } + + /** + * Tests UnpackSelector filtering by mime type. + */ + @Test + public void testUnpackSelectorIncludeMimeTypes() throws Exception { + // Only extract JPEG images, not other embedded content + String configJson = """ + { + "unpack-selector": { + "includeMimeTypes": ["image/jpeg"] + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) ++ .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + // Should only have JPEG files, no WAV files + boolean hasWav = data.keySet().stream().anyMatch(k -> k.endsWith(".wav")); + assertFalse(hasWav, "Should not have WAV files when filtering for JPEG only"); + + boolean hasJpeg = data.keySet().stream() + .anyMatch(k -> k.endsWith(".jpg") || k.endsWith(".jpeg")); + assertTrue(hasJpeg, "Should have JPEG files"); + } + + /** + * Tests UnpackSelector filtering by excluding mime types. + */ + @Test + public void testUnpackSelectorExcludeMimeTypes() throws Exception { + // Exclude WAV files - note: must use canonical type "audio/vnd.wave", not alias "audio/x-wav" + String configJson = """ + { + "unpack-selector": { + "excludeMimeTypes": ["audio/vnd.wave"] + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) ++ .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, String> data = readZipArchive((InputStream) response.getEntity()); + + // Should not have WAV files + boolean hasWav = data.keySet().stream().anyMatch(k -> k.endsWith(".wav")); + assertFalse(hasWav, "Should not have WAV files when excluding audio/vnd.wave"); + + // But should still have image files + boolean hasImage = data.keySet().stream() + .anyMatch(k -> k.endsWith(".jpg") || k.endsWith(".jpeg") || k.endsWith(".png")); + assertTrue(hasImage, "Should still have image files"); + } + + /** + * Tests depth limiting for shallow extraction. + * TODO: TIKA-XXXX - Investigate embedded-limits resolution from multipart config in server + */ + @Test + @org.junit.jupiter.api.Disabled("embedded-limits not resolved from multipart config in server") + public void testShallowExtraction() throws Exception { + // Set maxDepth=1 for shallow extraction (only direct children) + String configJson = """ + { + "embedded-limits": { + "maxDepth": 1, + "throwOnMaxDepth": false + } + } + """; + ContentDisposition fileCd = new ContentDisposition("form-data; name=\"file\"; filename=\"Doc1_ole.doc\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient - .create(CXFTestBase.endPoint + UNPACKER_PATH) ++ .create(endPoint + UNPACKER_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + // Just verify it succeeds - actual depth limiting behavior depends on document structure + } } diff --cc tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java index ec56f506b6,08b6c18ba8..271f8dfa55 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java @@@ -136,9 -95,8 +136,9 @@@ public class UnpackerResourceWithConfig Attachment configAtt = new Attachment("config", "application/json", new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + // URL changed: POST to /unpack/all instead of /unpack/all/config Response response = WebClient - .create(CXFTestBase.endPoint + ALL_PATH) - .create(endPoint + ALL_PATH + "/config") ++ .create(endPoint + ALL_PATH) .type("multipart/form-data") .accept("application/zip") .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); @@@ -207,9 -160,8 +207,9 @@@ Attachment configAtt = new Attachment("config", "application/json", new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + // URL changed: POST to /unpack/all instead of /unpack/all/config Response response = WebClient - .create(CXFTestBase.endPoint + ALL_PATH) - .create(endPoint + ALL_PATH + "/config") ++ .create(endPoint + ALL_PATH) .type("multipart/form-data") .accept("application/zip") .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
