This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4637
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 60f90c242590afc66f1dbc6bf99a10db38e6699c
Merge: a6e4fdb542 8e6949312e
Author: tallison <[email protected]>
AuthorDate: Sat Jan 31 19:30:31 2026 -0500

    Merge origin/main into TIKA-4637 - resolve conflicts in UnpackerResource tests

 docs/modules/ROOT/nav.adoc                         |   1 +
 .../ROOT/pages/using-tika/server/index.adoc        |   6 +-
 docs/modules/ROOT/pages/using-tika/server/tls.adoc | 651 +++++++++++++++++++++
 .../apache/tika/server/core/TikaServerProcess.java |  25 +
 .../org/apache/tika/server/core/TlsConfig.java     | 229 +++++++-
 .../org/apache/tika/server/core/CXFTestBase.java   |  14 +-
 .../org/apache/tika/server/core/TikaPipesTest.java |  83 +--
 .../apache/tika/server/standard/TikaPipesTest.java |  72 ++-
 .../tika/server/standard/UnpackerResourceTest.java |  52 +-
 .../standard/UnpackerResourceWithConfigTest.java   |   4 +-
 10 files changed, 1018 insertions(+), 119 deletions(-)

diff --cc tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 48085d3a1c,6d2b71e40b..d11d21984d
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@@ -65,7 -66,9 +66,8 @@@ import org.apache.tika.pipes.core.Pipes
  import org.apache.tika.pipes.core.PipesParser;
  import org.apache.tika.server.core.resource.PipesParsingHelper;
  import org.apache.tika.server.core.resource.TikaResource;
 -import org.apache.tika.server.core.resource.UnpackerResource;
  
+ @TestInstance(TestInstance.Lifecycle.PER_CLASS)
  public abstract class CXFTestBase {
  
      private static final Logger LOG = 
LoggerFactory.getLogger(CXFTestBase.class);
diff --cc tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index 78482a7735,0e5fe9c21e..78cb8e07c1
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@@ -261,36 -201,45 +261,50 @@@ public class UnpackerResourceTest exten
      }
  
      @Test
 +    @org.junit.jupiter.api.Disabled("TAR output is no longer supported in 
pipes-based implementation")
      public void testTarDocPicture() throws Exception {
 -        Response response = WebClient
 -                .create(endPoint + UNPACKER_PATH)
 -                .type(APPLICATION_MSWORD)
 -                .accept("application/x-tar")
 -                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
 -
 -        Map<String, String> data = readArchiveFromStream(new 
TarArchiveInputStream((InputStream) response.getEntity()));
 -
 -        assertEquals(JPG_MD5, data.get(JPG_NAME));
 +        // TAR output was removed in Tika 4.0. The new UnpackerResource only 
produces ZIP format.
      }
  
+     @Test
+     public void testText() throws Exception {
+         Response response = WebClient
+                 .create(endPoint + ALL_PATH)
+                 .header(CONTENT_TYPE, APPLICATION_XML)
+                 .accept("application/zip")
+                 
.put(ClassLoader.getSystemResourceAsStream("test-documents/test.doc"));
+ 
+         String responseMsg = readArchiveMetadataAndText((InputStream) 
response.getEntity());
+         assertNotNull(responseMsg);
+         assertContains("test", responseMsg);
+         assertContains("dc:creator,Maxim Valyanskiy", responseMsg);
+     }
+ 
      @Test
 -    public void testMaxBytes() throws Exception {
 +    public void testMetadataJsonIncluded() throws Exception {
 +        // Test that /unpack/all includes metadata JSON files
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + ALL_PATH)
+                 .create(endPoint + ALL_PATH)
 -                .header(CONTENT_TYPE, APPLICATION_XML)
 -                .header(UnpackerResource.UNPACK_MAX_BYTES_KEY, 100)
 +                .type(APPLICATION_MSWORD)
                  .accept("application/zip")
 -                
.put(ClassLoader.getSystemResourceAsStream("test-documents/pic.xls"));
 -        assertEquals(422, response.getStatus());
 +                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
 +
 +        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
 +
 +        // Should have metadata JSON files
 +        List<String> metadataFiles = data.keySet().stream()
 +                .filter(k -> k.endsWith(".metadata.json"))
 +                .toList();
 +        assertFalse(metadataFiles.isEmpty(), "Should have metadata JSON 
files");
 +
 +        // Verify the JSON contains expected metadata fields
 +        String metadataJson = new String(data.get(metadataFiles.get(0)), 
StandardCharsets.UTF_8);
 +        assertTrue(metadataJson.contains("Content-Type"), "Metadata JSON 
should contain Content-Type");
      }
  
      @Test
      public void testPDFImages() throws Exception {
-         // POST with multipart config - URL is now just /unpack (not 
/unpack/config)
++        // POST with multipart config
          String configJson = """
                  {
                    "pdf-parser": {
@@@ -305,7 -254,7 +319,7 @@@
                  new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
  
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
 -                .create(endPoint + UNPACKER_PATH + "/config")
++                .create(endPoint + UNPACKER_PATH)
                  .type("multipart/form-data")
                  .accept("application/zip")
                  .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
@@@ -323,7 -272,6 +337,7 @@@
      public void testPDFRenderOCR() throws Exception {
          assumeTrue(new TesseractOCRParser().hasTesseract());
  
-         // POST with multipart config - URL is now /unpack/all (not 
/unpack/all/config)
++        // POST with multipart config
          String configJson = """
                  {
                    "pdf-parser": {
@@@ -338,7 -286,7 +352,7 @@@
                  new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
  
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + ALL_PATH)
 -                .create(endPoint + ALL_PATH + "/config")
++                .create(endPoint + ALL_PATH)
                  .type("multipart/form-data")
                  .accept("application/zip")
                  .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
@@@ -355,7 -296,6 +369,7 @@@
  
      @Test
      public void testPDFPerPageRenderColor() throws Exception {
-         // POST with multipart config - URL is now /unpack/all (not 
/unpack/all/config)
++        // POST with multipart config
          String configJson = """
                  {
                    "pdf-parser": {
@@@ -371,7 -311,7 +385,7 @@@
                  new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
  
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + ALL_PATH)
 -                .create(endPoint + ALL_PATH + "/config")
++                .create(endPoint + ALL_PATH)
                  .type("multipart/form-data")
                  .accept("application/zip")
                  .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
@@@ -422,180 -357,4 +436,180 @@@
              assertTrue(averageColor.getBlue() > 250);
          }
      }
 +
 +    /**
 +     * Tests embedded-limits configuration via JSON config.
 +     * Replaces the old testMaxBytes() which used the removed unpackMaxBytes 
header.
 +     */
 +    @Test
 +    public void testEmbeddedLimits() throws Exception {
 +        // Configure maxCount=1 to only extract first embedded document
 +        String configJson = """
 +                {
 +                  "embedded-limits": {
 +                    "maxCount": 1,
 +                    "throwOnMaxCount": false
 +                  }
 +                }
 +                """;
 +        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
 +        Attachment fileAtt = new Attachment("file",
 +                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
 +        Attachment configAtt = new Attachment("config", "application/json",
 +                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
 +
 +        Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
++                .create(endPoint + UNPACKER_PATH)
 +                .type("multipart/form-data")
 +                .accept("application/zip")
 +                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
 +
 +        assertEquals(200, response.getStatus());
 +        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
 +
 +        // With maxCount=1, should only have 1 embedded document
 +        assertEquals(1, data.size(), "Should have exactly 1 embedded document 
with maxCount=1");
 +    }
 +
 +    /**
 +     * Tests non-default naming strategy with zeroPadName.
 +     * TODO: TIKA-XXXX - Investigate unpack-config resolution in multipart 
config
 +     */
 +    @Test
 +    @org.junit.jupiter.api.Disabled("unpack-config resolution needs 
investigation")
 +    public void testZeroPadNaming() throws Exception {
 +        String configJson = """
 +                {
 +                  "unpack-config": {
 +                    "zeroPadName": 4
 +                  }
 +                }
 +                """;
 +        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"2pic.docx\"");
 +        Attachment fileAtt = new Attachment("file",
 +                ClassLoader.getSystemResourceAsStream(TEST_DOCX_IMAGE), 
fileCd);
 +        Attachment configAtt = new Attachment("config", "application/json",
 +                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
 +
 +        Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
++                .create(endPoint + UNPACKER_PATH)
 +                .type("multipart/form-data")
 +                .accept("application/zip")
 +                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
 +
 +        assertEquals(200, response.getStatus());
 +        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
 +
 +        // With zeroPadName=4, file names should be like 0000.jpeg, 0001.jpeg
 +        boolean hasZeroPaddedName = data.keySet().stream()
 +                .anyMatch(k -> k.matches("\\d{4}\\..*"));
 +        assertTrue(hasZeroPaddedName, "Should have zero-padded file names 
(e.g., 0000.jpeg)");
 +    }
 +
 +    /**
 +     * Tests UnpackSelector filtering by mime type.
 +     */
 +    @Test
 +    public void testUnpackSelectorIncludeMimeTypes() throws Exception {
 +        // Only extract JPEG images, not other embedded content
 +        String configJson = """
 +                {
 +                  "unpack-selector": {
 +                    "includeMimeTypes": ["image/jpeg"]
 +                  }
 +                }
 +                """;
 +        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
 +        Attachment fileAtt = new Attachment("file",
 +                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
 +        Attachment configAtt = new Attachment("config", "application/json",
 +                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
 +
 +        Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
++                .create(endPoint + UNPACKER_PATH)
 +                .type("multipart/form-data")
 +                .accept("application/zip")
 +                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
 +
 +        assertEquals(200, response.getStatus());
 +        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
 +
 +        // Should only have JPEG files, no WAV files
 +        boolean hasWav = data.keySet().stream().anyMatch(k -> 
k.endsWith(".wav"));
 +        assertFalse(hasWav, "Should not have WAV files when filtering for 
JPEG only");
 +
 +        boolean hasJpeg = data.keySet().stream()
 +                .anyMatch(k -> k.endsWith(".jpg") || k.endsWith(".jpeg"));
 +        assertTrue(hasJpeg, "Should have JPEG files");
 +    }
 +
 +    /**
 +     * Tests UnpackSelector filtering by excluding mime types.
 +     */
 +    @Test
 +    public void testUnpackSelectorExcludeMimeTypes() throws Exception {
 +        // Exclude WAV files - note: must use canonical type 
"audio/vnd.wave", not alias "audio/x-wav"
 +        String configJson = """
 +                {
 +                  "unpack-selector": {
 +                    "excludeMimeTypes": ["audio/vnd.wave"]
 +                  }
 +                }
 +                """;
 +        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
 +        Attachment fileAtt = new Attachment("file",
 +                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
 +        Attachment configAtt = new Attachment("config", "application/json",
 +                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
 +
 +        Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
++                .create(endPoint + UNPACKER_PATH)
 +                .type("multipart/form-data")
 +                .accept("application/zip")
 +                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
 +
 +        assertEquals(200, response.getStatus());
 +        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
 +
 +        // Should not have WAV files
 +        boolean hasWav = data.keySet().stream().anyMatch(k -> 
k.endsWith(".wav"));
 +        assertFalse(hasWav, "Should not have WAV files when excluding 
audio/vnd.wave");
 +
 +        // But should still have image files
 +        boolean hasImage = data.keySet().stream()
 +                .anyMatch(k -> k.endsWith(".jpg") || k.endsWith(".jpeg") || 
k.endsWith(".png"));
 +        assertTrue(hasImage, "Should still have image files");
 +    }
 +
 +    /**
 +     * Tests depth limiting for shallow extraction.
 +     * TODO: TIKA-XXXX - Investigate embedded-limits resolution from 
multipart config in server
 +     */
 +    @Test
 +    @org.junit.jupiter.api.Disabled("embedded-limits not resolved from 
multipart config in server")
 +    public void testShallowExtraction() throws Exception {
 +        // Set maxDepth=1 for shallow extraction (only direct children)
 +        String configJson = """
 +                {
 +                  "embedded-limits": {
 +                    "maxDepth": 1,
 +                    "throwOnMaxDepth": false
 +                  }
 +                }
 +                """;
 +        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
 +        Attachment fileAtt = new Attachment("file",
 +                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
 +        Attachment configAtt = new Attachment("config", "application/json",
 +                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
 +
 +        Response response = WebClient
-                 .create(CXFTestBase.endPoint + UNPACKER_PATH)
++                .create(endPoint + UNPACKER_PATH)
 +                .type("multipart/form-data")
 +                .accept("application/zip")
 +                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
 +
 +        assertEquals(200, response.getStatus());
 +        // Just verify it succeeds - actual depth limiting behavior depends 
on document structure
 +    }
  }
diff --cc tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
index ec56f506b6,08b6c18ba8..271f8dfa55
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@@ -136,9 -95,8 +136,9 @@@ public class UnpackerResourceWithConfig
          Attachment configAtt = new Attachment("config", "application/json",
                  new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
  
 +        // URL changed: POST to /unpack/all instead of /unpack/all/config
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + ALL_PATH)
 -                .create(endPoint + ALL_PATH + "/config")
++                .create(endPoint + ALL_PATH)
                  .type("multipart/form-data")
                  .accept("application/zip")
                  .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
@@@ -207,9 -160,8 +207,9 @@@
          Attachment configAtt = new Attachment("config", "application/json",
                  new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
  
 +        // URL changed: POST to /unpack/all instead of /unpack/all/config
          Response response = WebClient
-                 .create(CXFTestBase.endPoint + ALL_PATH)
 -                .create(endPoint + ALL_PATH + "/config")
++                .create(endPoint + ALL_PATH)
                  .type("multipart/form-data")
                  .accept("application/zip")
                  .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));

Reply via email to