This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4732
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c0de3596e2250c4b756f0d1aab83cbfb451dbe55
Author: Lawrence Moorehead <[email protected]>
AuthorDate: Fri May 15 11:47:08 2026 -0400

    Removing duplicate file from unpack/all endpoint.
---
 .../apache/tika/pipes/core/server/PipesWorker.java |  76 -------
 .../tika/pipes/core/FrictionlessUnpackTest.java    |  66 +++---
 .../tika/server/standard/UnpackerResourceTest.java | 248 +++++++++++++++++++++
 3 files changed, 284 insertions(+), 106 deletions(-)

diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index a76defc641..fb7553fee0 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -489,72 +489,6 @@ class PipesWorker implements Callable<PipesResult> {
         mapper.writeValue(os, metadataMap);
     }
 
-    /**
-     * Stores the original document to the temp handler for inclusion in the 
zip.
-     * Uses TikaInputStream's internal file caching to avoid consuming the 
stream.
-     */
-    private void storeOriginalDocument(TikaInputStream tis, 
TempFileUnpackHandler tempHandler)
-            throws IOException {
-        String fileName = getFileNameFromFetchKey();
-
-        // TikaInputStream caches to a temp file internally - get that file
-        Path originalPath = tis.getPath();
-        if (originalPath != null && Files.exists(originalPath)) {
-            // Copy from the cached file
-            try (InputStream is = Files.newInputStream(originalPath)) {
-                tempHandler.storeOriginalDocument(is, fileName);
-            }
-        } else {
-            // Stream hasn't been cached yet - we need to read and reset
-            tis.mark(Integer.MAX_VALUE);
-            try {
-                tempHandler.storeOriginalDocument(tis, fileName);
-            } finally {
-                tis.reset();
-            }
-        }
-    }
-
-    /**
-     * Stores the original document to the frictionless handler for inclusion 
in output.
-     * Uses TikaInputStream's internal file caching to avoid consuming the 
stream.
-     */
-    private void storeOriginalDocumentForFrictionless(TikaInputStream tis,
-                                                      
FrictionlessUnpackHandler frictionlessHandler)
-            throws IOException {
-        String fileName = getFileNameFromFetchKey();
-
-        // TikaInputStream caches to a temp file internally - get that file
-        Path originalPath = tis.getPath();
-        if (originalPath != null && Files.exists(originalPath)) {
-            // Copy from the cached file
-            try (InputStream is = Files.newInputStream(originalPath)) {
-                frictionlessHandler.storeOriginalDocument(is, fileName);
-            }
-        } else {
-            // Stream hasn't been cached yet - we need to read and reset
-            tis.mark(Integer.MAX_VALUE);
-            try {
-                frictionlessHandler.storeOriginalDocument(tis, fileName);
-            } finally {
-                tis.reset();
-            }
-        }
-    }
-
-    /**
-     * Extracts the file name from the fetch key.
-     */
-    private String getFileNameFromFetchKey() {
-        String fetchKey = fetchEmitTuple.getFetchKey().getFetchKey();
-        String fileName = fetchKey;
-        int lastSlash = Math.max(fetchKey.lastIndexOf('/'), 
fetchKey.lastIndexOf('\\'));
-        if (lastSlash >= 0 && lastSlash < fetchKey.length() - 1) {
-            fileName = fetchKey.substring(lastSlash + 1);
-        }
-        return fileName;
-    }
-
     protected ParseDataOrPipesResult parseFromTuple() throws TikaException, 
InterruptedException {
         //start a new metadata object to gather info from the fetch process
         //we want to isolate and not touch the metadata sent into the 
fetchEmitTuple
@@ -575,16 +509,6 @@ class PipesWorker implements Callable<PipesResult> {
         }
 
         try (TikaInputStream tis = tisOrResult.tis()) {
-            // Store original document for zipping/frictionless if requested
-            UnpackHandler handler = localContext.get(UnpackHandler.class);
-            UnpackConfig uc = localContext.get(UnpackConfig.class);
-            if (uc != null && uc.isIncludeOriginal()) {
-                if (handler instanceof FrictionlessUnpackHandler 
frictionlessHandler) {
-                    storeOriginalDocumentForFrictionless(tis, 
frictionlessHandler);
-                } else if (handler instanceof TempFileUnpackHandler 
tempHandler) {
-                    storeOriginalDocument(tis, tempHandler);
-                }
-            }
             return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, 
localContext);
         } catch (SecurityException e) {
             LOG.error("security exception id={}", fetchEmitTuple.getId(), e);
diff --git 
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
 
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
index deaf5e3561..14a05aad70 100644
--- 
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
+++ 
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/FrictionlessUnpackTest.java
@@ -635,28 +635,30 @@ public class FrictionlessUnpackTest {
 
     @Test
     public void testFrictionlessWithIncludeOriginal(@TempDir Path tmp) throws 
Exception {
-        // Test that includeOriginal works with Frictionless format
+        // includeOriginal=true causes the container to appear in the 
Frictionless
+        // package as "unpacked/0.<ext>" (added by ParseHandler._preParse via
+        // unpackHandler.add(0, ...)) and to be listed once in 
datapackage.json.
         Path outputDir = tmp.resolve("output");
         Files.createDirectories(outputDir);
 
         try (PipesClient pipesClient = init(tmp, TEST_DOC_WITH_EMBEDDED)) {
             ParseContext parseContext = new ParseContext();
             parseContext.set(ParseMode.class, ParseMode.UNPACK);
-            
+
             UnpackConfig unpackConfig = new UnpackConfig();
             unpackConfig.setEmitter(EMITTER_NAME);
             
unpackConfig.setOutputFormat(UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS);
             unpackConfig.setOutputMode(UnpackConfig.OUTPUT_MODE.ZIPPED);
-            unpackConfig.setIncludeOriginal(true);  // Include container 
document
+            unpackConfig.setIncludeOriginal(true);
             parseContext.set(UnpackConfig.class, unpackConfig);
-            
+
             PipesResult pipesResult = pipesClient.process(
                     new FetchEmitTuple(TEST_DOC_WITH_EMBEDDED,
                             new FetchKey(FETCHER_NAME, TEST_DOC_WITH_EMBEDDED),
                             new EmitKey(EMITTER_NAME, TEST_DOC_WITH_EMBEDDED),
                             new Metadata(), parseContext,
                             FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
-            
+
             assertTrue(pipesResult.isSuccess(),
                     "Frictionless with includeOriginal should succeed");
         }
@@ -664,39 +666,43 @@ public class FrictionlessUnpackTest {
         List<Path> zipFiles = Files.list(outputDir)
                 .filter(p -> p.toString().endsWith("-frictionless.zip"))
                 .toList();
+        assertEquals(1, zipFiles.size(), "Should create exactly one 
frictionless zip");
 
         try (ZipFile zip = new ZipFile(zipFiles.get(0).toFile())) {
-            // Original file should be at root level or in a specific location
-            boolean hasOriginal = false;
+            // The container itself should be present as the id-0 entry under
+            // unpacked/. Match "unpacked/0" exactly or "unpacked/0.<ext>"
+            // (whichever the active SUFFIX_STRATEGY produces).
+            Set<String> allEntries = new HashSet<>();
             Enumeration<? extends ZipEntry> entries = zip.entries();
             while (entries.hasMoreElements()) {
-                ZipEntry entry = entries.nextElement();
-                // Original could be at root or documented location
-                if (entry.getName().contains(TEST_DOC_WITH_EMBEDDED) ||
-                        entry.getName().equals("original/" + 
TEST_DOC_WITH_EMBEDDED)) {
-                    hasOriginal = true;
-                    break;
-                }
+                allEntries.add(entries.nextElement().getName());
             }
-
-            // Also check datapackage.json for original in resources
+            boolean hasContainerAsId0 = allEntries.stream()
+                    .anyMatch(n -> n.equals("unpacked/0") || 
n.startsWith("unpacked/0."));
+            assertTrue(hasContainerAsId0,
+                    "With includeOriginal=true, the container should appear as 
the " +
+                            "unpacked/0 entry. Entries: " + allEntries);
+
+            // And the manifest's resources should list the container at 
unpacked/0;
+            // no resource path should escape the unpacked/ prefix (no separate
+            // root-level "original" entry should exist).
             ZipEntry dpEntry = zip.getEntry("datapackage.json");
-            if (dpEntry != null) {
-                JsonNode dataPackage;
-                try (InputStream is = zip.getInputStream(dpEntry)) {
-                    dataPackage = OBJECT_MAPPER.readTree(is);
-                }
-                for (JsonNode resource : dataPackage.get("resources")) {
-                    String path = resource.get("path").asText();
-                    if (!path.startsWith("unpacked/")) {
-                        hasOriginal = true;
-                        break;
-                    }
+            assertNotNull(dpEntry, "datapackage.json should be present");
+            JsonNode dataPackage;
+            try (InputStream is = zip.getInputStream(dpEntry)) {
+                dataPackage = OBJECT_MAPPER.readTree(is);
+            }
+            boolean manifestListsContainer = false;
+            for (JsonNode resource : dataPackage.get("resources")) {
+                String path = resource.get("path").asText();
+                assertTrue(path.startsWith("unpacked/"),
+                        "Manifest resources should only list unpacked/ paths; 
got " + path);
+                if (path.equals("unpacked/0") || 
path.startsWith("unpacked/0.")) {
+                    manifestListsContainer = true;
                 }
             }
-
-            assertTrue(hasOriginal,
-                    "With includeOriginal=true, original document should be in 
package");
+            assertTrue(manifestListsContainer,
+                    "Manifest should list the container at unpacked/0");
         }
     }
 
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index 21443d5795..ae200959b0 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -33,8 +33,10 @@ import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import javax.imageio.ImageIO;
 
 import com.fasterxml.jackson.databind.JsonNode;
@@ -494,6 +496,252 @@ public class UnpackerResourceTest extends CXFTestBase {
         assertTrue(hasZeroPaddedName, "Should have zero-padded file names 
(e.g., 0000.jpeg)");
     }
 
+    /**
+     * The datapackage.json "resources" array is the manifest for the 
Frictionless
+     * package. Verifies it lists exactly the data files present in the zip, 
with
+     * no missing or extraneous entries. The package envelope (datapackage.json
+     * and the optional metadata.json) is not itself a "resource" and is 
excluded
+     * from the comparison.
+     */
+    @Test
+    public void testFrictionlessDataPackageMatchesArchiveContents() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED",
+                      "includeFullMetadata": true,
+                      "includeOriginal": true
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
+
+        byte[] dpBytes = data.get("datapackage.json");
+        assertNotNull(dpBytes, "datapackage.json should be present");
+
+        JsonNode dataPackage = MAPPER.readTree(dpBytes);
+        JsonNode resources = dataPackage.get("resources");
+        assertNotNull(resources, "datapackage.json should have a 'resources' 
array");
+        assertTrue(resources.isArray() && resources.size() > 0,
+                "resources array should be non-empty");
+
+        Set<String> manifestPaths = new HashSet<>();
+        for (JsonNode resource : resources) {
+            manifestPaths.add(resource.get("path").asText());
+        }
+
+        Set<String> archiveDataFiles = new HashSet<>(data.keySet());
+        archiveDataFiles.remove("datapackage.json");
+        archiveDataFiles.remove("metadata.json");
+
+        assertEquals(archiveDataFiles, manifestPaths,
+                "datapackage.json 'resources' must list exactly the data files 
in the zip. " +
+                        "Only-in-manifest: " + difference(manifestPaths, 
archiveDataFiles) +
+                        ", only-in-archive: " + difference(archiveDataFiles, 
manifestPaths));
+    }
+
+    private static Set<String> difference(Set<String> a, Set<String> b) {
+        Set<String> diff = new HashSet<>(a);
+        diff.removeAll(b);
+        return diff;
+    }
+
+    /**
+     * /unpack/all forces unpack-config.includeOriginal=true. In REGULAR mode,
+     * the container must appear exactly once in the archive — at 
"0.&lt;ext&gt;"
+     * at the zip root — with no additional copy elsewhere.
+     */
+    @Test
+    public void testRegularAllContainerAppearsOnce() throws Exception {
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
+
+        long containerEntries = data.keySet().stream()
+                .filter(k -> !k.endsWith(".metadata.json"))
+                .filter(k -> k.equals("0") || k.startsWith("0."))
+                .count();
+        assertEquals(1, containerEntries,
+                "Container should appear exactly once at the zip root as 
0.<ext>. " +
+                        "Entries: " + data.keySet());
+    }
+
+    /**
+     * Documents the shape of /unpack output in FRICTIONLESS mode (no /all,
+     * default config): a datapackage.json manifest plus the 
unpacked/&lt;id&gt;
+     * children, and no metadata.json envelope.
+     */
+    @Test
+    public void testFrictionlessUnpackShape() throws Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED"
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + UNPACKER_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+
+        assertEquals(200, response.getStatus());
+        Map<String, String> data = readZipArchive((InputStream) 
response.getEntity());
+
+        assertTrue(data.containsKey("datapackage.json"),
+                "Should contain datapackage.json manifest. Entries: " + 
data.keySet());
+        assertFalse(data.containsKey("metadata.json"),
+                "Should not contain metadata.json without includeFullMetadata. 
Entries: " + data.keySet());
+        boolean hasUnpacked = data.keySet().stream().anyMatch(k -> 
k.startsWith("unpacked/"));
+        assertTrue(hasUnpacked, "Should contain unpacked/ entries. Entries: " 
+ data.keySet());
+    }
+
+    /**
+     * Documents the difference between /unpack and /unpack/all in 
FRICTIONLESS:
+     * /unpack/all forces unpack-config.includeOriginal=true, which causes the
+     * container itself to be added to the unpack output as id 0
+     * ("unpacked/0.&lt;ext&gt;"). /unpack with no extra config does not.
+     */
+    @Test
+    public void testFrictionlessUnpackAllAddsContainerAsUnpackedZero() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED"
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment unpackFile = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment unpackConfig = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response unpackResponse = WebClient
+                .create(endPoint + UNPACKER_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(unpackFile, 
unpackConfig)));
+        assertEquals(200, unpackResponse.getStatus());
+        Set<String> unpackEntries = new HashSet<>(readZipArchive(
+                (InputStream) unpackResponse.getEntity()).keySet());
+
+        Attachment allFile = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment allConfig = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+        Response allResponse = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(allFile, allConfig)));
+        assertEquals(200, allResponse.getStatus());
+        Set<String> allEntries = new HashSet<>(readZipArchive(
+                (InputStream) allResponse.getEntity()).keySet());
+
+        boolean unpackHasContainer = unpackEntries.stream().anyMatch(k -> 
k.equals("unpacked/0.doc"));
+        boolean allHasContainer = allEntries.stream().anyMatch(k -> 
k.equals("unpacked/0.doc"));
+        assertFalse(unpackHasContainer,
+                "/unpack alone should not include the container. Entries: " + 
unpackEntries);
+        assertTrue(allHasContainer,
+                "/unpack/all should include the container as unpacked/0.<ext>. 
" +
+                        "Entries: " + allEntries);
+
+        Set<String> onlyInAll = difference(allEntries, unpackEntries);
+        assertEquals(Set.of("unpacked/0.doc"), onlyInAll,
+                "Only the container (unpacked/0.<ext>) should distinguish 
/unpack/all " +
+                        "from /unpack. Difference: " + onlyInAll);
+    }
+
+    /**
+     * Documents that includeFullMetadata=true in FRICTIONLESS adds a
+     * metadata.json envelope whose entries carry X-TIKA:content (the extracted
+     * text) and Content-Type alongside the per-document metadata.
+     */
+    @Test
+    public void testFrictionlessIncludeFullMetadataAddsMetadataJson() throws 
Exception {
+        String configJson = """
+                {
+                  "parse-context": {
+                    "unpack-config": {
+                      "outputFormat": "FRICTIONLESS",
+                      "outputMode": "ZIPPED",
+                      "includeFullMetadata": true
+                    }
+                  }
+                }
+                """;
+        ContentDisposition fileCd = new ContentDisposition("form-data; 
name=\"file\"; filename=\"Doc1_ole.doc\"");
+        Attachment fileAtt = new Attachment("file",
+                ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd);
+        Attachment configAtt = new Attachment("config", "application/json",
+                new 
ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8)));
+
+        Response response = WebClient
+                .create(endPoint + ALL_PATH)
+                .type("multipart/form-data")
+                .accept("application/zip")
+                .post(new MultipartBody(Arrays.asList(fileAtt, configAtt)));
+        assertEquals(200, response.getStatus());
+
+        Map<String, byte[]> data = readZipArchiveBytes((InputStream) 
response.getEntity());
+        byte[] metadataBytes = data.get("metadata.json");
+        assertNotNull(metadataBytes, "metadata.json should be present when 
includeFullMetadata=true. " +
+                "Entries: " + data.keySet());
+
+        JsonNode metadata = MAPPER.readTree(metadataBytes);
+        assertTrue(metadata.isArray() && metadata.size() > 0,
+                "metadata.json should be a non-empty array");
+
+        JsonNode container = metadata.get(0);
+        assertTrue(container.has("Content-Type"),
+                "Container metadata entry should carry Content-Type. Entry: " 
+ container);
+        assertEquals("application/msword", 
container.get("Content-Type").asText(),
+                "Container metadata entry should describe the submitted .doc");
+        assertTrue(container.has("X-TIKA:content"),
+                "Container metadata entry should carry X-TIKA:content 
(extracted text). Entry: " + container);
+        assertTrue(container.get("X-TIKA:content").asText().length() > 0,
+                "X-TIKA:content for the container should be non-empty");
+    }
+
     /**
      * Tests UnpackSelector filtering by mime type.
      */

Reply via email to