This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4732 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 899a34247475e0f238ef40ed22f521b2b7a734e3 Author: Lawrence Moorehead <[email protected]> AuthorDate: Fri May 15 12:24:05 2026 -0400 Use supplied filename as RESOURCE_NAME_KEY during unpack --- .../apache/tika/pipes/core/server/PipesWorker.java | 10 ++++ .../tika/server/standard/UnpackerResourceTest.java | 59 ++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index fb7553fee0..a96d070b57 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -37,6 +37,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.extractor.UnpackHandler; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -503,6 +504,15 @@ class PipesWorker implements Callable<PipesResult> { } // Use newMetadata() to apply any configured write limits Metadata metadata = localContext.newMetadata(); + // Carry the caller-supplied resource name across the fresh-metadata boundary so + // detection, suffix selection, and the Frictionless manifest's name field see + // the logical filename rather than whatever the fetcher's path happens to be + // (e.g., a server-side spool prefix). TikaInputStream.get(path, metadata) + // already honors a pre-set RESOURCE_NAME_KEY. + String suppliedName = fetchEmitTuple.getMetadata().get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (!StringUtils.isBlank(suppliedName)) { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, suppliedName); + } FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata, localContext); if (tisOrResult.pipesResult() != null) { return new ParseDataOrPipesResult(null, tisOrResult.pipesResult()); diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java index ae200959b0..b8c62b17ae 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java @@ -556,6 +556,65 @@ public class UnpackerResourceTest extends CXFTestBase { ", only-in-archive: " + difference(archiveDataFiles, manifestPaths)); } + /** + * The Frictionless manifest's "name" field is supposed to carry the + * original filename of each resource. For the container (unpacked/0.<ext>), + * that name should be the filename the user supplied on the multipart + * upload -- not the server's internal spool filename. + */ + @Test + public void testFrictionlessContainerManifestNameMatchesUploadFilename() throws Exception { + String configJson = """ + { + "parse-context": { + "unpack-config": { + "outputFormat": "FRICTIONLESS", + "outputMode": "ZIPPED" + } + } + } + """; + String uploadFilename = "Doc1_ole.doc"; + ContentDisposition fileCd = new ContentDisposition( + "form-data; name=\"file\"; filename=\"" + uploadFilename + "\""); + Attachment fileAtt = new Attachment("file", + ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV), fileCd); + Attachment configAtt = new Attachment("config", "application/json", + new ByteArrayInputStream(configJson.getBytes(StandardCharsets.UTF_8))); + + Response response = WebClient + .create(endPoint + ALL_PATH) + .type("multipart/form-data") + .accept("application/zip") + .post(new MultipartBody(Arrays.asList(fileAtt, configAtt))); + + assertEquals(200, response.getStatus()); + Map<String, byte[]> data = readZipArchiveBytes((InputStream) response.getEntity()); + + byte[] dpBytes = data.get("datapackage.json"); + assertNotNull(dpBytes, "datapackage.json should be present"); + JsonNode dataPackage = MAPPER.readTree(dpBytes); + + JsonNode containerResource = null; + for (JsonNode resource : dataPackage.get("resources")) { + String path = resource.get("path").asText(); + if (path.equals("unpacked/0") || path.startsWith("unpacked/0.")) { + containerResource = resource; + break; + } + } + assertNotNull(containerResource, + "Manifest should list the container at unpacked/0. Resources: " + + dataPackage.get("resources")); + + JsonNode nameNode = containerResource.get("name"); + assertNotNull(nameNode, + "Container resource should carry a 'name' field. Resource: " + containerResource); + assertEquals(uploadFilename, nameNode.asText(), + "Container's manifest name should be the user-supplied upload filename, " + + "not the server's internal spool filename. Resource: " + containerResource); + } + private static Set<String> difference(Set<String> a, Set<String> b) { Set<String> diff = new HashSet<>(a); diff.removeAll(b);
