This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4732 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1955ee369631162d2617c49ead1466233465b06a Author: tallison <[email protected]> AuthorDate: Fri May 15 20:54:10 2026 -0400 TIKA-4732 -- small clean ups --- .../core/extractor/FrictionlessUnpackHandler.java | 111 --------------------- .../core/extractor/TempFileUnpackHandler.java | 38 ------- .../apache/tika/pipes/core/server/PipesWorker.java | 34 ++----- 3 files changed, 8 insertions(+), 175 deletions(-) diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java index 705288bcb0..e1b89e9869 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java @@ -61,10 +61,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements private final EmitKey containerEmitKey; private final UnpackConfig unpackConfig; private final List<FrictionlessFileInfo> embeddedFiles = new ArrayList<>(); - private Path originalDocumentPath; - private String originalDocumentName; - private String originalDocumentHash; - private long originalDocumentBytes; private boolean closed = false; /** @@ -153,39 +149,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements return emitKey; } - /** - * Stores the original container document for optional inclusion. 
- * - * @param inputStream the original document input stream - * @param fileName the file name for the original document - * @throws IOException if storing fails - */ - public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException { - this.originalDocumentName = fileName; - this.originalDocumentPath = tempDirectory.resolve(fileName); - - MessageDigest digest; - try { - digest = MessageDigest.getInstance("SHA-256"); - } catch (NoSuchAlgorithmException e) { - throw new IOException("SHA-256 algorithm not available", e); - } - - long bytes = 0; - try (DigestInputStream dis = new DigestInputStream(inputStream, digest); - OutputStream os = Files.newOutputStream(originalDocumentPath)) { - byte[] buffer = new byte[8192]; - int read; - while ((read = dis.read(buffer)) != -1) { - os.write(buffer, 0, read); - bytes += read; - } - } - - this.originalDocumentHash = FrictionlessResource.formatHash(digest.digest()); - this.originalDocumentBytes = bytes; - } - /** * Builds the DataPackage manifest from collected files. * @@ -195,17 +158,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements public DataPackage buildDataPackage(String containerName) { DataPackage dataPackage = new DataPackage(containerName); - // Add original document if included - if (unpackConfig.isIncludeOriginal() && hasOriginalDocument()) { - dataPackage.addResource(FrictionlessResource.create( - originalDocumentName, - detectMediatypeFromFilename(originalDocumentName), - originalDocumentBytes, - originalDocumentHash, - originalDocumentName - )); - } - // Add all embedded files with unpacked/ prefix for (FrictionlessFileInfo fileInfo : embeddedFiles) { String path = UNPACKED_DIR + "/" + fileInfo.fileName(); @@ -222,48 +174,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements return dataPackage; } - /** - * Simple mediatype detection from filename extension. 
- */ - private String detectMediatypeFromFilename(String filename) { - if (filename == null) { - return "application/octet-stream"; - } - String lower = filename.toLowerCase(java.util.Locale.ROOT); - if (lower.endsWith(".pdf")) { - return "application/pdf"; - } else if (lower.endsWith(".xml")) { - return "application/xml"; - } else if (lower.endsWith(".doc")) { - return "application/msword"; - } else if (lower.endsWith(".docx")) { - return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; - } else if (lower.endsWith(".xls")) { - return "application/vnd.ms-excel"; - } else if (lower.endsWith(".xlsx")) { - return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; - } else if (lower.endsWith(".ppt")) { - return "application/vnd.ms-powerpoint"; - } else if (lower.endsWith(".pptx")) { - return "application/vnd.openxmlformats-officedocument.presentationml.presentation"; - } else if (lower.endsWith(".txt")) { - return "text/plain"; - } else if (lower.endsWith(".html") || lower.endsWith(".htm")) { - return "text/html"; - } else if (lower.endsWith(".json")) { - return "application/json"; - } else if (lower.endsWith(".png")) { - return "image/png"; - } else if (lower.endsWith(".jpg") || lower.endsWith(".jpeg")) { - return "image/jpeg"; - } else if (lower.endsWith(".gif")) { - return "image/gif"; - } else if (lower.endsWith(".zip")) { - return "application/zip"; - } - return "application/octet-stream"; - } - /** * Returns the temporary directory where files are stored. */ @@ -292,27 +202,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements return !embeddedFiles.isEmpty(); } - /** - * Returns the path to the original document if stored. - */ - public Path getOriginalDocumentPath() { - return originalDocumentPath; - } - - /** - * Returns the name of the original document if stored. 
- */ - public String getOriginalDocumentName() { - return originalDocumentName; - } - - /** - * Returns true if the original document was stored. - */ - public boolean hasOriginalDocument() { - return originalDocumentPath != null && Files.exists(originalDocumentPath); - } - /** * Returns the UnpackConfig used by this handler. */ diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java index 6f665f27d3..ada8a2daa7 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java @@ -42,8 +42,6 @@ public class TempFileUnpackHandler extends AbstractUnpackHandler private final EmitKey containerEmitKey; private final UnpackConfig unpackConfig; private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>(); - private Path originalDocumentPath; - private String originalDocumentName; private boolean closed = false; /** @@ -112,42 +110,6 @@ public class TempFileUnpackHandler extends AbstractUnpackHandler return !embeddedFiles.isEmpty(); } - /** - * Stores the original container document for inclusion in the zip. - * Call this before parsing if includeOriginal is enabled. - * - * @param inputStream the original document input stream - * @param fileName the file name for the original document - */ - public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException { - this.originalDocumentName = fileName; - this.originalDocumentPath = tempDirectory.resolve("_original_" + fileName); - try (OutputStream os = Files.newOutputStream(originalDocumentPath)) { - inputStream.transferTo(os); - } - } - - /** - * Returns the path to the original document if stored. 
- */ - public Path getOriginalDocumentPath() { - return originalDocumentPath; - } - - /** - * Returns the name of the original document if stored. - */ - public String getOriginalDocumentName() { - return originalDocumentName; - } - - /** - * Returns true if the original document was stored. - */ - public boolean hasOriginalDocument() { - return originalDocumentPath != null && Files.exists(originalDocumentPath); - } - @Override public void close() throws IOException { if (!closed) { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index a96d070b57..911b2940a2 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -320,15 +320,9 @@ class PipesWorker implements Callable<PipesResult> { DataPackage dataPackage = frictionlessHandler.buildDataPackage(containerName); try { - // Emit original document if included - if (unpackConfig.isIncludeOriginal() && frictionlessHandler.hasOriginalDocument()) { - String originalEmitKey = baseEmitKey + "/" + frictionlessHandler.getOriginalDocumentName(); - try (InputStream is = Files.newInputStream(frictionlessHandler.getOriginalDocumentPath())) { - streamEmitter.emit(originalEmitKey, is, new Metadata(), parseContext); - } - } - - // Emit each embedded file under unpacked/ + // Emit each embedded file under unpacked/. + // When includeOriginal=true the container itself is added as id 0 by + // ParseHandler._preParse, so it appears here as one of the embedded entries. 
for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : frictionlessHandler.getEmbeddedFiles()) { String fileEmitKey = baseEmitKey + "/unpacked/" + fileInfo.fileName(); try (InputStream is = Files.newInputStream(fileInfo.filePath())) { @@ -385,15 +379,9 @@ class PipesWorker implements Callable<PipesResult> { zos.closeEntry(); } - // Add original document if included (at root level) - if (unpackConfig.isIncludeOriginal() && frictionlessHandler.hasOriginalDocument()) { - ZipEntry originalEntry = new ZipEntry(frictionlessHandler.getOriginalDocumentName()); - zos.putNextEntry(originalEntry); - Files.copy(frictionlessHandler.getOriginalDocumentPath(), zos); - zos.closeEntry(); - } - - // Add all embedded files under unpacked/ + // Add all embedded files under unpacked/. + // When includeOriginal=true the container itself is added as id 0 by + // ParseHandler._preParse, so it appears here as one of the embedded entries. for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : frictionlessHandler.getEmbeddedFiles()) { ZipEntry fileEntry = new ZipEntry("unpacked/" + fileInfo.fileName()); zos.putNextEntry(fileEntry); @@ -442,14 +430,8 @@ class PipesWorker implements Callable<PipesResult> { private void createZipFile(Path zipFile, TempFileUnpackHandler tempHandler, UnpackConfig unpackConfig) throws IOException { try (ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile))) { - // Include original document if requested - if (unpackConfig.isIncludeOriginal() && tempHandler.hasOriginalDocument()) { - ZipEntry originalEntry = new ZipEntry(tempHandler.getOriginalDocumentName()); - zos.putNextEntry(originalEntry); - Files.copy(tempHandler.getOriginalDocumentPath(), zos); - zos.closeEntry(); - } - + // When includeOriginal=true the container itself is added as id 0 by + // ParseHandler._preParse, so it appears here as one of the embedded entries. 
for (TempFileUnpackHandler.EmbeddedFileInfo fileInfo : tempHandler.getEmbeddedFiles()) { // Add the embedded file ZipEntry fileEntry = new ZipEntry(fileInfo.fileName());
