This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4732
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1955ee369631162d2617c49ead1466233465b06a
Author: tallison <[email protected]>
AuthorDate: Fri May 15 20:54:10 2026 -0400

    TIKA-4732 -- small clean ups
---
 .../core/extractor/FrictionlessUnpackHandler.java  | 111 ---------------------
 .../core/extractor/TempFileUnpackHandler.java      |  38 -------
 .../apache/tika/pipes/core/server/PipesWorker.java |  34 ++-----
 3 files changed, 8 insertions(+), 175 deletions(-)

diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
index 705288bcb0..e1b89e9869 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/FrictionlessUnpackHandler.java
@@ -61,10 +61,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements
     private final EmitKey containerEmitKey;
     private final UnpackConfig unpackConfig;
     private final List<FrictionlessFileInfo> embeddedFiles = new ArrayList<>();
-    private Path originalDocumentPath;
-    private String originalDocumentName;
-    private String originalDocumentHash;
-    private long originalDocumentBytes;
     private boolean closed = false;
 
     /**
@@ -153,39 +149,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements
         return emitKey;
     }
 
-    /**
-     * Stores the original container document for optional inclusion.
-     *
-     * @param inputStream the original document input stream
-     * @param fileName    the file name for the original document
-     * @throws IOException if storing fails
-     */
-    public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException {
-        this.originalDocumentName = fileName;
-        this.originalDocumentPath = tempDirectory.resolve(fileName);
-
-        MessageDigest digest;
-        try {
-            digest = MessageDigest.getInstance("SHA-256");
-        } catch (NoSuchAlgorithmException e) {
-            throw new IOException("SHA-256 algorithm not available", e);
-        }
-
-        long bytes = 0;
-        try (DigestInputStream dis = new DigestInputStream(inputStream, digest);
-             OutputStream os = Files.newOutputStream(originalDocumentPath)) {
-            byte[] buffer = new byte[8192];
-            int read;
-            while ((read = dis.read(buffer)) != -1) {
-                os.write(buffer, 0, read);
-                bytes += read;
-            }
-        }
-
-        this.originalDocumentHash = FrictionlessResource.formatHash(digest.digest());
-        this.originalDocumentBytes = bytes;
-    }
-
     /**
      * Builds the DataPackage manifest from collected files.
      *
@@ -195,17 +158,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements
     public DataPackage buildDataPackage(String containerName) {
         DataPackage dataPackage = new DataPackage(containerName);
 
-        // Add original document if included
-        if (unpackConfig.isIncludeOriginal() && hasOriginalDocument()) {
-            dataPackage.addResource(FrictionlessResource.create(
-                    originalDocumentName,
-                    detectMediatypeFromFilename(originalDocumentName),
-                    originalDocumentBytes,
-                    originalDocumentHash,
-                    originalDocumentName
-            ));
-        }
-
         // Add all embedded files with unpacked/ prefix
         for (FrictionlessFileInfo fileInfo : embeddedFiles) {
             String path = UNPACKED_DIR + "/" + fileInfo.fileName();
@@ -222,48 +174,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements
         return dataPackage;
     }
 
-    /**
-     * Simple mediatype detection from filename extension.
-     */
-    private String detectMediatypeFromFilename(String filename) {
-        if (filename == null) {
-            return "application/octet-stream";
-        }
-        String lower = filename.toLowerCase(java.util.Locale.ROOT);
-        if (lower.endsWith(".pdf")) {
-            return "application/pdf";
-        } else if (lower.endsWith(".xml")) {
-            return "application/xml";
-        } else if (lower.endsWith(".doc")) {
-            return "application/msword";
-        } else if (lower.endsWith(".docx")) {
-            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
-        } else if (lower.endsWith(".xls")) {
-            return "application/vnd.ms-excel";
-        } else if (lower.endsWith(".xlsx")) {
-            return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
-        } else if (lower.endsWith(".ppt")) {
-            return "application/vnd.ms-powerpoint";
-        } else if (lower.endsWith(".pptx")) {
-            return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
-        } else if (lower.endsWith(".txt")) {
-            return "text/plain";
-        } else if (lower.endsWith(".html") || lower.endsWith(".htm")) {
-            return "text/html";
-        } else if (lower.endsWith(".json")) {
-            return "application/json";
-        } else if (lower.endsWith(".png")) {
-            return "image/png";
-        } else if (lower.endsWith(".jpg") || lower.endsWith(".jpeg")) {
-            return "image/jpeg";
-        } else if (lower.endsWith(".gif")) {
-            return "image/gif";
-        } else if (lower.endsWith(".zip")) {
-            return "application/zip";
-        }
-        return "application/octet-stream";
-    }
-
     /**
      * Returns the temporary directory where files are stored.
      */
@@ -292,27 +202,6 @@ public class FrictionlessUnpackHandler extends AbstractUnpackHandler implements
         return !embeddedFiles.isEmpty();
     }
 
-    /**
-     * Returns the path to the original document if stored.
-     */
-    public Path getOriginalDocumentPath() {
-        return originalDocumentPath;
-    }
-
-    /**
-     * Returns the name of the original document if stored.
-     */
-    public String getOriginalDocumentName() {
-        return originalDocumentName;
-    }
-
-    /**
-     * Returns true if the original document was stored.
-     */
-    public boolean hasOriginalDocument() {
-        return originalDocumentPath != null && Files.exists(originalDocumentPath);
-    }
-
     /**
      * Returns the UnpackConfig used by this handler.
      */
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
index 6f665f27d3..ada8a2daa7 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java
@@ -42,8 +42,6 @@ public class TempFileUnpackHandler extends AbstractUnpackHandler
     private final EmitKey containerEmitKey;
     private final UnpackConfig unpackConfig;
     private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>();
-    private Path originalDocumentPath;
-    private String originalDocumentName;
     private boolean closed = false;
 
     /**
@@ -112,42 +110,6 @@ public class TempFileUnpackHandler extends AbstractUnpackHandler
         return !embeddedFiles.isEmpty();
     }
 
-    /**
-     * Stores the original container document for inclusion in the zip.
-     * Call this before parsing if includeOriginal is enabled.
-     *
-     * @param inputStream the original document input stream
-     * @param fileName the file name for the original document
-     */
-    public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException {
-        this.originalDocumentName = fileName;
-        this.originalDocumentPath = tempDirectory.resolve("_original_" + fileName);
-        try (OutputStream os = Files.newOutputStream(originalDocumentPath)) {
-            inputStream.transferTo(os);
-        }
-    }
-
-    /**
-     * Returns the path to the original document if stored.
-     */
-    public Path getOriginalDocumentPath() {
-        return originalDocumentPath;
-    }
-
-    /**
-     * Returns the name of the original document if stored.
-     */
-    public String getOriginalDocumentName() {
-        return originalDocumentName;
-    }
-
-    /**
-     * Returns true if the original document was stored.
-     */
-    public boolean hasOriginalDocument() {
-        return originalDocumentPath != null && Files.exists(originalDocumentPath);
-    }
-
     @Override
     public void close() throws IOException {
         if (!closed) {
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index a96d070b57..911b2940a2 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -320,15 +320,9 @@ class PipesWorker implements Callable<PipesResult> {
         DataPackage dataPackage = frictionlessHandler.buildDataPackage(containerName);
 
         try {
-            // Emit original document if included
-            if (unpackConfig.isIncludeOriginal() && frictionlessHandler.hasOriginalDocument()) {
-                String originalEmitKey = baseEmitKey + "/" + frictionlessHandler.getOriginalDocumentName();
-                try (InputStream is = Files.newInputStream(frictionlessHandler.getOriginalDocumentPath())) {
-                    streamEmitter.emit(originalEmitKey, is, new Metadata(), parseContext);
-                }
-            }
-
-            // Emit each embedded file under unpacked/
+            // Emit each embedded file under unpacked/.
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : frictionlessHandler.getEmbeddedFiles()) {
                 String fileEmitKey = baseEmitKey + "/unpacked/" + fileInfo.fileName();
                 try (InputStream is = Files.newInputStream(fileInfo.filePath())) {
@@ -385,15 +379,9 @@ class PipesWorker implements Callable<PipesResult> {
                 zos.closeEntry();
             }
 
-            // Add original document if included (at root level)
-            if (unpackConfig.isIncludeOriginal() && frictionlessHandler.hasOriginalDocument()) {
-                ZipEntry originalEntry = new ZipEntry(frictionlessHandler.getOriginalDocumentName());
-                zos.putNextEntry(originalEntry);
-                Files.copy(frictionlessHandler.getOriginalDocumentPath(), zos);
-                zos.closeEntry();
-            }
-
-            // Add all embedded files under unpacked/
+            // Add all embedded files under unpacked/.
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (FrictionlessUnpackHandler.FrictionlessFileInfo fileInfo : frictionlessHandler.getEmbeddedFiles()) {
                 ZipEntry fileEntry = new ZipEntry("unpacked/" + fileInfo.fileName());
                 zos.putNextEntry(fileEntry);
@@ -442,14 +430,8 @@ class PipesWorker implements Callable<PipesResult> {
     private void createZipFile(Path zipFile, TempFileUnpackHandler tempHandler,
                                UnpackConfig unpackConfig) throws IOException {
         try (ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile))) {
-            // Include original document if requested
-            if (unpackConfig.isIncludeOriginal() && tempHandler.hasOriginalDocument()) {
-                ZipEntry originalEntry = new ZipEntry(tempHandler.getOriginalDocumentName());
-                zos.putNextEntry(originalEntry);
-                Files.copy(tempHandler.getOriginalDocumentPath(), zos);
-                zos.closeEntry();
-            }
-
+            // When includeOriginal=true the container itself is added as id 0 by
+            // ParseHandler._preParse, so it appears here as one of the embedded entries.
             for (TempFileUnpackHandler.EmbeddedFileInfo fileInfo : tempHandler.getEmbeddedFiles()) {
                 // Add the embedded file
                 ZipEntry fileEntry = new ZipEntry(fileInfo.fileName());

Reply via email to