This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 213747d7e TIKA-3794 -- Fix bug that prevented specification of rendered image type via http header in the PDFParser.
213747d7e is described below

commit 213747d7e6f45f3e30cf40dcce8b2135f9d52bc2
Author: tallison <[email protected]>
AuthorDate: Mon Jun 20 15:14:34 2022 -0400

    TIKA-3794 -- Fix bug that prevented specification of rendered image type via http header in the PDFParser.
---
 CHANGES.txt                                        |   6 ++
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   6 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java    |   2 +-
 .../org/apache/tika/server/core/CXFTestBase.java   |  22 +++++
 .../tika/server/standard/UnpackerResourceTest.java |  96 +++++++++++++++++++++
 .../test-documents/testColorRendering.pdf          | Bin 0 -> 1794 bytes
 6 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8066d6ac6..cd390d441 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.4.2 - ???
+
+   * Fix bug that prevented specification of rendered image type
+     via http header in the PDFParser (TIKA-3794).
+
+
 Release 2.4.1 - 06/14/2022
 
    * Implement bulk upload in the OpenSearch emitter (TIKA-3791).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index b3233c35e..e790378ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -741,9 +741,9 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         }
         //set a default renderer if nothing was defined
         PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
-        pdfBoxRenderer.setDPI(defaultConfig.getOcrDPI());
-        pdfBoxRenderer.setImageType(defaultConfig.getOcrImageType());
-        pdfBoxRenderer.setImageFormatName(defaultConfig.getOcrImageFormatName());
+        pdfBoxRenderer.setDPI(config.getOcrDPI());
+        pdfBoxRenderer.setImageType(config.getOcrImageType());
+        pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
         config.setRenderer(pdfBoxRenderer);
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index cf477c697..acd57e47f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -637,7 +637,7 @@ public class PDFParserConfig implements Serializable {
      * @see #setOcrImageType(ImageType)
      */
     public void setOcrImageType(String ocrImageTypeString) {
-        this.ocrImageType = parseImageType(ocrImageTypeString);
+        setOcrImageType(parseImageType(ocrImageTypeString));
     }
 
     /**
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 8d927cb7c..8bfc02023 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -184,6 +184,28 @@ public abstract class CXFTestBase {
         return data;
     }
 
+    protected Map<String, byte[]> readZipArchiveBytes(InputStream inputStream) throws IOException {
+        Map<String, byte[]> data = new HashMap<>();
+        Path tempFile = null;
+        try {
+            tempFile = writeTemporaryArchiveFile(inputStream, "zip");
+            ZipFile zip = new ZipFile(tempFile.toFile());
+            Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+            while (entries.hasMoreElements()) {
+                ZipArchiveEntry entry = entries.nextElement();
+                ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                IOUtils.copy(zip.getInputStream(entry), bos);
+                data.put(entry.getName(), bos.toByteArray());
+            }
+            zip.close();
+        } finally {
+            if (tempFile != null ) {
+                Files.delete(tempFile);
+            }
+        }
+        return data;
+    }
+
     protected String readArchiveText(InputStream inputStream) throws IOException {
         Path tempFile = writeTemporaryArchiveFile(inputStream, "zip");
         ZipFile zip = new ZipFile(tempFile.toFile());
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index af157b916..e7b1bf86c 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -23,10 +23,13 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import javax.imageio.ImageIO;
 import javax.ws.rs.core.Response;
 
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
@@ -35,6 +38,8 @@ import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.server.core.CXFTestBase;
 import org.apache.tika.server.core.TikaServerParseExceptionMapper;
@@ -233,4 +238,95 @@ public class UnpackerResourceTest extends CXFTestBase {
         String txt = readArchiveText((InputStream) response.getEntity());
         CXFTestBase.assertContains("Happy New Year", txt);
     }
+
+    @Test
+    public void testPDFPerPageRenderColor() throws Exception {
+
+        Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+                        "RenderPagesAtPageEnd")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+                .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+                        "test-documents/testColorRendering.pdf"));
+        Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+        byte[] renderedImage = null;
+        for (Map.Entry<String, byte[]> e : results.entrySet()) {
+            if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+                renderedImage = e.getValue();
+                break;
+            }
+        }
+        assertEquals("image/png",
+                TikaConfig.getDefaultConfig().getDetector()
+                        .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+        );
+
+        try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+            BufferedImage image = ImageIO.read(is);
+            //top left
+            AverageColor averageColor =
+                    getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+            assertTrue(averageColor.red > 250);
+            assertTrue(averageColor.green < 1);
+            assertTrue(averageColor.blue < 1);
+
+            //bottom left = green
+            averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+            assertTrue(averageColor.red < 1);
+            assertTrue(averageColor.green > 250);
+            assertTrue(averageColor.blue < 1);
+
+            //bottom right = blue
+            averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+                    image.getWidth() / 2 + 2 * image.getWidth() / 10,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+            assertTrue(averageColor.red < 1);
+            assertTrue(averageColor.green < 1);
+            assertTrue(averageColor.blue > 250);
+        }
+    }
+
+    private static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
+                                                int maxY) {
+        long totalRed = 0;
+        long totalGreen = 0;
+        long totalBlue = 0;
+        int pixels = 0;
+        for (int x = minX; x < maxX; x++) {
+            for (int y = minY; y < maxY; y++) {
+                int clr = image.getRGB(x, y);
+                int red = (clr & 0x00ff0000) >> 16;
+                int green = (clr & 0x0000ff00) >> 8;
+                int blue = clr & 0x000000ff;
+                totalRed += red;
+                totalGreen += green;
+                totalBlue += blue;
+                pixels++;
+            }
+        }
+        return new AverageColor((double) totalRed / (double) pixels,
+                (double) totalGreen / (double) pixels, (double) totalBlue / (double) pixels);
+    }
+
+    public static class AverageColor {
+        double red;
+        double green;
+        double blue;
+
+        public AverageColor(double averageRed, double averageGreen, double averageBlue) {
+            this.red = averageRed;
+            this.green = averageGreen;
+            this.blue = averageBlue;
+        }
+
+        @Override
+        public String toString() {
+            return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+        }
+    }
 }
diff --git a/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf b/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf
new file mode 100644
index 000000000..4cf60720c
Binary files /dev/null and b/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf differ

Reply via email to