[ 
https://issues.apache.org/jira/browse/PDFBOX-4818?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Tilman Hausherr updated PDFBOX-4818:
------------------------------------
    Comment: was deleted

(was: I don't see where you are closing the "doc" PDDocument object in 
"render()".)

> Is it possible to render a pdf to multi pic with PdfRenderer multi threads?
> ---------------------------------------------------------------------------
>
>                 Key: PDFBOX-4818
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-4818
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: Rendering
>    Affects Versions: 2.0.19
>            Reporter: jiangpeiheng
>            Priority: Major
>
> Hi, pdfbox developers.
> I'm using pdfbox to render my pdf file to multiple jpg pictures. I've read the 
> faq, and found that PDDocument is not thread-safe, which means operating on 
> the same PDDocument from multiple threads is risky.
> The method I'm using now is to create multiple PDDocuments to render different 
> parts of the pdf to speed up the rendering process. However, it costs a lot of 
> memory. And with some big pdfs (400+ pages), it may cause an OOM.
> So I'm wondering, what is the best way to use pdfbox to render a pdf?
> Here is my code for rendering:
> {code:java}
> package com.bytedance.esign.utils.pdf;
> import com.bytedance.esign.constants.enums.ResponseCode;
> import com.bytedance.esign.exception.EsignException;
> import com.bytedance.esign.threadpool.ThreadPoolManager;
> import com.bytedance.esign.utils.ContractLoadingRecorder;
> import com.google.common.collect.Lists;
> import lombok.AllArgsConstructor;
> import lombok.Data;
> import lombok.extern.slf4j.Slf4j;
> import org.apache.commons.io.IOUtils;
> import org.apache.pdfbox.io.MemoryUsageSetting;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.rendering.PDFRenderer;
> import javax.imageio.ImageIO;
> import java.awt.image.BufferedImage;
> import java.io.ByteArrayInputStream;
> import java.io.ByteArrayOutputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
> import java.util.Collections;
> import java.util.List;
> import java.util.concurrent.CompletableFuture;
> import java.util.function.Supplier;
> /**
>  * pdfbox 工具
>  *
>  * @author jiangpeiheng create on 2020/1/15
>  */
> @Slf4j
> public class PdfBoxUtil {
>     private static final String LOG_PERFIX = "PDF_BOX_UTIL";
>     private static final int MAX_PAGE_FOR_SINGLE_RENDER_TASK = 15;
>     static {
>         
> System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", 
> "true");
>     }
>     /**
>      * pdf转图片
>      *
>      * @param is
>      * @param contractId
>      * @return
>      */
>     public static List<byte[]> pdfToJpg(InputStream is, String contractId) {
>         long start = System.currentTimeMillis();
>         try {
>             byte[] docBytes = IOUtils.toByteArray(is);
>             PDDocument doc = load(docBytes);    // 只为获取页数
>             int pageCount = doc.getPages().getCount();
>             close(doc);
>             doc = null; // 方便gc
>             // 向redis上报总页数
>             ContractLoadingRecorder.setTotalPage(contractId, pageCount);
>             // 拆任务
>             List<CompletableFuture<List<byte[]>>> futures = 
> splitTask(docBytes, pageCount, contractId);
>             List<byte[]> images = Lists.newArrayList();
>             futures.forEach(future -> images.addAll(future.join()));
>             // 判断最终切割的页数
>             if (pageCount != images.size()) {
>                 log.error("[{}]PDF渲染图片,最终获取到的图片页数与预期页数不符,expect:{}, 
> actual:{}",
>                         LOG_PERFIX, pageCount, images.size());
>                 throw new EsignException(ResponseCode.SYSTEM_ERROR.getCode(), 
> "PDF切图,最终图片页数与预计页数不符!");
>             }
>             log.info("[{}]PDF渲染图片整体流程成功", LOG_PERFIX);
>             return images;
>         } catch (Exception e) {
>             log.error("[{}]PDF渲染图片整体流程异常, contractId:{}, e:",
>                     LOG_PERFIX, contractId, e);
>             return Collections.emptyList();
>         } finally {
>             log.info("[{}]PDF渲染图片整体流程完成, contractId:{}, cost:{}",
>                     LOG_PERFIX, contractId, System.currentTimeMillis() - 
> start);
>         }
>     }
>     private static List<CompletableFuture<List<byte[]>>> splitTask(byte[] 
> pdfBytes, int pages, String contractId) {
>         List<CompletableFuture<List<byte[]>>> futures = Lists.newArrayList();
>         int start = 0;
>         int end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
>         while (start != end && end <= pages) {
>             RenderTask task = new RenderTask(pdfBytes, start, end, 
> contractId);
>             futures.add(CompletableFuture.supplyAsync(task, 
> ThreadPoolManager.PDF_RENDER_EXECUTOR));
>             start = end;
>             end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
>         }
>         return futures;
>     }
>     @Data
>     @AllArgsConstructor
>     private static class RenderTask implements Supplier<List<byte[]>> {
>         private byte[] pdfBytes;
>         private int start;
>         private int end;
>         private String contractId;
>         @Override
>         public List<byte[]> get() {
>             return render(pdfBytes, start, end, contractId);
>         }
>     }
>     private static List<byte[]> render(byte[] pdfBytes, int start, int end, 
> String contractId) {
>         long startTime = System.currentTimeMillis();
>         try (
>                 PDDocument doc = load(pdfBytes)
>         ) {
>             log.info("[{}]载入并发线程的PDDocument耗时:{}",
>                     LOG_PERFIX, System.currentTimeMillis() - startTime);
>             PDFRenderer renderer = new PDFRenderer(doc);
>             List<byte[]> images = new ArrayList<>();
>             for (int i = start; i < end; i++) {
>                 BufferedImage bim = renderer.renderImageWithDPI(i, 200);
>                 images.add(transformImage(bim));
>                 long subStart = System.currentTimeMillis();
>                 ContractLoadingRecorder.incrRenderedPage(contractId);
>                 log.info("[{}]上报新增渲染页数耗时:{}",
>                         LOG_PERFIX, System.currentTimeMillis() - subStart);
>             }
>             log.info("[{}]单task渲染PDF成功, start:{}, end:{}, pages:{}",
>                     LOG_PERFIX, start, end, images.size());
>             return images;
>         } catch (Exception e) {
>             log.error("[{}]单task渲染PDF异常, start:{}, end:{}, e:",
>                     LOG_PERFIX, start, end, e);
>             return Collections.emptyList();
>         } finally {
>             log.info("[{}]单task渲染PDF完成, start:{}, end:{}, cost:{}",
>                     LOG_PERFIX, start, end, System.currentTimeMillis() - 
> startTime);
>         }
>     }
>     private static byte[] transformImage(BufferedImage bim) throws 
> IOException {
>         ByteArrayOutputStream os = new ByteArrayOutputStream();
>         ImageIO.write(bim, "jpg", os);
>         return os.toByteArray();
>     }
>     private static PDDocument load(byte[] docBytes) throws IOException {
>         return PDDocument.load(new ByteArrayInputStream(docBytes),
>                 MemoryUsageSetting.setupTempFileOnly());
>     }
>     private static void close(PDDocument doc) throws IOException {
>         if (doc == null) {
>             return;
>         }
>         doc.close();
>     }
> }
> {code}
> Thanks
> Jiangpeiheng



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to