[
https://issues.apache.org/jira/browse/PDFBOX-4818?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17088295#comment-17088295
]
Tilman Hausherr commented on PDFBOX-4818:
-----------------------------------------
Please ignore my (deleted) comment, I just woke up.
"return PDDocument.load(new ByteArrayInputStream(docBytes),
MemoryUsageSetting.setupTempFileOnly());"
replace that with "return PDDocument.load(docBytes,
MemoryUsageSetting.setupTempFileOnly());"
I don't see a problem with your code... but obviously, rendering does need some
memory, and the more dpi, the more memory it will need, even if you are using
disk temp memory. (which will slow down the process)
If possible, please share that 400+ page file.
> Is it possible to render a pdf to multi pic with PdfRenderer multi threads?
> ---------------------------------------------------------------------------
>
> Key: PDFBOX-4818
> URL: https://issues.apache.org/jira/browse/PDFBOX-4818
> Project: PDFBox
> Issue Type: Improvement
> Components: Rendering
> Affects Versions: 2.0.19
> Reporter: jiangpeiheng
> Priority: Major
>
> Hi, pdfbox developers.
> I'm using PDFBox to render my PDF file to multiple JPG pictures. I've read the
> FAQ and found that PDDocument is not thread-safe, which means operating on the
> same PDDocument from multiple threads is risky.
> My current approach is to create multiple PDDocuments, each rendering a different
> part of the PDF, to speed up rendering. However, this uses a lot of memory,
> and for a big PDF (400+ pages) it may run out of memory (OOM).
> So I'm wondering, what is the best way to use pdfbox to render a pdf?
> Here is my code for rendering:
> {code:java}
> package com.bytedance.esign.utils.pdf;
> import com.bytedance.esign.constants.enums.ResponseCode;
> import com.bytedance.esign.exception.EsignException;
> import com.bytedance.esign.threadpool.ThreadPoolManager;
> import com.bytedance.esign.utils.ContractLoadingRecorder;
> import com.google.common.collect.Lists;
> import lombok.AllArgsConstructor;
> import lombok.Data;
> import lombok.extern.slf4j.Slf4j;
> import org.apache.commons.io.IOUtils;
> import org.apache.pdfbox.io.MemoryUsageSetting;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.rendering.PDFRenderer;
> import javax.imageio.ImageIO;
> import java.awt.image.BufferedImage;
> import java.io.ByteArrayInputStream;
> import java.io.ByteArrayOutputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.util.ArrayList;
> import java.util.Collections;
> import java.util.List;
> import java.util.concurrent.CompletableFuture;
> import java.util.function.Supplier;
> /**
> * pdfbox 工具
> *
> * @author jiangpeiheng create on 2020/1/15
> */
> @Slf4j
> public class PdfBoxUtil {
> private static final String LOG_PERFIX = "PDF_BOX_UTIL";
> private static final int MAX_PAGE_FOR_SINGLE_RENDER_TASK = 15;
> static {
>
> System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion",
> "true");
> }
> /**
> * pdf转图片
> *
> * @param is
> * @param contractId
> * @return
> */
> public static List<byte[]> pdfToJpg(InputStream is, String contractId) {
> long start = System.currentTimeMillis();
> try {
> byte[] docBytes = IOUtils.toByteArray(is);
> PDDocument doc = load(docBytes); // 只为获取页数
> int pageCount = doc.getPages().getCount();
> close(doc);
> doc = null; // 方便gc
> // 向redis上报总页数
> ContractLoadingRecorder.setTotalPage(contractId, pageCount);
> // 拆任务
> List<CompletableFuture<List<byte[]>>> futures =
> splitTask(docBytes, pageCount, contractId);
> List<byte[]> images = Lists.newArrayList();
> futures.forEach(future -> images.addAll(future.join()));
> // 判断最终切割的页数
> if (pageCount != images.size()) {
> log.error("[{}]PDF渲染图片,最终获取到的图片页数与预期页数不符,expect:{},
> actual:{}",
> LOG_PERFIX, pageCount, images.size());
> throw new EsignException(ResponseCode.SYSTEM_ERROR.getCode(),
> "PDF切图,最终图片页数与预计页数不符!");
> }
> log.info("[{}]PDF渲染图片整体流程成功", LOG_PERFIX);
> return images;
> } catch (Exception e) {
> log.error("[{}]PDF渲染图片整体流程异常, contractId:{}, e:",
> LOG_PERFIX, contractId, e);
> return Collections.emptyList();
> } finally {
> log.info("[{}]PDF渲染图片整体流程完成, contractId:{}, cost:{}",
> LOG_PERFIX, contractId, System.currentTimeMillis() -
> start);
> }
> }
> private static List<CompletableFuture<List<byte[]>>> splitTask(byte[]
> pdfBytes, int pages, String contractId) {
> List<CompletableFuture<List<byte[]>>> futures = Lists.newArrayList();
> int start = 0;
> int end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
> while (start != end && end <= pages) {
> RenderTask task = new RenderTask(pdfBytes, start, end,
> contractId);
> futures.add(CompletableFuture.supplyAsync(task,
> ThreadPoolManager.PDF_RENDER_EXECUTOR));
> start = end;
> end = Math.min(start + MAX_PAGE_FOR_SINGLE_RENDER_TASK, pages);
> }
> return futures;
> }
> @Data
> @AllArgsConstructor
> private static class RenderTask implements Supplier<List<byte[]>> {
> private byte[] pdfBytes;
> private int start;
> private int end;
> private String contractId;
> @Override
> public List<byte[]> get() {
> return render(pdfBytes, start, end, contractId);
> }
> }
> private static List<byte[]> render(byte[] pdfBytes, int start, int end,
> String contractId) {
> long startTime = System.currentTimeMillis();
> try (
> PDDocument doc = load(pdfBytes)
> ) {
> log.info("[{}]载入并发线程的PDDocument耗时:{}",
> LOG_PERFIX, System.currentTimeMillis() - startTime);
> PDFRenderer renderer = new PDFRenderer(doc);
> List<byte[]> images = new ArrayList<>();
> for (int i = start; i < end; i++) {
> BufferedImage bim = renderer.renderImageWithDPI(i, 200);
> images.add(transformImage(bim));
> long subStart = System.currentTimeMillis();
> ContractLoadingRecorder.incrRenderedPage(contractId);
> log.info("[{}]上报新增渲染页数耗时:{}",
> LOG_PERFIX, System.currentTimeMillis() - subStart);
> }
> log.info("[{}]单task渲染PDF成功, start:{}, end:{}, pages:{}",
> LOG_PERFIX, start, end, images.size());
> return images;
> } catch (Exception e) {
> log.error("[{}]单task渲染PDF异常, start:{}, end:{}, e:",
> LOG_PERFIX, start, end, e);
> return Collections.emptyList();
> } finally {
> log.info("[{}]单task渲染PDF完成, start:{}, end:{}, cost:{}",
> LOG_PERFIX, start, end, System.currentTimeMillis() -
> startTime);
> }
> }
> private static byte[] transformImage(BufferedImage bim) throws
> IOException {
> ByteArrayOutputStream os = new ByteArrayOutputStream();
> ImageIO.write(bim, "jpg", os);
> return os.toByteArray();
> }
> private static PDDocument load(byte[] docBytes) throws IOException {
> return PDDocument.load(new ByteArrayInputStream(docBytes),
> MemoryUsageSetting.setupTempFileOnly());
> }
> private static void close(PDDocument doc) throws IOException {
> if (doc == null) {
> return;
> }
> doc.close();
> }
> }
> {code}
> Thanks
> Jiangpeiheng
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]