include/vcl/filter/pdfdocument.hxx | 2 vcl/Library_vcl.mk | 1 vcl/inc/pdf/ExternalPDFStreams.hxx | 2 vcl/inc/pdf/pdfcompat.hxx | 42 ++++++++++++ vcl/source/filter/ipdf/pdfcompat.cxx | 113 +++++++++++++++++++++++++++++++++ vcl/source/filter/ipdf/pdfdocument.cxx | 14 ++++ vcl/source/filter/ipdf/pdfread.cxx | 104 +----------------------------- 7 files changed, 178 insertions(+), 100 deletions(-)
New commits: commit 03a0b41ba21f7d546160c819e088e0b0023b68bb Author: Dennis Francis <dennis.fran...@collabora.com> AuthorDate: Tue Oct 18 16:14:37 2022 +0530 Commit: Andras Timar <andras.ti...@collabora.com> CommitDate: Wed Oct 26 20:38:17 2022 +0200 vcl: re-exporting broken pdfs -> empty pages Certain pdf documents, when loaded in LO_IMPORT_USE_PDFIUM=1 mode, even if pdf-version < v1.6, sometimes have missing objects that are referred to by other objects, for instance to determine a stream's length. As a result parsing fails and results in a pdf with empty pages. A round trip through pdfium and exporting to v1.6 seems to cure the issue. Possibly it does some repairing work to determine the length of the stream in an independent pass through the file. Change-Id: Id09f67eddab4163ed12a3a3f3a73baf92e2912aa Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141854 Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoff...@gmail.com> Reviewed-by: Andras Timar <andras.ti...@collabora.com> diff --git a/include/vcl/filter/pdfdocument.hxx b/include/vcl/filter/pdfdocument.hxx index dd03029227d2..fbe0be89cdc6 100644 --- a/include/vcl/filter/pdfdocument.hxx +++ b/include/vcl/filter/pdfdocument.hxx @@ -576,6 +576,8 @@ public: //@{ /// Read elements from the start of the stream till its end. bool Read(SvStream& rStream); + /// Calls Read() first and if it fails it tries to fixup and then retry. + bool ReadWithPossibleFixup(SvStream& rStream); void SetSignatureLine(std::vector<sal_Int8>&& rSignatureLine); void SetSignaturePage(size_t nPage); /// Sign the read document with xCertificate in the edit buffer. 
diff --git a/vcl/Library_vcl.mk b/vcl/Library_vcl.mk index b0a6ee533133..25f6a0ef9562 100644 --- a/vcl/Library_vcl.mk +++ b/vcl/Library_vcl.mk @@ -450,6 +450,7 @@ $(eval $(call gb_Library_add_exception_objects,vcl,\ vcl/source/filter/ipict/ipict \ vcl/source/filter/ipsd/ipsd \ vcl/source/filter/ipict/shape \ + vcl/source/filter/ipdf/pdfcompat \ vcl/source/filter/ipdf/pdfread \ vcl/source/filter/ipdf/pdfdocument \ vcl/source/filter/iras/iras \ diff --git a/vcl/inc/pdf/ExternalPDFStreams.hxx b/vcl/inc/pdf/ExternalPDFStreams.hxx index 7840217630c8..45b15f7a74bc 100644 --- a/vcl/inc/pdf/ExternalPDFStreams.hxx +++ b/vcl/inc/pdf/ExternalPDFStreams.hxx @@ -42,7 +42,7 @@ struct VCL_DLLPUBLIC ExternalPDFStream aPDFStream.WriteBytes(maDataContainer.getData(), maDataContainer.getSize()); aPDFStream.Seek(0); auto pPDFDocument = std::make_shared<filter::PDFDocument>(); - if (!pPDFDocument->Read(aPDFStream)) + if (!pPDFDocument->ReadWithPossibleFixup(aPDFStream)) { SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: reading the PDF document failed"); diff --git a/vcl/inc/pdf/pdfcompat.hxx b/vcl/inc/pdf/pdfcompat.hxx new file mode 100644 index 000000000000..8f629b3bc8ee --- /dev/null +++ b/vcl/inc/pdf/pdfcompat.hxx @@ -0,0 +1,42 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include <config_features.h> +#include <tools/gen.hxx> +#include <tools/stream.hxx> +#include <tools/UnitConversion.hxx> +#include <vcl/graph.hxx> + +namespace vcl::pdf +{ +/// Convert to inch, then scale by the given resolution (DPI). 
+inline double pointToPixel(const double fPoint, const double fResolutionDPI) +{ + return o3tl::convert(fPoint, o3tl::Length::pt, o3tl::Length::in) * fResolutionDPI; +} + +/// Decide if PDF data is old enough to be compatible. +bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize); + +/// Converts to highest supported format version (currently 1.6). +/// Usually used to deal with missing referenced objects in the +/// source pdf stream. +bool convertToHighestSupported(SvStream& rInStream, SvStream& rOutStream); + +/// Takes care of transparently downgrading the version of the PDF stream in +/// case it's too new for our PDF export. +bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream); + +BinaryDataContainer createBinaryDataContainer(SvStream& rStream); + +} // end of vcl::pdf namespace + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/vcl/source/filter/ipdf/pdfcompat.cxx b/vcl/source/filter/ipdf/pdfcompat.cxx new file mode 100644 index 000000000000..52be1f3b2c07 --- /dev/null +++ b/vcl/source/filter/ipdf/pdfcompat.cxx @@ -0,0 +1,113 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <pdf/pdfcompat.hxx> + +#include <vcl/filter/PDFiumLibrary.hxx> +#include <sal/log.hxx> + +namespace vcl::pdf +{ +/// Decide if PDF data is old enough to be compatible. 
+bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize) +{ + if (nSize < 8) + return false; + + // %PDF-x.y + sal_uInt8 aFirstBytes[8]; + rInStream.Seek(nPos); + sal_uLong nRead = rInStream.ReadBytes(aFirstBytes, 8); + if (nRead < 8) + return false; + + if (aFirstBytes[0] != '%' || aFirstBytes[1] != 'P' || aFirstBytes[2] != 'D' + || aFirstBytes[3] != 'F' || aFirstBytes[4] != '-') + return false; + + sal_Int32 nMajor = OString(char(aFirstBytes[5])).toInt32(); + sal_Int32 nMinor = OString(char(aFirstBytes[7])).toInt32(); + return !(nMajor > 1 || (nMajor == 1 && nMinor > 6)); +} + +/// Converts to highest supported format version (1.6). +/// Usually used to deal with missing referenced objects in source +/// pdf stream. +bool convertToHighestSupported(SvStream& rInStream, SvStream& rOutStream) +{ + sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; + sal_uInt64 nSize = STREAM_SEEK_TO_END; + rInStream.Seek(nPos); + // Convert to PDF-1.6. + auto pPdfium = vcl::pdf::PDFiumLibrary::get(); + if (!pPdfium) + return false; + + // Read input into a buffer. + SvMemoryStream aInBuffer; + aInBuffer.WriteStream(rInStream, nSize); + + SvMemoryStream aSaved; + { + // Load the buffer using pdfium. + std::unique_ptr<vcl::pdf::PDFiumDocument> pPdfDocument + = pPdfium->openDocument(aInBuffer.GetData(), aInBuffer.GetSize(), OString()); + if (!pPdfDocument) + return false; + + // 16 means PDF-1.6. + if (!pPdfDocument->saveWithVersion(aSaved, 16)) + return false; + } + + aSaved.Seek(STREAM_SEEK_TO_BEGIN); + rOutStream.WriteStream(aSaved); + + return rOutStream.good(); +} + +/// Takes care of transparently downgrading the version of the PDF stream in +/// case it's too new for our PDF export. +bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream) +{ + sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; + sal_uInt64 nSize = STREAM_SEEK_TO_END; + bool bCompatible = isCompatible(rInStream, nPos, nSize); + rInStream.Seek(nPos); + if (bCompatible) + // Not converting. 
+ rOutStream.WriteStream(rInStream, nSize); + else + convertToHighestSupported(rInStream, rOutStream); + + return rOutStream.good(); +} + +BinaryDataContainer createBinaryDataContainer(SvStream& rStream) +{ + // Save the original PDF stream for later use. + SvMemoryStream aMemoryStream; + if (!getCompatibleStream(rStream, aMemoryStream)) + return {}; + + const sal_uInt32 nStreamLength = aMemoryStream.TellEnd(); + + auto aPdfData = std::make_unique<std::vector<sal_uInt8>>(nStreamLength); + + aMemoryStream.Seek(STREAM_SEEK_TO_BEGIN); + aMemoryStream.ReadBytes(aPdfData->data(), aPdfData->size()); + if (aMemoryStream.GetError()) + return {}; + + return { std::move(aPdfData) }; +} + +} // end vcl::pdf namespace + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx index 4573d414cfc6..4430c7217fc7 100644 --- a/vcl/source/filter/ipdf/pdfdocument.cxx +++ b/vcl/source/filter/ipdf/pdfdocument.cxx @@ -8,6 +8,8 @@ */ #include <vcl/filter/pdfdocument.hxx> +#include <pdf/pdfcompat.hxx> +#include <config_features.h> #include <map> #include <memory> @@ -1348,6 +1350,18 @@ void PDFDocument::SetIDObject(size_t nID, PDFObjectElement* pObject) m_aIDObjects[nID] = pObject; } +bool PDFDocument::ReadWithPossibleFixup(SvStream& rStream) +{ + if (Read(rStream)) + return true; + + // Read failed, try a roundtrip through pdfium and then retry. + rStream.Seek(0); + SvMemoryStream aStandardizedStream; + vcl::pdf::convertToHighestSupported(rStream, aStandardizedStream); + return Read(aStandardizedStream); +} + bool PDFDocument::Read(SvStream& rStream) { // Check file magic. 
diff --git a/vcl/source/filter/ipdf/pdfread.cxx b/vcl/source/filter/ipdf/pdfread.cxx index 7a6209c9aa31..392e76ac0cd1 100644 --- a/vcl/source/filter/ipdf/pdfread.cxx +++ b/vcl/source/filter/ipdf/pdfread.cxx @@ -8,8 +8,7 @@ */ #include <vcl/pdfread.hxx> - -#include <tools/UnitConversion.hxx> +#include <pdf/pdfcompat.hxx> #include <pdf/PdfConfig.hxx> #include <vcl/graph.hxx> @@ -22,99 +21,6 @@ using namespace com::sun::star; -namespace -{ -/// Convert to inch, then assume 96 DPI. -inline double pointToPixel(const double fPoint, const double fResolutionDPI) -{ - return o3tl::convert(fPoint, o3tl::Length::pt, o3tl::Length::in) * fResolutionDPI; -} - -/// Decide if PDF data is old enough to be compatible. -bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize) -{ - if (nSize < 8) - return false; - - // %PDF-x.y - sal_uInt8 aFirstBytes[8]; - rInStream.Seek(nPos); - sal_uLong nRead = rInStream.ReadBytes(aFirstBytes, 8); - if (nRead < 8) - return false; - - if (aFirstBytes[0] != '%' || aFirstBytes[1] != 'P' || aFirstBytes[2] != 'D' - || aFirstBytes[3] != 'F' || aFirstBytes[4] != '-') - return false; - - sal_Int32 nMajor = OString(char(aFirstBytes[5])).toInt32(); - sal_Int32 nMinor = OString(char(aFirstBytes[7])).toInt32(); - return !(nMajor > 1 || (nMajor == 1 && nMinor > 6)); -} - -/// Takes care of transparently downgrading the version of the PDF stream in -/// case it's too new for our PDF export. -bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream) -{ - sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; - sal_uInt64 nSize = STREAM_SEEK_TO_END; - bool bCompatible = isCompatible(rInStream, nPos, nSize); - rInStream.Seek(nPos); - if (bCompatible) - // Not converting. - rOutStream.WriteStream(rInStream, nSize); - else - { - // Downconvert to PDF-1.6. - auto pPdfium = vcl::pdf::PDFiumLibrary::get(); - if (!pPdfium) - return false; - - // Read input into a buffer. 
- SvMemoryStream aInBuffer; - aInBuffer.WriteStream(rInStream, nSize); - - SvMemoryStream aSaved; - { - // Load the buffer using pdfium. - std::unique_ptr<vcl::pdf::PDFiumDocument> pPdfDocument - = pPdfium->openDocument(aInBuffer.GetData(), aInBuffer.GetSize(), OString()); - if (!pPdfDocument) - return false; - - // 16 means PDF-1.6. - if (!pPdfDocument->saveWithVersion(aSaved, 16)) - return false; - } - - aSaved.Seek(STREAM_SEEK_TO_BEGIN); - rOutStream.WriteStream(aSaved); - } - - return rOutStream.good(); -} - -BinaryDataContainer createBinaryDataContainer(SvStream& rStream) -{ - // Save the original PDF stream for later use. - SvMemoryStream aMemoryStream; - if (!getCompatibleStream(rStream, aMemoryStream)) - return {}; - - const sal_uInt32 nStreamLength = aMemoryStream.TellEnd(); - - auto aPdfData = std::make_unique<std::vector<sal_uInt8>>(nStreamLength); - - aMemoryStream.Seek(STREAM_SEEK_TO_BEGIN); - aMemoryStream.ReadBytes(aPdfData->data(), aPdfData->size()); - if (aMemoryStream.GetError()) - return {}; - - return { std::move(aPdfData) }; -} - -} // end anonymous namespace - namespace vcl { size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector<BitmapEx>& rBitmaps, @@ -157,8 +63,8 @@ size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector<BitmapEx>& r } // Returned unit is points, convert that to pixel. 
- const size_t nPageWidth = pointToPixel(nPageWidthPoints, fResolutionDPI); - const size_t nPageHeight = pointToPixel(nPageHeightPoints, fResolutionDPI); + const size_t nPageWidth = vcl::pdf::pointToPixel(nPageWidthPoints, fResolutionDPI); + const size_t nPageHeight = vcl::pdf::pointToPixel(nPageHeightPoints, fResolutionDPI); std::unique_ptr<vcl::pdf::PDFiumBitmap> pPdfBitmap = pPdfium->createBitmap(nPageWidth, nPageHeight, /*nAlpha=*/1); if (!pPdfBitmap) @@ -217,7 +123,7 @@ size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector<BitmapEx>& r bool importPdfVectorGraphicData(SvStream& rStream, std::shared_ptr<VectorGraphicData>& rVectorGraphicData) { - BinaryDataContainer aDataContainer = createBinaryDataContainer(rStream); + BinaryDataContainer aDataContainer = vcl::pdf::createBinaryDataContainer(rStream); if (aDataContainer.isEmpty()) { SAL_WARN("vcl.filter", "ImportPDF: empty PDF data array"); @@ -428,7 +334,7 @@ size_t ImportPDFUnloaded(const OUString& rURL, std::vector<PDFGraphicResult>& rG ::utl::UcbStreamHelper::CreateStream(rURL, StreamMode::READ | StreamMode::SHARE_DENYNONE)); // Save the original PDF stream for later use. - BinaryDataContainer aDataContainer = createBinaryDataContainer(*xStream); + BinaryDataContainer aDataContainer = vcl::pdf::createBinaryDataContainer(*xStream); if (aDataContainer.isEmpty()) return 0;