utils/pdfinfo.1 | 6 ++++++ utils/pdfinfo.cc | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+)
New commits: commit c498cfe5a6292f2c696178b69e6fb275f1a4a4da Author: Adrian Johnson <ajohn...@redneon.com> Date: Fri Oct 1 09:24:03 2021 +0000 pdfinfo: add -url option to print all URLs in a PDF diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1 index abd34a8c..2a17bbd5 100644 --- a/utils/pdfinfo.1 +++ b/utils/pdfinfo.1 @@ -126,6 +126,12 @@ file. Note that extracting text this way might be slow for big PDF files. (Implies .BR \-struct .) .TP +.B \-url +Print all URLs in the PDF. Only the URL types supported by Poppler are listed. +Currently, this is limited to Annotations. Note: only URLs referenced by the PDF objects +such as Link Annotations are listed. pdfinfo does not attempt to extract strings +matching http://... from the text content. +.TP .B \-isodates Prints dates in ISO-8601 format (including the time zone). .TP diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index e34f21fc..d47b2564 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -87,6 +87,7 @@ static bool printEnc = false; static bool printStructure = false; static bool printStructureText = false; static bool printDests = false; +static bool printUrls = false; static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" }, { "-l", argInt, &lastPage, 0, "last page to convert" }, @@ -99,6 +100,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to { "-isodates", argFlag, &isoDates, 0, "print the dates in ISO-8601 format" }, { "-rawdates", argFlag, &rawDates, 0, "print the undecoded date strings directly from the PDF file" }, { "-dests", argFlag, &printDests, 0, "print all named destinations in the PDF" }, + { "-url", argFlag, &printUrls, 0, "print all URLs inside PDF objects (does not scan text content)" }, { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" }, { "-listenc", argFlag, &printEnc, 0, "list available encodings" }, { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for 
encrypted files)" },
@@ -412,6 +414,26 @@ static void printDestinations(PDFDoc *doc, const UnicodeMap *uMap)
     }
 }
 
+// Print a header row, then one row per URI link annotation found on pages
+// firstPage..lastPage (inclusive) of doc. Page text is not scanned for URLs.
+static void printUrlList(PDFDoc *doc)
+{
+    printf("Page Type URL\n");
+    for (int pg = firstPage; pg <= lastPage; pg++) {
+        Page *page = doc->getPage(pg);
+        // NOTE(review): confirm who owns the Links object returned by getLinks().
+        Links *links = page ? page->getLinks() : nullptr;
+        for (int i = 0; links && i < links->getNumLinks(); i++) {
+            AnnotLink *annot = links->getLink(i);
+            // dynamic_cast maps a null action to nullptr, so a link without an action is skipped.
+            LinkURI *linkUri = dynamic_cast<LinkURI *>(annot->getAction());
+            if (linkUri && linkUri->getKind() == actionURI) {
+                printf("%4d Annotation %s\n", pg, linkUri->getURI().c_str());
+            }
+        }
+    }
+}
+
 static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap)
 {
     const Object info = doc->getDocInfo();
@@ -1015,6 +1037,8 @@ int main(int argc, char *argv[])
         }
     } else if (printDests) {
         printDestinations(doc.get(), uMap);
+    } else if (printUrls) {
+        printUrlList(doc.get());
     } else {
         // print info
         long long filesize = 0;