poppler/UTF.cc | 436 ++++++++----------------------------------------------- poppler/UTF.h | 10 + utils/pdfinfo.1 | 3 utils/pdfinfo.cc | 70 ++++++++ 4 files changed, 150 insertions(+), 369 deletions(-)
New commits: commit b10e715b6a12d63922e428512d2d14682fd1cefc Author: Adrian Johnson <[email protected]> Date: Thu Sep 16 06:59:14 2021 +0930 Ignore custom metadata that is not a string diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index 73bbbc76..b46d1aa3 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -679,22 +679,24 @@ static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap) printInfoDate(info.getDict(), "ModDate", "ModDate: ", uMap); } } else { - // print key - Unicode *u; - int len = utf8ToUCS4(key.c_str(), &u); - printUCS4String(u, len, uMap); - fputs(":", stdout); - while (len < 15) { - fputs(" ", stdout); - len++; - } - gfree(u); - - // print value Object obj = dict->lookup(key.c_str()); - GooString val_str(obj.getString()); - printTextString(&val_str, uMap); - fputc('\n', stdout); + if (obj.isString()) { + // print key + Unicode *u; + int len = utf8ToUCS4(key.c_str(), &u); + printUCS4String(u, len, uMap); + fputs(":", stdout); + while (len < 15) { + fputs(" ", stdout); + len++; + } + gfree(u); + + // print value + GooString val_str(obj.getString()); + printTextString(&val_str, uMap); + fputc('\n', stdout); + } } } } commit 2bcf030e294cddf47abb63d53944b5e932848917 Author: Adrian Johnson <[email protected]> Date: Wed Sep 15 22:31:10 2021 +0930 pdfinfo: Add -custom option to print custom metadata diff --git a/poppler/UTF.cc b/poppler/UTF.cc index 9097b312..b78fd2ff 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -176,377 +176,28 @@ static const uint32_t UTF8_REJECT = 12; static const uint32_t UCS4_MAX = 0x10FFFF; static const Unicode REPLACEMENT_CHAR = 0xFFFD; +// clang-format off static const uint8_t decodeUtf8Table[] = { - // The first part of the table maps bytes to character classes - // to reduce the size of the transition table and create bitmasks. - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, // 00..1f - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, // 20..3f - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, // 40..5f - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, // 60..7f - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, // 80..9f - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, // a0..bf - 8, - 8, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, // c0..df - 10, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 3, - 3, - 11, - 6, - 6, - 6, - 5, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, // e0..ff - - // The second part is a transition table that maps a combination - // of a state of the automaton and a character class to a state. - 0, - 12, - 24, - 36, - 60, - 96, - 84, - 12, - 12, - 12, - 48, - 72, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 0, - 12, - 12, - 12, - 12, - 12, - 0, - 12, - 0, - 12, - 12, - 12, - 24, - 12, - 12, - 12, - 12, - 12, - 24, - 12, - 24, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 24, - 12, - 12, - 12, - 12, - 12, - 24, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 24, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 36, - 12, - 36, - 12, - 12, - 12, - 36, - 12, - 12, - 12, - 12, - 12, - 36, - 12, - 36, - 12, - 12, - 12, - 36, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, - 12, + // The first part of the table maps bytes to character classes + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; +// clang-format on // Decode utf8 state machine for fast UTF-8 decoding. Initialise state // to 0 and call decodeUtf8() for each byte of UTF-8. Return value @@ -566,6 +217,53 @@ inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte) return *state; } +int utf8CountUCS4(const char *utf8) +{ + uint32_t codepoint; + uint32_t state = 0; + int count = 0; + + while (*utf8) { + decodeUtf8(&state, &codepoint, *utf8); + if (state == UTF8_ACCEPT) { + count++; + } else if (state == UTF8_REJECT) { + count++; // replace with REPLACEMENT_CHAR + state = 0; + } + utf8++; + } + if (state != UTF8_ACCEPT && state != UTF8_REJECT) + count++; // replace with REPLACEMENT_CHAR + + return count; +} + +int utf8ToUCS4(const char *utf8, Unicode **ucs4_out) +{ + int len = utf8CountUCS4(utf8); + Unicode *u = (Unicode *)gmallocn(len, sizeof(Unicode)); + int n = 0; + uint32_t codepoint; + uint32_t state = 0; + + while (*utf8 && n < len) { + decodeUtf8(&state, &codepoint, *utf8); + if (state == UTF8_ACCEPT) { + u[n++] = codepoint; + } else if (state == UTF8_REJECT) { + u[n++] = REPLACEMENT_CHAR; // invalid byte for this position + state = 0; + } + utf8++; + } + if (state != UTF8_ACCEPT && state != UTF8_REJECT) + u[n] = REPLACEMENT_CHAR; // invalid byte for this position + + *ucs4_out = u; + return len; +} + // Count number of UTF-16 code units required to convert a UTF-8 string // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with diff --git a/poppler/UTF.h b/poppler/UTF.h index 2e4cfe7f..d22fd409 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -42,6 +42,16 @@ bool UnicodeIsValid(Unicode ucs4); // is a unicode whitespace character bool UnicodeIsWhitespace(Unicode ucs4); +// Count number of UCS-4 characters required to convert a UTF-8 string to +// UCS-4 (excluding terminating NULL). +int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8); + +// Convert a UTF-8 string to a UCS-4 +// utf8 - utf8 bytes +// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree. +// returns number of UCS-4 characters +int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out); + // Count number of UTF-16 code units required to convert a UTF-8 string // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1 index 41190842..f4225a9e 100644 --- a/utils/pdfinfo.1 +++ b/utils/pdfinfo.1 @@ -99,6 +99,9 @@ TrimBox, and ArtBox. Prints document-level metadata. (This is the "Metadata" stream from the PDF file's Catalog object.) .TP +.B \-custom +Prints custom and standard metadata. +.TP .B \-js Prints all JavaScript in the PDF. .TP diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index 655c3f5e..73bbbc76 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -43,6 +43,7 @@ #include <ctime> #include <cmath> #include <map> +#include <set> #include "parseargs.h" #include "printencodings.h" #include "goo/GooString.h" @@ -73,6 +74,7 @@ static int firstPage = 1; static int lastPage = 0; static bool printBoxes = false; static bool printMetadata = false; +static bool printCustom = false; static bool printJS = false; static bool isoDates = false; static bool rawDates = false; @@ -90,6 +92,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to { "-l", argInt, &lastPage, 0, "last page to convert" }, { "-box", argFlag, &printBoxes, 0, "print the page bounding boxes" }, { "-meta", argFlag, &printMetadata, 0, "print the document metadata (XML)" }, + { "-custom", argFlag, &printCustom, 0, "print both custom and standard metadata" }, { "-js", argFlag, &printJS, 0, "print all JavaScript in the PDF" }, { "-struct", argFlag, &printStructure, 0, "print the logical document structure (for tagged files)" }, { "-struct-text", argFlag, &printStructureText, 0, "print text contents along with document structure (for tagged files)" }, @@ -119,6 +122,15 @@ static void printTextString(const GooString *s, const UnicodeMap *uMap) gfree(u); } +static void printUCS4String(const Unicode *u, int len, const UnicodeMap *uMap) +{ + char buf[8]; + for (int i = 0; i < len; i++) { + int n = uMap->mapUnicode(u[i], buf, sizeof(buf)); + fwrite(buf, 1, n, stdout); + } +} + static void printInfoString(Dict *infoDict, const char *key, const char *text, const UnicodeMap *uMap) { const GooString *s1; @@ -634,6 +646,60 @@ static void printPdfSubtype(PDFDoc *doc, const UnicodeMap *uMap) } } +static void printCustomInfo(PDFDoc *doc, const UnicodeMap *uMap) +{ + Object info = doc->getDocInfo(); + if (info.isDict()) { + Dict *dict = info.getDict(); + + // Sort keys + std::set<std::string> keys; + for (int i = 0; i < dict->getLength(); i++) { + std::string key(dict->getKey(i)); + if (key != "Trapped") { + keys.insert(key); + } + } + + for (const std::string &key : keys) { + if (key == "CreationDate") { + if (isoDates) { + printISODate(info.getDict(), "CreationDate", "CreationDate: ", uMap); + } else if (rawDates) { + printInfoString(info.getDict(), "CreationDate", "CreationDate: ", uMap); + } else { + printInfoDate(info.getDict(), "CreationDate", "CreationDate: ", uMap); + } + } else if (key == "ModDate") { + if (isoDates) { + printISODate(info.getDict(), "ModDate", "ModDate: ", uMap); + } else if (rawDates) { + printInfoString(info.getDict(), "ModDate", "ModDate: ", uMap); + } else { + printInfoDate(info.getDict(), "ModDate", "ModDate: ", uMap); + } + } else { + // print key + Unicode *u; + int len = utf8ToUCS4(key.c_str(), &u); + printUCS4String(u, len, uMap); + fputs(":", stdout); + while (len < 15) { + fputs(" ", stdout); + len++; + } + gfree(u); + + // print value + Object obj = dict->lookup(key.c_str()); + GooString val_str(obj.getString()); + printTextString(&val_str, uMap); + fputc('\n', stdout); + } + } + } +} + static void printInfo(PDFDoc *doc, const UnicodeMap *uMap, long long filesize, bool multiPage) { Page *page; @@ -908,6 +974,8 @@ int main(int argc, char *argv[]) fputc('\n', stdout); delete metadata; } + } else if (printCustom) { + printCustomInfo(doc.get(), uMap); } else if (printJS) { // print javascript JSInfo jsInfo(doc.get(), firstPage - 1);
