 utils/pdftotext.1  |    3 +++
 utils/pdftotext.cc |   14 ++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

New commits:
commit 2f40575018d75a1412f5c4f8616dfe26d46f504e
Author: William Bader <[email protected]>
Date:   Mon Feb 8 23:29:05 2021 +0000

    Add pdftotext -cropbox option

diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index ea2874f7..3ae217b4 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -82,6 +82,9 @@ word in the file.
 Generate an XHTML file containing bounding box information for each
 block, line, and word in the file.
 .TP
+.B \-cropbox
+Use the crop box rather than the media box with \-bbox and \-bbox-layout.
+.TP
 .BI \-enc " encoding-name"
 Sets the encoding to use for text output. This defaults to "UTF-8".
 .TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index f17cfd53..4cb18dfd 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -82,6 +82,7 @@ static int h = 0;
 static bool bbox = false;
 static bool bboxLayout = false;
 static bool physLayout = false;
+static bool useCropBox = false;
 static double fixedPitch = 0;
 static bool rawOrder = false;
 static bool discardDiag = false;
@@ -114,6 +115,7 @@ static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to
     { "-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages" },
     { "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" },
     { "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. Sets -htmlmeta" },
+    { "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
     { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
     { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
     { "-q", argFlag, &quiet, 0, "don't print any messages or errors" },
@@ -496,8 +498,10 @@ void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int l
 
     fprintf(f, "<doc>\n");
     for (int page = first; page <= last; ++page) {
-        fprintf(f, " <page width=\"%f\" height=\"%f\">\n", doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
-        doc->displayPage(textOut, page, resolution, resolution, 0, true, false, false);
+        const double wid = useCropBox ? doc->getPageCropWidth(page) : doc->getPageMediaWidth(page);
+        const double hgt = useCropBox ? doc->getPageCropHeight(page) : doc->getPageMediaHeight(page);
+        fprintf(f, " <page width=\"%f\" height=\"%f\">\n", wid, hgt);
+        doc->displayPage(textOut, page, resolution, resolution, 0, !useCropBox, useCropBox, false);
         for (flow = textOut->getFlows(); flow; flow = flow->getNext()) {
             fprintf(f, " <flow>\n");
             for (blk = flow->getBlocks(); blk; blk = blk->getNext()) {
@@ -519,8 +523,10 @@ void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int
 {
     fprintf(f, "<doc>\n");
     for (int page = first; page <= last; ++page) {
-        fprintf(f, " <page width=\"%f\" height=\"%f\">\n", doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
-        doc->displayPage(textOut, page, resolution, resolution, 0, true, false, false);
+        double wid = useCropBox ? doc->getPageCropWidth(page) : doc->getPageMediaWidth(page);
+        double hgt = useCropBox ? doc->getPageCropHeight(page) : doc->getPageMediaHeight(page);
+        fprintf(f, " <page width=\"%f\" height=\"%f\">\n", wid, hgt);
+        doc->displayPage(textOut, page, resolution, resolution, 0, !useCropBox, useCropBox, false);
         TextWordList *wordlist = textOut->makeWordList();
         const int word_length = wordlist != nullptr ? wordlist->getLength() : 0;
         TextWord *word;
_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler
