Hey, I needed per-character bounding boxes and the rotation of each bounding box for a project I'm working on, so I added both to pdftotext. I've attached the patch. If there is interest, I would like to get it merged into master. Thanks!
-Micah
commit b24eb09f165299b5e5ec585ca4a7d26cbdbe90c6 Author: Micah Chambers <[email protected]> Date: Tue Dec 1 15:35:40 2015 +0000 Added rotation, character bounding box - bbox now includes rotation (0, 90, 180, 270) - there is now a bbox-char that outputs individual character bounding boxes diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index d931a96..ddaa217 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -64,6 +64,7 @@ static void printInfoString(FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2, UnicodeMap *uMap); static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *fmt); void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last); +void printCharBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last); void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last); static int firstPage = 1; @@ -74,6 +75,7 @@ static int y = 0; static int w = 0; static int h = 0; static GBool bbox = gFalse; +static GBool bboxChar = gFalse; static GBool bboxLayout = gFalse; static GBool physLayout = gFalse; static double fixedPitch = 0; @@ -122,6 +124,8 @@ static const ArgDesc argDesc[] = { "don't insert page breaks between pages"}, {"-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta"}, + {"-bbox-char", argFlag, &bboxChar, 0, + "output bounding box for each character and page size to html. Sets -htmlmeta"}, {"-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. 
Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), @@ -184,7 +188,7 @@ int main(int argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); - if (bboxLayout) { + if (bboxLayout || bboxChar) { bbox = gTrue; } if (bbox) { @@ -366,6 +370,9 @@ int main(int argc, char *argv[]) { if (bboxLayout) { printDocBBox(f, doc, textOut, firstPage, lastPage); } + else if (bboxChar) { + printCharBBox(f, doc, textOut, firstPage, lastPage); + } else { printWordBBox(f, doc, textOut, firstPage, lastPage); } @@ -381,10 +388,10 @@ int main(int argc, char *argv[]) { doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, gTrue, gFalse, gFalse); } else { - + for (int page = firstPage; page <= lastPage; ++page) { doc->displayPageSlice(textOut, page, resolution, resolution, 0, - gTrue, gFalse, gFalse, + gTrue, gFalse, gFalse, x, y, w, h); } } @@ -553,14 +560,44 @@ void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int const int word_length = wordlist != NULL ? 
wordlist->getLength() : 0; TextWord *word; double xMinA, yMinA, xMaxA, yMaxA; + int rot; if (word_length == 0) fprintf(stderr, "no word list\n"); for (int i = 0; i < word_length; ++i) { word = wordlist->get(i); + rot = 90*word->getRotation(); word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA); const std::string myString = myXmlTokenReplace(word->getText()->getCString()); - fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str()); + fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\" rot=\"%i\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, rot, myString.c_str()); + } + fprintf(f, " </page>\n"); + delete wordlist; + } + fprintf(f, "</doc>\n"); +} + +void printCharBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) { + fprintf(f, "<doc>\n"); + for (int page = first; page <= last; ++page) { + fprintf(f, " <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page)); + doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse); + TextWordList *wordlist = textOut->makeWordList(); + const int word_length = wordlist != NULL ? wordlist->getLength() : 0; + TextWord *word; + double xMinA, yMinA, xMaxA, yMaxA; + int rot; + if (word_length == 0) + fprintf(stderr, "no word list\n"); + + for (int i = 0; i < word_length; ++i) { + word = wordlist->get(i); + const std::string myString = myXmlTokenReplace(word->getText()->getCString()); + rot = 90*word->getRotation(); + for (unsigned int j = 0; j < myString.length(); ++j) { + word->getCharBBox(j, &xMinA, &yMinA, &xMaxA, &yMaxA); + fprintf(f," <char xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\" rot=\"%i\">%c</char>\n", xMinA, yMinA, xMaxA, yMaxA, rot, myString[j]); + } } fprintf(f, " </page>\n"); delete wordlist;
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
