Hey,

I needed per-character bounding boxes and bounding-box rotation for a project
I'm working on, so I added both to pdftotext. I've attached the patch. If there
is interest, I would like to get it merged into master. Thanks!

-Micah
commit b24eb09f165299b5e5ec585ca4a7d26cbdbe90c6
Author: Micah Chambers <[email protected]>
Date:   Tue Dec 1 15:35:40 2015 +0000

    Added rotation, character bounding box
    
    - -bbox word output now includes a rot attribute (rotation: 0, 90, 180 or 270)
    - there is now a -bbox-char option that outputs individual character bounding boxes

diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d931a96..ddaa217 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -64,6 +64,7 @@ static void printInfoString(FILE *f, Dict *infoDict, const char *key,
 			    const char *text1, const char *text2, UnicodeMap *uMap);
 static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *fmt);
 void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
+void printCharBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
 void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last);
 
 static int firstPage = 1;
@@ -74,6 +75,7 @@ static int y = 0;
 static int w = 0;
 static int h = 0;
 static GBool bbox = gFalse;
+static GBool bboxChar = gFalse;
 static GBool bboxLayout = gFalse;
 static GBool physLayout = gFalse;
 static double fixedPitch = 0;
@@ -122,6 +124,8 @@ static const ArgDesc argDesc[] = {
    "don't insert page breaks between pages"},
   {"-bbox", argFlag,     &bbox,  0,
    "output bounding box for each word and page size to html.  Sets -htmlmeta"},
+  {"-bbox-char", argFlag,     &bboxChar,  0,
+   "output bounding box for each character and page size to html.  Sets -htmlmeta"},
   {"-bbox-layout", argFlag,     &bboxLayout,  0,
    "like -bbox but with extra layout bounding box data.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
@@ -184,7 +188,7 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
-  if (bboxLayout) {
+  if (bboxLayout || bboxChar) {
     bbox = gTrue;
   }
   if (bbox) {
@@ -366,6 +370,9 @@ int main(int argc, char *argv[]) {
       if (bboxLayout) {
         printDocBBox(f, doc, textOut, firstPage, lastPage);
       }
+      else if (bboxChar) {
+        printCharBBox(f, doc, textOut, firstPage, lastPage);
+      }
       else {
         printWordBBox(f, doc, textOut, firstPage, lastPage);
       }
@@ -381,10 +388,10 @@ int main(int argc, char *argv[]) {
 	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
 			  gTrue, gFalse, gFalse);
       } else {
-	
+
 	for (int page = firstPage; page <= lastPage; ++page) {
 	  doc->displayPageSlice(textOut, page, resolution, resolution, 0,
-			      gTrue, gFalse, gFalse, 
+			      gTrue, gFalse, gFalse,
 			      x, y, w, h);
 	}
       }
@@ -553,14 +560,44 @@ void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int
     const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
     TextWord *word;
     double xMinA, yMinA, xMaxA, yMaxA;
+    int rot;
     if (word_length == 0)
       fprintf(stderr, "no word list\n");
 
     for (int i = 0; i < word_length; ++i) {
       word = wordlist->get(i);
+      rot = 90*word->getRotation();
       word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
       const std::string myString = myXmlTokenReplace(word->getText()->getCString());
-      fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
+      fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\" rot=\"%i\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, rot, myString.c_str());
+    }
+    fprintf(f, "  </page>\n");
+    delete wordlist;
+  }
+  fprintf(f, "</doc>\n");
+}
+
+void printCharBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) {
+  fprintf(f, "<doc>\n");
+  for (int page = first; page <= last; ++page) {
+    fprintf(f, "  <page width=\"%f\" height=\"%f\">\n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page));
+    doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
+    TextWordList *wordlist = textOut->makeWordList();
+    const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
+    TextWord *word;
+    double xMinA, yMinA, xMaxA, yMaxA;
+    int rot;
+    if (word_length == 0)
+      fprintf(stderr, "no word list\n");
+
+    for (int i = 0; i < word_length; ++i) {
+      word = wordlist->get(i);
+      const std::string myString = myXmlTokenReplace(word->getText()->getCString());
+      rot = 90*word->getRotation();
+      for (unsigned int j = 0; j < myString.length(); ++j) {
+        word->getCharBBox(j, &xMinA, &yMinA, &xMaxA, &yMaxA);
+        fprintf(f,"    <char xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\" rot=\"%i\">%c</char>\n", xMinA, yMinA, xMaxA, yMaxA, rot, myString[j]);
+      }
     }
     fprintf(f, "  </page>\n");
     delete wordlist;
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to