List,

I've attached a small addition to pdftotext: a new -bbox option that writes the page size and the bounding box of each word to the HTML output, like this:

<doc>
  <page width="612.000000" height="792.000000"/>
    <word xMin="56.800000" yMin="57.208000" xMax="75.412000" yMax="70.492000">The</word>
  </page>
</doc>

I had a need for this; maybe others will too.

-KB
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d4e004b..c15667e 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -61,6 +61,7 @@ static int x = 0;
 static int y = 0;
 static int w = 0;
 static int h = 0;
+static int bbox = 0;
 static GBool physLayout = gFalse;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
@@ -103,6 +104,8 @@ static const ArgDesc argDesc[] = {
    "output end-of-line convention (unix, dos, or mac)"},
   {"-nopgbrk", argFlag,     &noPageBreaks,  0,
    "don't insert page breaks between pages"},
+  {"-bbox", argFlag,     &bbox,  0,
+   "output bounding box for each word and page size to html.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
    "owner password (for encrypted files)"},
   {"-upw",     argString,   userPassword,   sizeof(userPassword),
@@ -128,7 +131,7 @@ int main(int argc, char *argv[]) {
   GooString *textFileName;
   GooString *ownerPW, *userPW;
   TextOutputDev *textOut;
-  FILE *f;
+  FILE *f = stdout;
   UnicodeMap *uMap;
   Object info;
   GBool ok;
@@ -139,6 +142,9 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
+  if (bbox){
+    htmlMeta = gTrue;
+  }
   if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
     fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
     fprintf(stderr, "%s\n", popplerCopyright);
@@ -281,33 +287,61 @@ int main(int argc, char *argv[]) {
     info.free();
     fputs("</head>\n", f);
     fputs("<body>\n", f);
-    fputs("<pre>\n", f);
-    if (f != stdout) {
+    if (!bbox) fputs("<pre>\n", f);
+    if (f != stdout && !bbox) { // if writing bbox, f remains open
       fclose(f);
     }
   }
 
   // write text file
-  textOut = new TextOutputDev(textFileName->getCString(),
-                             physLayout, rawOrder, htmlMeta);
-  if (textOut->isOk()) {
-    if ((w==0) && (h==0) && (x==0) && (y==0)) {
-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
-                       gTrue, gFalse, gFalse);
-    } else {
-      int page;
-      
-      for (page = firstPage; page <= lastPage; ++page) {
+  if (bbox) {
+    textOut = new TextOutputDev("/dev/null",
+                               physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      fprintf(f, "<doc>\n");
+      for (int page = firstPage; page <= lastPage; ++page) {
+       fprintf(f, "  <page width=\"%f\" height=\"%f\"/>\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) );
+       doc->displayPage(textOut, page, resolution, resolution, 0,
+                        gTrue, gFalse, gFalse);
+       TextWordList *wordlist;
+       wordlist = textOut->makeWordList();
+       int word_length = wordlist->getLength();
+       TextWord *word;
+       double xMinA, yMinA, xMaxA, yMaxA;
+       if (!word_length)
+         fprintf(stderr, "no word list\n");
+       
+       for (int i=0; i < word_length; i++){
+         word = wordlist->get(i);
+         word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+         fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, (char *) word->getText());
+       }
+       fprintf(f, "  </page>\n");
+      }
+      fprintf(f, "</doc>\n");      
+    }
+    fclose(f);
+    
+  } else {
+    textOut = new TextOutputDev(textFileName->getCString(),
+                               physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      if ((w==0) && (h==0) && (x==0) && (y==0)) {
+       doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
+                         gTrue, gFalse, gFalse);
+      } else {
+       
+       for (int page = firstPage; page <= lastPage; ++page) {
        doc->displayPageSlice(textOut, page, resolution, resolution, 0,
                              gTrue, gFalse, gFalse, 
                              x, y, w, h);
-      }        
-    }  
-
-  } else {
+       }
+      }
+    } else {
     delete textOut;
     exitCode = 2;
     goto err3;
+    }
   }
   delete textOut;
 
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to