Re: [poppler] commit? bounding box html in pdftotext

Kenneth Berland Sun, 30 May 2010 09:32:17 -0700

1)  Since I sent my last diff, I've:

a) added some string processing to make sure no HTML reserved characters are placed into the output. I process each word.

b) altered the html a bit so that XML parsers can deal with it. I've put in a title tag or an empty title tag and added end tags to the meta tags.


2)  Addressing your concerns:

        a) I've removed the initialization of stdout.

        b) I close f now and reopen it.  This also removes the warning.

c) If a user is running with the -bbox option, they want word bounding boxes. If there are no words, I think a line to stderr is appropriate.


-KB


On Wed, 26 May 2010, Albert Astals Cid wrote:

A Dimecres, 26 de maig de 2010, Kenneth Berland va escriure:
I get a compiler warning without it.

pdftotext.cc: In function ‘int main(int, char**)’:
pdftotext.cc:164: warning: ‘f’ may be used uninitialized in this function
That change will not get accepted, sorry, initializing f to stdout is not a solution.
Also I do not like the fact that you do not close f if you are writing the bbox? Can't you just open it again like the code already does?
Also I do not understand why the code considers a page having no text an error.
Albert
-KB

On Wed, 26 May 2010, Albert Astals Cid wrote:
> A Diumenge, 9 de maig de 2010, Kenneth Berland va escriure:
>> List,
>>
>> I've attached a small addition to pdftotext that outputs bounding box
>> information to html like this:
>>
>> <doc>
>> <page width="612.000000" height="792.000000"/>
>> <word xMin="56.800000" yMin="57.208000" xMax="75.412000"
>> yMax="70.492000">The</word> </page>
>> </doc>
>>
>> I had a need, maybe others will too.
>>
>> -KB
>
> Why is this change necessary?
>
> - FILE *f;
> +  FILE *f = stdout;
>
> Albert
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d4e004b..dcf4802 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -49,6 +49,7 @@
 #include "CharTypes.h"
 #include "UnicodeMap.h"
 #include "Error.h"
+#include <string>
 
 static void printInfoString(FILE *f, Dict *infoDict, char *key,
                            char *text1, char *text2, UnicodeMap *uMap);
@@ -61,6 +62,7 @@ static int x = 0;
 static int y = 0;
 static int w = 0;
 static int h = 0;
+static int bbox = 0;
 static GBool physLayout = gFalse;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
@@ -103,6 +105,8 @@ static const ArgDesc argDesc[] = {
    "output end-of-line convention (unix, dos, or mac)"},
   {"-nopgbrk", argFlag,     &noPageBreaks,  0,
    "don't insert page breaks between pages"},
+  {"-bbox", argFlag,     &bbox,  0,
+   "output bounding box for each word and page size to html.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
    "owner password (for encrypted files)"},
   {"-upw",     argString,   userPassword,   sizeof(userPassword),
@@ -122,6 +126,34 @@ static const ArgDesc argDesc[] = {
   {NULL}
 };
 
+using namespace std;
+
+string myStringReplace( string inString, string oldToken, string newToken ){
+
+  size_t foundLoc;
+  int advance = 0;
+  do {
+    foundLoc = inString.find(oldToken, advance);
+    if (foundLoc != string::npos){
+      inString.replace( foundLoc, oldToken.length(), newToken );
+      advance = foundLoc + newToken.length();
+    }
+  } while (foundLoc != string::npos );
+  return(inString);
+}
+
+string myXmlTokenReplace( char* inString ){
+  string myString(inString);
+  myString = myStringReplace(myString,string("&"),string("&amp;") );
+  myString = myStringReplace(myString,string("'"),string("&apos;") );
+  myString = myStringReplace(myString,string("\""),string("&quot;") );
+  myString = myStringReplace(myString,string("<"),string("&lt;") );
+  myString = myStringReplace(myString,string(">"),string("&gt;") );
+  return(myString);
+}
+
+
+
 int main(int argc, char *argv[]) {
   PDFDoc *doc;
   GooString *fileName;
@@ -139,6 +171,9 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
+  if (bbox){
+    htmlMeta = gTrue;
+  }
   if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) 
{
     fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
     fprintf(stderr, "%s\n", popplerCopyright);
@@ -257,57 +292,98 @@ int main(int argc, char *argv[]) {
        goto err3;
       }
     }
-    fputs("<html>\n", f);
+    fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" 
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\";>",f);
+    fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\";>\n", f);
     fputs("<head>\n", f);
     doc->getDocInfo(&info);
     if (info.isDict()) {
-      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
-                     uMap);
+      Object obj;
+      if (info.getDict()->lookup("Title", &obj)->isString()) {
+       printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
+                       uMap);
+      } else {
+       fputs("<title></title>\n", f);
+      }
       printInfoString(f, info.getDict(), "Subject",
-                     "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Subject\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Keywords",
-                     "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Author",
-                     "<meta name=\"Author\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Author\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Creator",
-                     "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Creator\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Producer",
-                     "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Producer\" content=\"", "\"/>\n", uMap);
       printInfoDate(f, info.getDict(), "CreationDate",
-                   "<meta name=\"CreationDate\" content=\"\">\n");
+                   "<meta name=\"CreationDate\" content=\"\"/>\n");
       printInfoDate(f, info.getDict(), "LastModifiedDate",
-                   "<meta name=\"ModDate\" content=\"\">\n");
+                   "<meta name=\"ModDate\" content=\"\"/>\n");
     }
     info.free();
     fputs("</head>\n", f);
     fputs("<body>\n", f);
-    fputs("<pre>\n", f);
+    if (!bbox) fputs("<pre>\n", f);
     if (f != stdout) {
       fclose(f);
     }
   }
 
   // write text file
-  textOut = new TextOutputDev(textFileName->getCString(),
-                             physLayout, rawOrder, htmlMeta);
-  if (textOut->isOk()) {
-    if ((w==0) && (h==0) && (x==0) && (y==0)) {
-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 
0,
-                       gTrue, gFalse, gFalse);
-    } else {
-      int page;
-      
-      for (page = firstPage; page <= lastPage; ++page) {
+  if (bbox) {
+    textOut = new TextOutputDev("/dev/null",
+                               physLayout, rawOrder, htmlMeta);
+    if (!(f = fopen(textFileName->getCString(), "ab"))) {
+      error(-1, "Couldn't open text file '%s' for append", 
textFileName->getCString());
+      exitCode = 2;
+      goto err3;
+    }
+    
+    if (textOut->isOk()) {
+      fprintf(f, "<doc>\n");
+      for (int page = firstPage; page <= lastPage; ++page) {
+       fprintf(f, "  <page width=\"%f\" 
height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) );
+       doc->displayPage(textOut, page, resolution, resolution, 0,
+                        gTrue, gFalse, gFalse);
+       TextWordList *wordlist;
+       wordlist = textOut->makeWordList();
+       int word_length = wordlist->getLength();
+       TextWord *word;
+       double xMinA, yMinA, xMaxA, yMaxA;
+       if (!word_length)
+         fprintf(stderr, "no word list\n");
+       
+       for (int i=0; i < word_length; i++){
+         word = wordlist->get(i);
+         word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+         string myString = myXmlTokenReplace( (char*) word->getText() );
+         fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" 
yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, (char*) myString.c_str() 
);
+       }
+       fprintf(f, "  </page>\n");
+      }
+      fprintf(f, "</doc>\n");      
+    }
+    fclose(f);
+    
+  } else {
+    textOut = new TextOutputDev(textFileName->getCString(),
+                               physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      if ((w==0) && (h==0) && (x==0) && (y==0)) {
+       doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 
0,
+                         gTrue, gFalse, gFalse);
+      } else {
+       
+       for (int page = firstPage; page <= lastPage; ++page) {
        doc->displayPageSlice(textOut, page, resolution, resolution, 0,
                              gTrue, gFalse, gFalse, 
                              x, y, w, h);
-      }        
-    }  
-
-  } else {
+       }
+      }
+    } else {
     delete textOut;
     exitCode = 2;
     goto err3;
+    }
   }
   delete textOut;
 
@@ -322,7 +398,7 @@ int main(int argc, char *argv[]) {
        goto err3;
       }
     }
-    fputs("</pre>\n", f);
+    if (!bbox) fputs("</pre>\n", f);
     fputs("</body>\n", f);
     fputs("</html>\n", f);
     if (f != stdout) {

_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Re: [poppler] commit? bounding box html in pdftotext

Reply via email to