Yikes,

I have rewritten the replace function with standard C (and attached it this time.)

-KB

On Wed, 22 Sep 2010, Kenneth Berland wrote:

I have rewritten the replace function with standard C.

-KB


On Sun, 11 Jul 2010, Albert Astals Cid wrote:

A Dimarts, 6 de juliol de 2010, Kenneth Berland va escriure:

Can I use std::string within any GooString methods I write (e.g. replace) or am I limited to the C Standard library (i.e. string.h)?

No std:: usage anywhere in poppler (except in the cpp frontend).

Albert


On Mon, 5 Jul 2010, Kenneth Berland wrote:

Can I use std::string within any GooString methods I write (e.g. replace) or am I limited to the C Standard library (i.e. string.h)?

-KB


On Tue, 8 Jun 2010, Albert Astals Cid wrote:

A Dimarts, 8 de juny de 2010, vàreu escriure:
Does GooString have a replace() method? I could not find one. Does this
mean I should write one?

Yes, you'll have to write one or get the char * from the GooString and use c-
string ones.

Albert


-KB

On Sun, 30 May 2010, Albert Astals Cid wrote:
> A Diumenge, 30 de maig de 2010, Kenneth Berland va escriure:
>> 1)  Since I sent my last diff, I've:
>>        a) added some string processing to make sure no HTML reserved
>> characters are placed into the output.  I process each word.
>>        b) altered the html a bit so that XML parsers can deal with it.
>> I've put in a title tag or an empty title tag and added end tags to the
>> meta tags.
>>
>> 2)  Addressing your concerns:
>>        a) I've removed the initialization of stdout.
>>        b) I close f now and reopen it. This also removes the warning.
>>        c) If a user is running with the -bbox option, they want word
>> bounding boxes. If there are no words, I think a line to stderr is
>> appropriate.
>
> Cool, though we try not to use the std (yeah it sucks i know), can you
> either use GooString or char *?
>
> Thanks,
>  Albert
>
>> -KB
>>
>> On Wed, 26 May 2010, Albert Astals Cid wrote:
>>> A Dimecres, 26 de maig de 2010, Kenneth Berland va escriure:
>>>> I get a compiler warning without it.
>>>>
>>>> pdftotext.cc: In function ‘int main(int, char**)’:
>>>> pdftotext.cc:164: warning: ‘f’ may be used uninitialized in this
>>>> function
>>>
>>> That change will not get accepted, sorry, initializing f to stdout is
>>> not a solution.
>>>
>>> Also i do not like the fact that you do not close f if you are writing
>>> the bbox? Can't you just open it again like the code already does?
>>>
>>> Also i do not understand why the code considers a page having no text
>>> an error.
>>>
>>> Albert
>>>
>>>> -KB
>>>>
>>>> On Wed, 26 May 2010, Albert Astals Cid wrote:
>>>>> A Diumenge, 9 de maig de 2010, Kenneth Berland va escriure:
>>>>>> List,
>>>>>>
>>>>>> I've attached a small addition to pdftotext that outputs bounding
>>>>>> box information to html like this:
>>>>>>
>>>>>> <doc>
>>>>>>    <page width="612.000000" height="792.000000"/>
>>>>>> <word xMin="56.800000" yMin="57.208000" xMax="75.412000"
>>>>>> yMax="70.492000">The</word> </page>
>>>>>> </doc>
>>>>>>
>>>>>> I had a need, maybe others will too.
>>>>>>
>>>>>> -KB
>>>>>
>>>>> Why is this change necessary?
>>>>>
>>>>> -  FILE *f;
>>>>> +  FILE *f = stdout;
>>>>>
>>>>> Albert
>>>
>>> _______________________________________________
>>> poppler mailing list
>>> [email protected]
>>> http://lists.freedesktop.org/mailman/listinfo/poppler
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d4e004b..889a6c4 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -61,6 +61,7 @@ static int x = 0;
 static int y = 0;
 static int w = 0;
 static int h = 0;
+static int bbox = 0;
 static GBool physLayout = gFalse;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
@@ -103,6 +104,8 @@ static const ArgDesc argDesc[] = {
    "output end-of-line convention (unix, dos, or mac)"},
   {"-nopgbrk", argFlag,     &noPageBreaks,  0,
    "don't insert page breaks between pages"},
+  {"-bbox", argFlag,     &bbox,  0,
+   "output bounding box for each word and page size to html.  Sets -htmlmeta"},
   {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
    "owner password (for encrypted files)"},
   {"-upw",     argString,   userPassword,   sizeof(userPassword),
@@ -122,6 +125,47 @@ static const ArgDesc argDesc[] = {
   {NULL}
 };
 
+char* str_replace(char *str, char *oldstr, char *newstr) { // Returns a newly allocated copy of str with every occurrence of oldstr replaced by newstr; the caller must free() the result. str itself is only read.
+  int i, count = 0; // count = number of oldstr matches found in str
+  int newlen = strlen(newstr);
+  int oldlen = strlen(oldstr);
+  // Pass 1: count the matches. NOTE(review): with an empty oldstr every position matches and oldlen is 0, so i never advances and this loop does not end for a non-empty str.
+  for (i = 0; str[i]; ++i)
+    if (strstr(&str[i], oldstr) == &str[i]) // true only when oldstr begins exactly at index i
+      ++count, i += oldlen - 1; // jump to the last char of the match; the ++i of the loop then steps past it
+  // Here i equals the length of str. Buffer size = that length + 1 for the terminator + net growth (or shrinkage) per match.
+  char *ret = (char *) calloc(i + 1 + count * (newlen - oldlen), sizeof(char)); // NOTE(review): the calloc result is not checked for NULL before it is written to below
+  // Pass 2: build the output, consuming str from the front.
+  i = 0; // i is reused as the write index into ret
+  while (*str)
+    if (strstr(str, oldstr) == str) // match at the current read position
+      strcpy(&ret[i], newstr), // emit the replacement text
+       i += newlen, // advance the write index past the replacement
+       str += oldlen; // advance the read pointer past the matched text
+    else
+      ret[i++] = *str++; // no match here: copy one character through unchanged
+  
+  ret[i] = '\0';
+  
+  return ret;
+}
+
+
+char* myXmlTokenReplace( char* inString ){ // Returns a new copy of inString with ampersand, apostrophe, double quote, less-than and greater-than replaced by their XML entities; the caller must free() the result.
+  char* new0 = str_replace(inString,"&","&amp;"); // ampersand goes first so the ampersands inside the entities added below are not escaped a second time
+  char* new1 = str_replace(new0,"'","&apos;"); // new0 and new1 alternate as input and output of each step
+  free(new0); // each intermediate copy is released as soon as the next one has been built from it
+  new0 = str_replace(new1,"\"","&quot;");
+  free(new1);
+  new1 = str_replace(new0,"<","&lt;");
+  free(new0);
+  new0 = str_replace(new1,">","&gt;");
+  free(new1);
+  return( new0 ); // the only buffer still allocated; ownership passes to the caller
+}
+
+
+
 int main(int argc, char *argv[]) {
   PDFDoc *doc;
   GooString *fileName;
@@ -139,6 +183,9 @@ int main(int argc, char *argv[]) {
 
   // parse args
   ok = parseArgs(argDesc, &argc, argv);
+  if (bbox){
+    htmlMeta = gTrue;
+  }
   if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) 
{
     fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
     fprintf(stderr, "%s\n", popplerCopyright);
@@ -257,57 +304,99 @@ int main(int argc, char *argv[]) {
        goto err3;
       }
     }
-    fputs("<html>\n", f);
+    fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" 
\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\";>",f);
+    fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\";>\n", f);
     fputs("<head>\n", f);
     doc->getDocInfo(&info);
     if (info.isDict()) {
-      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
-                     uMap);
+      Object obj;
+      if (info.getDict()->lookup("Title", &obj)->isString()) {
+       printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
+                       uMap);
+      } else {
+       fputs("<title></title>\n", f);
+      }
       printInfoString(f, info.getDict(), "Subject",
-                     "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Subject\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Keywords",
-                     "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Author",
-                     "<meta name=\"Author\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Author\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Creator",
-                     "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Creator\" content=\"", "\"/>\n", uMap);
       printInfoString(f, info.getDict(), "Producer",
-                     "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+                     "<meta name=\"Producer\" content=\"", "\"/>\n", uMap);
       printInfoDate(f, info.getDict(), "CreationDate",
-                   "<meta name=\"CreationDate\" content=\"\">\n");
+                   "<meta name=\"CreationDate\" content=\"\"/>\n");
       printInfoDate(f, info.getDict(), "LastModifiedDate",
-                   "<meta name=\"ModDate\" content=\"\">\n");
+                   "<meta name=\"ModDate\" content=\"\"/>\n");
     }
     info.free();
     fputs("</head>\n", f);
     fputs("<body>\n", f);
-    fputs("<pre>\n", f);
+    if (!bbox) fputs("<pre>\n", f);
     if (f != stdout) {
       fclose(f);
     }
   }
 
   // write text file
-  textOut = new TextOutputDev(textFileName->getCString(),
-                             physLayout, rawOrder, htmlMeta);
-  if (textOut->isOk()) {
-    if ((w==0) && (h==0) && (x==0) && (y==0)) {
-      doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 
0,
-                       gTrue, gFalse, gFalse);
-    } else {
-      int page;
-      
-      for (page = firstPage; page <= lastPage; ++page) {
+  if (bbox) {
+    textOut = new TextOutputDev("/dev/null",
+                               physLayout, rawOrder, htmlMeta);
+    if (!(f = fopen(textFileName->getCString(), "ab"))) {
+      error(-1, "Couldn't open text file '%s' for append", 
textFileName->getCString());
+      exitCode = 2;
+      goto err3;
+    }
+    
+    if (textOut->isOk()) {
+      fprintf(f, "<doc>\n");
+      for (int page = firstPage; page <= lastPage; ++page) {
+       fprintf(f, "  <page width=\"%f\" 
height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page) );
+       doc->displayPage(textOut, page, resolution, resolution, 0,
+                        gTrue, gFalse, gFalse);
+       TextWordList *wordlist;
+       wordlist = textOut->makeWordList();
+       int word_length = wordlist->getLength();
+       TextWord *word;
+       double xMinA, yMinA, xMaxA, yMaxA;
+       if (!word_length)
+         fprintf(stderr, "no word list\n");
+       
+       for (int i=0; i < word_length; i++){
+         word = wordlist->get(i);
+         word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+         char* replacedText = myXmlTokenReplace( (char*) word->getText() );
+         fprintf(f,"    <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" 
yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, replacedText );
+         free(replacedText);
+       }
+       fprintf(f, "  </page>\n");
+      }
+      fprintf(f, "</doc>\n");      
+    }
+    fclose(f);
+    
+  } else {
+    textOut = new TextOutputDev(textFileName->getCString(),
+                               physLayout, rawOrder, htmlMeta);
+    if (textOut->isOk()) {
+      if ((w==0) && (h==0) && (x==0) && (y==0)) {
+       doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 
0,
+                         gTrue, gFalse, gFalse);
+      } else {
+       
+       for (int page = firstPage; page <= lastPage; ++page) {
        doc->displayPageSlice(textOut, page, resolution, resolution, 0,
                              gTrue, gFalse, gFalse, 
                              x, y, w, h);
-      }        
-    }  
-
-  } else {
+       }
+      }
+    } else {
     delete textOut;
     exitCode = 2;
     goto err3;
+    }
   }
   delete textOut;
 
@@ -322,7 +411,7 @@ int main(int argc, char *argv[]) {
        goto err3;
       }
     }
-    fputs("</pre>\n", f);
+    if (!bbox) fputs("</pre>\n", f);
     fputs("</body>\n", f);
     fputs("</html>\n", f);
     if (f != stdout) {
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to