pdftotext -utf8

Roman Czyborra Fri, 23 Feb 2001 15:08:29 -0800
Dear Derek, while cracking (http://foolabs.com/xpdf/cracking.html) the
http://business-report.de/BGBL/bgbl1f/b101009f.pdf to make it
Braille-readable I discovered that your fine

pdftotext version 0.92
Copyright 1996-2000 Derek B. Noonburg
Usage: pdftotext [options] <PDF-file> [<text-file>]
  -f <int>        : first page to convert
  -l <int>        : last page to convert
  -ascii7         : convert to 7-bit ASCII (default is 8-bit ISO Latin-1)
  -latin2         : convert to ISO Latin-2 character set
  -latin5         : convert to ISO Latin-5 character set
  -raw            : keep strings in content stream order
  -upw <string>   : user password (for encrypted files)
  -q              : don't print any messages or errors
  -v              : print copyright and version info
  -h              : print usage information
  -help           : print usage information

uses incomplete character tables, leading to missing characters:

        5. In § 570b Abs. 3 wird die Angabe  § 569a Abs. 1 oder 2" 
        durch die Angabe  § 569 Abs. 1 oder 2" ersetzt.

The following minimal patch fixes this part of the problem for me:

*** TextOutputDev.cc    2000-12-04 05:28:07+01  1.1
--- TextOutputDev.cc    2001-02-23 22:41:14+01
***************
*** 143,150 ****
    "*",                                // bullet
    "...",                      // ellipsis
    "-", "-",                   // emdash, hyphen
!   "\"", "\"",                 // quotedblleft, quotedblright
!   "'",                                // quotesingle
    "TM"                                // trademark
  };
  
--- 143,150 ----
    "*",                                // bullet
    "...",                      // ellipsis
    "-", "-",                   // emdash, hyphen
!   "\"", "\"", "\"",           // quotedblleft, quotedblright, quotedblbase
!   "'", "'",                   // quotesingle, quotesinglbase
    "TM"                                // trademark
  };
  
*** TextOutputFontInfo.h        2000-12-04 05:28:07+01  1.1
--- TextOutputFontInfo.h        2001-02-23 23:02:30+01
***************
*** 232,238 ****
  static FontEncoding ascii7Encoding(ascii7EncodingNames,
                                     ascii7EncodingSize);
  
! #define isoLatin1EncodingSize 279
  static char *isoLatin1EncodingNames[isoLatin1EncodingSize] = {
    NULL,
    NULL,
--- 232,238 ----
  static FontEncoding ascii7Encoding(ascii7EncodingNames,
                                     ascii7EncodingSize);
  
! #define isoLatin1EncodingSize 281
  static char *isoLatin1EncodingNames[isoLatin1EncodingSize] = {
    NULL,
    NULL,
***************
*** 509,517 ****
    "ellipsis",
    "emdash",
    "hyphen",
!   "quotedblleft",
    "quotedblright",
!   "quotesingle",
    "trademark"
  };
  static FontEncoding isoLatin1Encoding(isoLatin1EncodingNames,
--- 509,517 ----
    "ellipsis",
    "emdash",
    "hyphen",
!   "quotedblbase", "quotedblleft",
    "quotedblright",
!   "quotesinglbase", "quotesingle",
    "trademark"
  };
  static FontEncoding isoLatin1Encoding(isoLatin1EncodingNames,
***************
*** 792,798 ****
  static FontEncoding isoLatin2Encoding(isoLatin2EncodingNames,
                                        isoLatin2EncodingSize);
  
! #define isoLatin5EncodingSize 279
  static char *isoLatin5EncodingNames[isoLatin5EncodingSize] = {
    NULL,
    NULL,
--- 792,798 ----
  static FontEncoding isoLatin2Encoding(isoLatin2EncodingNames,
                                        isoLatin2EncodingSize);
  
! #define isoLatin5EncodingSize 281
  static char *isoLatin5EncodingNames[isoLatin5EncodingSize] = {
    NULL,
    NULL,
***************
*** 1069,1077 ****
    "ellipsis",
    "emdash",
    "hyphen",
!   "quotedblleft",
    "quotedblright",
!   "quotesingle",
    "trademark"
  };
  static FontEncoding isoLatin5Encoding(isoLatin5EncodingNames,
--- 1069,1077 ----
    "ellipsis",
    "emdash",
    "hyphen",
!   "quotedblbase", "quotedblleft",
    "quotedblright",
!   "quotesinglbase", "quotesingle",
    "trademark"
  };
  static FontEncoding isoLatin5Encoding(isoLatin5EncodingNames,


Yet more such problem cases are shown by a table derived from the
http://partners.adobe.com/asn/developer/acrosdk/docs/PDFRef.pdf on
http://partners.adobe.com/asn/developer/technotes/acrobatpdf.html
namely: dagger + daggerdbl ++ florin fl. fraction / guilsinglleft <
guilsinglright > perthousand %o and the combining accents grave breve
caron circumflex dotaccent hungarumlaut ogonek ring tilde:

PDF     WIN     MAC     STD     XPDF    NAME
030     -       371     306             breve
031     -       377     317             caron
032     210     366     303             circumflex
033     -       372     307             dotaccent
034     -       375     315             hungarumlaut
035     -       376     316             ogonek
036     -       373     312             ring
037     230     367     304             tilde
140     140     140     301             grave
200     225     245     267     *       bullet
201     206     240     262             dagger
202     207     340     263             daggerdbl
203     205     311     274     ...     ellipsis
204     227     321     320     -       emdash
205     226     320     261            endash
206     203     304     246             florin
207     -       332     244             fraction
210     213     334     254             guilsinglleft
211     233     335     255             guilsinglright
212     -       -       -       -       minus
213     211     344     275             perthousand
214     204     343     271             quotedblbase
215     223     322     252     "       quotedblleft
216     224     323     272     "       quotedblright
217     221     324     140     `       quoteleft
220     222     325     047     '       quoteright
221     202     342     270             quotesinglbase
222     231     252     -       TM      trademark
223     -       336     256     fi      fi
224     -       337     257     fl      fl
225     -       -       350     L       Lslash
226     214     316     352     OE      OE
227     212     -       -       S       Scaron
230     237     331     -       Y       Ydieresis
231     216     -       -       *       Zcaron
232     -       365     365     i       dotlessi
233     -       -       370     l       lslash
234     234     317     372     oe      oe
235     232     -       -       s       scaron
236     236     -       -       *       zcaron
240     200     -       -       *       Euro

In file:/usr/src/xpdf-0.92/xpdf/TextOutputFontInfo.h you wrote:

// This file was automatically generated by makeTextFontInfo.

Yet makeTextFontInfo appears nowhere to be found and your charset data
structures look clumsy to extend, to me.  Now you bookkeep 4 options:

  -ascii7         : convert to 7-bit ASCII
 (default is 8-bit ISO Latin-1)
  -latin2         : convert to ISO Latin-2 character set
  -latin5         : convert to ISO Latin-5 character set

To preserve my German quotation marks and Euro currency signs one
would also like to have -cp1252 and -latin9, -cp1250 for Latin-2,
-cp1254 for Turkish, -cp1251 and -koi7f for Cyrillic &c. ad nauseam.
You could keep this can of worms closed by making pdftotext -utf8 a
filter into which we pipe PDF on standard input and transform this
into plain UTF-8 text on standard output to be piped into anybody's
favorite display filter to display the character quotedblbase directly
in http://mail.nl.linux.org/lists/linux-utf8/ as =E2=80=9E or in
http://czyborra.com/charsets/codepages.html#CP1252 as =84 or in
Latin-1 as substitute » or in ASCII as ,, or "

-
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/lists/
pdftotext -utf8

Reply via email to