Re: [poppler] Could not parse charref for nameToUnicode errors

Ed Catmur Fri, 21 Dec 2007 18:19:22 -0800

On Wed, 2007-12-19 at 14:11 +0000, Jonathan Kew wrote:
> On 19 Dec 2007, at 12:06 pm, Adrian Johnson wrote:
> > http://annarchy.freedesktop.org/~ajohnson/test.pdf
> >
> > The numbers "1", "2", and "3", are mapped to the text "test", "text",
> > and "the". The "Z" has the glyph name "g1" so it should be ignored  
> > when extracting text.
> >
> > I have found a bug in the code. With the test file I get
> >
> >  $ pdftotext test.pdf -
> >  Error: Could not parse charref for nameToUnicode: g1
> >  This is = test of text extr=?tion using the glyph n=mes
> >
> > The output should be:
> >  This is a test of text extraction using the glyph names
> >
> > It looks like the glyph names "u00061" and "u0063" are not decoded
> > correctly.
> 
> To be more specific, it looks as though the names are being  
> interpreted as decimal rather than hexadecimal.


The problem is that the uXXXX names are being eaten by the legacy block:

    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
    // two hex digits, and 'nn' is 2-4 decimal digits

The solution is to move that block after the code that handles uXXXX
names, which are known to be Unicode-style hex names.

Patch attached; it also reduces the error output.

Ed

--- poppler/GfxFont.cc	2007/12/21 01:05:59	1.2
+++ poppler/GfxFont.cc	2007/12/22 01:29:44
@@ -798,26 +798,28 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, cha
   // references and variants
   if (missing) {
     for (code = 0; code < 256; ++code) {
-      if ((charName = enc[code]) && !toUnicode[code] &&
-	  strcmp(charName, ".notdef")) {
-	if ((n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf), 
-			       gFalse, // don't check simple names (pass 1)
-			       gTrue, // do check ligatures
-			       globalParams->getMapNumericCharNames(),
-			       hex,
-			       gTrue))) // do check variants
-	  ctu->setMapping((CharCode)code, uBuf, n);
-	else
-	  error(-1, "Could not parse charref for nameToUnicode: %s", charName);
-      }
-    }
-
-  // if the 'mapUnknownCharNames' flag is set, do a simple pass-through
-  // mapping for unknown character names
-  } else if (missing && globalParams->getMapUnknownCharNames()) {
-    for (code = 0; code < 256; ++code) {
       if (!toUnicode[code]) {
-	toUnicode[code] = code;
+	if ((charName = enc[code]) && strcmp(charName, ".notdef")
+	    && (n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf), 
+				  gFalse, // don't check simple names (pass 1)
+				  gTrue, // do check ligatures
+				  globalParams->getMapNumericCharNames(),
+				  hex,
+				  gTrue))) { // do check variants
+	  ctu->setMapping((CharCode)code, uBuf, n);
+	} else if (globalParams->getMapUnknownCharNames()) {
+	  // if the 'mapUnknownCharNames' flag is set, do a simple pass-through
+	  // mapping for unknown character names
+	  if (charName && charName[0]) {
+	    for (n = 0; n < sizeof(uBuf)/sizeof(*uBuf); ++n)
+	      if (!(uBuf[n] = charName[n]))
+		break;
+	    ctu->setMapping((CharCode)code, uBuf, n);
+	  } else {
+	    uBuf[0] = code;
+	    ctu->setMapping((CharCode)code, uBuf, 1);
+	  }
+	}
       }
     }
   }
@@ -961,7 +963,7 @@ static int parseCharName(char *charName,
 {
   if (uLen <= 0) {
     error(-1, "Zero-length output buffer (recursion overflow?) in "
-	  "nameToUnicode: %s", charName);
+	  "parseCharName, component \"%s\"", charName);
     return 0;
   }
   // Step 1: drop all the characters from the glyph name starting with the
@@ -996,8 +998,8 @@ static int parseCharName(char *charName,
 			       ligaturesRecurse, numeric, hex, variants)))
 	  n += m;
 	else
-	  error(-1, "Could not parse ligature component in charref for "
-		"nameToUnicode: %s", charName);
+	  error(-1, "Could not parse ligature component \"%s\" of \"%s\" in "
+		"parseCharName", lig_part, charName);
       }
       lig_part = lig_end + 1;
     } while (lig_end && n < uLen);
@@ -1016,31 +1018,7 @@ static int parseCharName(char *charName,
     return 1;
   }
   if (numeric) {
-    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
-    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
-    // two hex digits, and 'nn' is 2-4 decimal digits
     unsigned int n = strlen(charName);
-    if (hex && n == 3 && isalpha(charName[0]) &&
-	isxdigit(charName[1]) && isxdigit(charName[2])) {
-      sscanf(charName+1, "%x", (unsigned int *)uBuf);
-      return 1;
-    } else if (hex && n == 2 &&
-	       isxdigit(charName[0]) && isxdigit(charName[1])) {
-      sscanf(charName, "%x", (unsigned int *)uBuf);
-      return 1;
-    } else if (!hex && n >= 2 && n <= 4 &&
-	       isdigit(charName[0]) && isdigit(charName[1])) {
-      uBuf[0] = (Unicode)atoi(charName);
-      return 1;
-    } else if (n >= 3 && n <= 5 &&
-	       isdigit(charName[1]) && isdigit(charName[2])) {
-      uBuf[0] = (Unicode)atoi(charName+1);
-      return 1;
-    } else if (n >= 4 && n <= 6 &&
-	       isdigit(charName[2]) && isdigit(charName[3])) {
-      uBuf[0] = (Unicode)atoi(charName+2);
-      return 1;
-    }
     // 3.3. otherwise, if the component is of the form "uni" (U+0075 U+006E
     // U+0069) followed by a sequence of uppercase hexadecimal digits (0 .. 9,
     // A .. F, i.e. U+0030 .. U+0039, U+0041 .. U+0046), the length of that
@@ -1081,6 +1059,30 @@ static int parseCharName(char *charName,
 	return 1;
       }
     }
+    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
+    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
+    // two hex digits, and 'nn' is 2-4 decimal digits
+    if (hex && n == 3 && isalpha(charName[0]) &&
+	isxdigit(charName[1]) && isxdigit(charName[2])) {
+      sscanf(charName+1, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (hex && n == 2 &&
+	       isxdigit(charName[0]) && isxdigit(charName[1])) {
+      sscanf(charName, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (!hex && n >= 2 && n <= 4 &&
+	       isdigit(charName[0]) && isdigit(charName[1])) {
+      uBuf[0] = (Unicode)atoi(charName);
+      return 1;
+    } else if (n >= 3 && n <= 5 &&
+	       isdigit(charName[1]) && isdigit(charName[2])) {
+      uBuf[0] = (Unicode)atoi(charName+1);
+      return 1;
+    } else if (n >= 4 && n <= 6 &&
+	       isdigit(charName[2]) && isdigit(charName[3])) {
+      uBuf[0] = (Unicode)atoi(charName+2);
+      return 1;
+    }
   }
   // 3.5. otherwise, map the component to the empty string
   return 0;

_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Re: [poppler] Could not parse charref for nameToUnicode errors

Reply via email to