Hi all,

I have implemented a patch intended to improve searching and copying
text from pdf files produced by xdvipdfmx.

With this patch xdvipdfmx will attempt to extract Unicode values from
glyph names for PUA glyphs and alphabetic presentation forms just like
it currently does for unencoded glyphs (the PUA Unicode value is still
used as a fallback choice). For those TTF fonts which have no glyph
names in their 'post' table there is an attempt to recover the source
Unicode by reversing the GSUB rules. Thus it is now possible to enable
typographic features in fonts which ship with MS Windows (e. g. Times
New Roman or Palatino Linotype) without breaking the document
searchability.

I see there is no bug tracker for texlive, so I am sending this patch
here hoping that somebody can test and apply it.

BTW I have noticed the following bug in the current texlive source
tree: xdvipdfmx is linked with cidtype0.o for dvipdfmx, which causes it
to fail on any attempt to load an OT-CFF font. This can be fixed by
adding cidtype0.[ch] to dvipdfmx_SOURCES in Makefile.am (and rerunning
automake).

-- 
Regards,
Alexey Kryukov <anagnost at yandex dot ru>

Moscow State University
Faculty of History
--- tt_gsub.h.bak	2013-12-14 06:32:52.000000000 +0400
+++ tt_gsub.h	2013-12-15 00:40:12.000000000 +0400
@@ -57,6 +57,12 @@
                                 USHORT *gid_in, USHORT num_gids,
                                 USHORT *gid_out);
 
+#ifdef XETEX
+extern int otl_gsub_lookup_glyph (otl_gsub *gsub_list,
+                                USHORT gid_in,
+                                ULONG *gid_out);
+#endif
+
 #if  0  
 extern int  otl_gsub_dump      (otl_gsub *gsub_list,
                                 const char *script,
--- tt_gsub.c.bak	2013-12-14 06:32:42.000000000 +0400
+++ tt_gsub.c	2013-12-16 00:57:48.000000000 +0400
@@ -1524,6 +1524,166 @@
   return -1;
 }
 
+#ifdef XETEX
+/* Take a substitution glyph and look where it might come from. This is the
+ * reverse of what clt_lookup_coverage does; both coverage formats are handled.
+ * The returned value is a GID rather than an index in the coverage table
+ * (delta arithmetic is modulo 65536, hence the masks). Returns -1 if not found.
+ */
+static long
+clt_single_subst_rev (struct otl_gsub_subtab *subtab, USHORT gid)
+{
+  long i, j;
+
+  if (subtab->SubstFormat == 1) { /* constant delta */
+    struct otl_gsub_single1 *data = (subtab->table).single1;
+    struct clt_coverage *cov = &(data->coverage);
+
+    if (cov->format == 1) {
+      for (i = 0; i < cov->count; i++) {
+        if (((cov->list[i] + data->DeltaGlyphID) & 0xffff) == gid)
+          return cov->list[i]; /* found */
+      }
+    /* Fontforge generates coverage format 2 even for single substitution format 1 */
+    } else if (cov->format == 2) {
+      for (i = 0; i < cov->count; i++) {
+        struct clt_range *rng = &(cov->range[i]);
+        for (j = rng->Start; j <= rng->End; j++) {
+          if (((j + data->DeltaGlyphID) & 0xffff) == gid)
+            return j; /* found */
+        }
+      }
+    }
+  } else if (subtab->SubstFormat == 2) { /* explicit substitute array */
+    struct otl_gsub_single2 *data = (subtab->table).single2;
+    struct clt_coverage *cov = &(data->coverage);
+    for (i = 0; i < data->GlyphCount; i++) {
+      if (gid != data->Substitute[i])
+        continue;
+      /* i is a coverage index; only format 2 tables are read via cov->range */
+      if (cov->format == 1)
+        return (i < cov->count) ? cov->list[i] : -1;
+      for (j = 0; cov->format == 2 && j < cov->count; j++) {
+        struct clt_range *rng = &(cov->range[j]);
+        long off = i - rng->StartCoverageIndex;
+        if (off >= 0 && off <= rng->End - rng->Start)
+          return rng->Start + off; /* found */
+      }
+    }
+  } else {
+    ERROR("Unknown coverage format");
+  }
+
+  return -1; /* not found */
+}
+
+/* For alternates and ligatures just take a coverage index (which is already
+ * known from other data) and retrieve the source glyph GID, or -1 if none. */
+static long
+clt_lookup_coverage_rev (struct clt_coverage *cov, USHORT idx)
+{
+  USHORT i;
+
+  if (cov->format == 1) /* glyph list: the index addresses it directly */
+    return (idx < cov->count) ? cov->list[idx] : -1;
+  for (i = 0; cov->format == 2 && i < cov->count; i++) { /* glyph ranges */
+    struct clt_range *rng = &(cov->range[i]);
+    long off = (long) idx - rng->StartCoverageIndex;
+    if (off >= 0 && off <= rng->End - rng->Start)
+      return rng->Start + off; /* found */
+  }
+  return -1; /* not found */
+}
+
+/* Reverse GSUB lookup: store in gids_out the source GID(s) which gid_in
+ * substitutes and return their count (1, or CompCount for a ligature);
+ * return -1 if gsub_list is NULL or no rule with a known source yields gid_in.
+ * NOTE(review): gids_out size is unknown here; it must fit a whole ligature.
+ */
+int otl_gsub_lookup_glyph ( otl_gsub *gsub_list,
+                            USHORT gid_in,
+                            ULONG *gids_out)
+{
+  int i, j, ret;
+  struct otl_gsub_tab *gsub;
+  struct otl_gsub_subtab *subtab;
+
+  for (i = 0; gsub_list && i < gsub_list->num_gsubs; i++) {
+    gsub = &(gsub_list->gsubs[i]);
+    for (j = 0; j < gsub->num_subtables; j++) {
+      subtab = &(gsub->subtables[j]);
+      switch ((int) subtab->LookupType) {
+      case OTL_GSUB_TYPE_SINGLE:
+        ret = clt_single_subst_rev(subtab, gid_in);
+        if (ret >= 0) {
+          if (verbose > 2)
+            MESG("Found a single substitution rule format %d: output glyph GID %d, source glyph GID %d\n",
+                subtab->SubstFormat, gid_in, ret);
+          gids_out[0] = ret;
+          return 1; /* found */
+        }
+        break;
+      case OTL_GSUB_TYPE_ALTERNATE:
+        if (subtab->SubstFormat == 1) {
+          struct otl_gsub_alternate1 *data = (subtab->table).alternate1;
+          struct clt_coverage *cov = &(data->coverage);
+          USHORT k, l;
+          for (k = 0; k < data->AlternateSetCount; k++) {
+            struct otl_gsub_altset *altset = &(data->AlternateSet[k]);
+            for (l = 0; l < altset->GlyphCount; l++) {
+              if (altset->Alternate[l] != gid_in)
+                continue;
+              ret = clt_lookup_coverage_rev(cov, k);
+              if (ret < 0)
+                break; /* this set has no known source glyph: try the next */
+              if (verbose > 2)
+                MESG("Found an alternate substitution rule: output glyph GID %d, source glyph GID %d\n",
+                    gid_in, ret);
+              gids_out[0] = ret;
+              return 1; /* found */
+            }
+          }
+        }
+        break;
+      case OTL_GSUB_TYPE_LIGATURE:
+        if (subtab->SubstFormat == 1) {
+          struct otl_gsub_ligature1 *data = subtab->table.ligature1;
+          struct clt_coverage *cov = &(data->coverage);
+          USHORT k, l, m;
+          USHORT cnt, lastcnt=0;
+          for (k = 0; k < data->LigSetCount; k++) {
+            struct otl_gsub_ligset *ligset = &(data->LigatureSet[k]);
+            long first = clt_lookup_coverage_rev(cov, k); /* 1st component */
+            for (l = 0; first >= 0 && l < ligset->LigatureCount; l++) {
+              struct otl_gsub_ligtab *lig = &(ligset->Ligature[l]);
+              cnt = lig->CompCount;
+              /* Prefer ligatures with a larger number of components to avoid situations
+               * like matching 'fi' and 'i' glyphs for the 'ffi' ligature */
+              if (lig->LigGlyph == gid_in && lastcnt < cnt) {
+                gids_out[0] = first;
+                for (m=1; m<cnt; m++)
+                  gids_out[m] = lig->Component[m-1];
+                lastcnt = cnt;
+              }
+            }
+            if (lastcnt > 0) {
+              if (verbose > 2)
+                MESG("Found a ligature substitution rule for source glyph GID %d, has %d components\n",
+                    gid_in, lastcnt);
+              return lastcnt; /* found */
+            }
+          }
+        }
+        break;
+      default:
+        break;
+      }
+    }
+  }
+  return -1; /* not found */
+}
+#endif
+
 #if  0
 static int
 otl_gsub_dump_single (struct otl_gsub_subtab *subtab)
--- tt_cmap.c.bak	2013-12-14 06:21:04.000000000 +0400
+++ tt_cmap.c	2013-12-16 00:59:59.000000000 +0400
@@ -33,7 +33,6 @@
 #include "error.h"
 
 #include "sfnt.h"
-#include "tt_gsub.h"
 
 /* Sorry for placing this here.
  * We need to rewrite TrueType font support code...
@@ -839,6 +838,80 @@
   return 1;
 }
 
+#ifdef XETEX
+/* Reverse cmap lookup: return the first character code which the format 4
+ * or format 12 subtable in ttcmap maps to glyph tgid, or -1 if there is
+ * none (or if the subtable has any other format).
+ */
+static long cmap_uni_from_gid (tt_cmap *ttcmap,USHORT tgid) {
+  if (ttcmap->format == 4) {
+    struct cmap4 *map = ttcmap->map;
+    USHORT c0, c1, gid, uch;
+    USHORT i, d, segCount;
+    long   j; /* wider than USHORT, so that a 0..0xFFFF segment terminates */
+
+    segCount = map->segCountX2 / 2;
+    for (i = 0; i < segCount; i++) {
+      c0 = map->startCount[i];
+      c1 = map->endCount[i];
+      d  = map->idRangeOffset[i] / 2 - (segCount - i);
+      for (j = 0; j <= c1 - c0; j++) {
+        uch = c0 + j;
+
+        if (map->idRangeOffset[i] == 0) {
+          gid = (uch + map->idDelta[i]) & 0xffff;
+        } else {
+          gid = (map->glyphIndexArray[j+d] +
+                 map->idDelta[i]) & 0xffff;
+        }
+        if (tgid == gid)
+          return uch;
+      }
+    }
+  } else if (ttcmap->format == 12) {
+    ULONG  i, uch;
+    USHORT gid;
+    struct cmap12 *map = ttcmap->map;
+
+    for (i = 0; i < map->nGroups; i++) {
+      for (uch  = map->groups[i].startCharCode;
+           uch <= map->groups[i].endCharCode; uch++) {
+        /* only the low 16 bits of the glyph ID are compared with tgid */
+        long d = uch - map->groups[i].startCharCode;
+
+        gid = (USHORT) ((map->groups[i].startGlyphID + d) & 0xffff);
+        if (tgid == gid)
+          return uch;
+      }
+    }
+  }
+  return -1;
+}
+
+/* Nonzero iff uni is a PUA code point or an Alphabetic Presentation Form. */
+static int is_unicode_PUA_or_presentation (ULONG uni) {
+  if (uni < 0xE000 || (uni > 0xF8FF && uni < 0xFB00)) return 0;
+  return uni <= 0xFB4F || (0xF0000 <= uni && uni <= 0xFFFFD) || (0x100000 <= uni && uni <= 0x10FFFD);
+}
+#endif
+
+/* Enter into cmap a bfchar mapping from gid (two bytes, big-endian) to the
+ * UTF-16BE encoding of the unicode_count code points found in unicodes. */
+static void
+store_to_CMap (CMap *cmap, USHORT gid, int  unicode_count, long *unicodes)
+{
+  /* the text is built from wbuf[2] on; WBUF_SIZE is trusted to suffice */
+  unsigned char *dst = wbuf + 2;
+  long  nbytes = 0;
+  int   idx;
+  for (idx = 0; idx < unicode_count; idx++) {
+    nbytes += UC_sput_UTF16BE(unicodes[idx], &dst, wbuf+WBUF_SIZE);
+  }
+  wbuf[1] =  gid & 0xff;
+  wbuf[0] = (gid >> 8) & 0xff;
+  CMap_add_bfchar(cmap, wbuf, 2, wbuf + 2, nbytes);
+}
+
 /*
  * Substituted glyphs:
  *
@@ -850,11 +923,47 @@
 
 static USHORT
 handle_subst_glyphs (CMap *cmap,
-		     CMap *cmap_add, const char *used_glyphs,
+		     CMap *cmap_add,
+		     tt_cmap *ttcmap,
+		     const char *used_glyphs,
 		     sfnt *sfont)
 {
   USHORT count;
   USHORT i, gid;
+#ifdef XETEX
+  otl_gsub *gsub_list = NULL, *gsub_list_min = NULL;
+  int has_gsub;
+  USHORT *gid_out;
+
+  if (!FT_HAS_GLYPH_NAMES(sfont->ft_face)) {
+    gsub_list = otl_gsub_new();
+    has_gsub = otl_gsub_add_feat(gsub_list, "*", "*", "*", sfont);
+    if (has_gsub < 0 && verbose > VERBOSE_LEVEL_MIN)
+      WARN("This font seems to have neither glyph names nor GSUB substitutions.");
+    else {
+      /* A "minimalistic" subset of the font's GSUB, which will include only most
+       * commonly used features and only for the default language system of a few
+       * most commonly used scripts. The idea is to reduce the processing time by
+       * applying first this downsized list (which is supposed to be sufficient to
+       * resolve most substitution glyphs) and then, if necessary, making a second
+       * attempt with the full set of GSUB rules. By this way we also prevent some
+       * language specific features (such as Turkish case mappings for i/dotlessi)
+       * from being occasionally triggered, as we can't correctly support them
+       * anyway and should care about producing results suitable for the majority
+       * of users.
+       * Note that this subset doesn't include the 'c2sc' feature. This is done
+       * to prevent it from being preferred over 'smcp', thus causing the unicode
+       * values of capital letters to be assigned to small capitals (this would
+       * be the case for the version of Times New Roman which ships with Windows 8.1).
+       * Of course if the font has a separate set of 'c2sc' glyphs, then they still
+       * can be successfully found at the second pass.
+       */
+      gsub_list_min = otl_gsub_new();
+      otl_gsub_add_feat(gsub_list_min, "(DFLT|latn|cyrl|grek|hebr|arab|armn)", "dflt",
+        "(liga|dlig|rlig|ccmp|smcp|hist|hlig|onum|pnum|tnum|isol|fina|medi|init|salt|ss\?\?)", sfont);
+    }
+  }
+#endif
 
   for (count = 0, i = 0; i < 8192; i++) {
     int   j;
@@ -872,36 +981,59 @@
 	continue;
 
       if (!cmap_add) {
-#if XETEX
+#ifdef XETEX
+        int  unicode_count = -1;
         if (FT_HAS_GLYPH_NAMES(sfont->ft_face)) {
           /* JK: try to look up Unicode values from the glyph name... */
 #define MAX_UNICODES	16
 #define MAX_NAME	256
           static char name[MAX_NAME] = "(none)";
-          long unicodes[MAX_UNICODES];
-          int  unicode_count = -1;
+          ULONG unicodes[MAX_UNICODES];
 	  FT_Error err = FT_Get_Glyph_Name(sfont->ft_face, gid, name, MAX_NAME);
+#undef MAX_NAME
           if (!err) {
             unicode_count = agl_get_unicodes(name, unicodes, MAX_UNICODES);
           }
-#undef MAX_UNICODES
-#undef MAX_NAME
-          if (unicode_count == -1) {
+          if (unicode_count == -1)
             MESG("No Unicode mapping available: GID=%u, name=%s\n", gid, name);
+          else
+            store_to_CMap (cmap, gid, unicode_count, unicodes);
+	}
+
+        /* If the glyph name check fails (e. g. because there are no glyph names
+         * in the 'post' table of a TrueType font), then attempt to get
+         * a Unicode value by reversing the GSUB table
+         */
+        if (unicode_count == -1 && has_gsub >= 0) {
+          ULONG unicodes[MAX_UNICODES];
+#undef MAX_UNICODES
+          unicode_count = otl_gsub_lookup_glyph(gsub_list_min, gid, unicodes);
+          if (unicode_count > 0) {
+            int  k;
+            for (k = 0; k < unicode_count; ++k)
+              unicodes[k] = cmap_uni_from_gid(ttcmap, unicodes[k]);
+            store_to_CMap (cmap, gid, unicode_count, unicodes);
           } else {
-            /* the Unicode characters go into wbuf[2] and following, in UTF16BE */
-            /* we rely on WBUF_SIZE being more than adequate for MAX_UNICODES  */
-            unsigned char* p = wbuf + 2;
+            unicode_count = otl_gsub_lookup_glyph(gsub_list, gid, unicodes);
+            if (unicode_count > 0) {
             int  k;
-            len = 0;
-            for (k = 0; k < unicode_count; ++k) {
-              len += UC_sput_UTF16BE(unicodes[k], &p, wbuf+WBUF_SIZE);
+              for (k = 0; k < unicode_count; ++k)
+                unicodes[k] = cmap_uni_from_gid(ttcmap, unicodes[k]);
+              store_to_CMap (cmap, gid, unicode_count, unicodes);
+            }
             }
-            wbuf[0] = (gid >> 8) & 0xff;
-            wbuf[1] =  gid & 0xff;
-            CMap_add_bfchar(cmap, wbuf, 2, wbuf + 2, len);
           }
+
+        /* If nothing else helps and the glyph belongs to PUA or presentation
+         * forms, then just map it to its initial Unicode value
+         */
+        if (unicode_count == -1) {
+          ULONG unicodes[0];
+          unicodes[0] = cmap_uni_from_gid(ttcmap, gid);
+          if (is_unicode_PUA_or_presentation (unicodes[0]))
+            store_to_CMap (cmap, gid, 1, unicodes);
 	}
+
 #else
 	WARN("No Unicode mapping available: GID=%u", gid);
 #endif
@@ -938,18 +1070,17 @@
     }
   }
 
+#ifdef XETEX
+  if (gsub_list) {
+    otl_gsub_release(gsub_list);
+    otl_gsub_release(gsub_list_min);
+  }
+#endif
   return count;
 }
 
-static int is_unicode_PUA ( ULONG uni )
-{
-  if ((uni >= 0xE000 && uni <= 0xF8FF) || (uni >= 0xF0000 && uni <= 0xFFFFD) ||  (uni >= 0x100000 && uni <= 0x10FFFD))
-    return true;
-  return false;
-}
-
 static pdf_obj *
-create_ToUnicode_cmap4 (struct cmap4 *map,
+create_ToUnicode_cmap4 (tt_cmap *ttcmap,
 			const char *cmap_name, CMap *cmap_add,
 			const char *used_glyphs,
 			sfnt *sfont)
@@ -959,6 +1090,7 @@
   USHORT    c0, c1, gid, count, ch;
   USHORT    i, j, d, segCount;
   char      used_glyphs_copy[8192];
+  struct cmap4 *map = ttcmap->map;
 
   cmap = CMap_new();
   CMap_set_name (cmap, cmap_name);
@@ -976,8 +1108,12 @@
     d  = map->idRangeOffset[i] / 2 - (segCount - i);
     for (j = 0; j <= c1 - c0; j++) {
       ch = c0 + j;
-      if (is_unicode_PUA(ch))
+
+#ifdef XETEX
+      /* Skip PUA glyphs and alphabetic presentation forms. Will handle them later */
+      if (is_unicode_PUA_or_presentation(ch))
         continue;
+#endif
 
       if (map->idRangeOffset[i] == 0) {
 	gid = (ch + map->idDelta[i]) & 0xffff;
@@ -1009,7 +1145,7 @@
     }
   }
 
-  count += handle_subst_glyphs(cmap, cmap_add, used_glyphs_copy, sfont);
+  count += handle_subst_glyphs(cmap, cmap_add, ttcmap, used_glyphs_copy, sfont);
 
   if (count < 1)
     stream = NULL;
@@ -1022,7 +1158,7 @@
 }
 
 static pdf_obj *
-create_ToUnicode_cmap12 (struct cmap12 *map,
+create_ToUnicode_cmap12 (tt_cmap *ttcmap,
 			 const char *cmap_name, CMap *cmap_add,
 			 const char *used_glyphs,
 			 sfnt *sfont)
@@ -1032,6 +1168,7 @@
   ULONG     i, ch;
   USHORT    gid, count;
   char      used_glyphs_copy[8192];
+  struct cmap12 *map = ttcmap->map;
 
   cmap = CMap_new();
   CMap_set_name (cmap, cmap_name);
@@ -1047,8 +1184,12 @@
       unsigned char *p;
       int      len;
       long     d;
-      if (is_unicode_PUA(ch))
+
+#ifdef XETEX
+      /* Skip PUA glyphs and alphabetic presentation forms. Will handle them later */
+      if (is_unicode_PUA_or_presentation(ch))
         continue;
+#endif
 
       p   = wbuf + 2;
       d   = ch - map->groups[i].startCharCode;
@@ -1065,7 +1206,7 @@
     }
   }
 
-  count += handle_subst_glyphs(cmap, cmap_add, used_glyphs_copy, sfont);
+  count += handle_subst_glyphs(cmap, cmap_add, ttcmap, used_glyphs_copy, sfont);
 
   if (count < 1)
     stream = NULL;
@@ -1186,12 +1327,12 @@
     if (!ttcmap)
       continue;
     if (ttcmap->format == 4) {
-      cmap_obj = create_ToUnicode_cmap4(ttcmap->map,
+      cmap_obj = create_ToUnicode_cmap4(ttcmap,
 					cmap_name, cmap_add, used_glyphs, sfont);
       break;
     }
     if (ttcmap->format == 12) {
-      cmap_obj = create_ToUnicode_cmap12(ttcmap->map,
+      cmap_obj = create_ToUnicode_cmap12(ttcmap,
 				       cmap_name, cmap_add, used_glyphs, sfont);
       break;
     }

--------------------------------------------------
Subscriptions, Archive, and List information, etc.:
  http://tug.org/mailman/listinfo/xetex

Reply via email to