The code for the \XeTeXgenerateactualtext feature (it's an integer parameter; set it to 1 to get ActualText added to the PDF, for better copy/paste and search in Acrobat) is now on sourceforge, in an "actualtext" branch, for anyone who wants to try building and experimenting with it.

Note that this requires a new version of xdvipdfmx, as it uses a new DVI opcode. The patch for xdvipdfmx is attached here (based on the current TeXLive svn source).

Akira, if you could check that the patch seems OK, that would be great. I've not really looked at dvipdfm-x code in a long time. I haven't pushed this to TL yet, as it's all rather experimental, but I hope we can safely include it for TL'16.

JK
Index: texk/dvipdfm-x/ChangeLog
===================================================================
--- texk/dvipdfm-x/ChangeLog	(revision 39834)
+++ texk/dvipdfm-x/ChangeLog	(working copy)
@@ -1,3 +1,8 @@
+2016-02-23  Jonathan Kew  <[email protected]>
+
+	* dvicodes.h, dvi.c, pdfdev.[ch]: Support xetex 0.99995 output
+	(dvi format 7) that includes ActualText content.
+
 2016-02-17  Akira Kakuto  <[email protected]>
 
 	* spc_pdfm.c: Remaining fix for that on 2016-02-13.
Index: texk/dvipdfm-x/dvi.c
===================================================================
--- texk/dvipdfm-x/dvi.c	(revision 39834)
+++ texk/dvipdfm-x/dvi.c	(working copy)
@@ -357,13 +357,13 @@
   /* file_position now points to last non padding character or
    * beginning of file */
   if (dvi_file_size - current < 4 || current == 0 ||
-      !(ch == DVI_ID || ch == DVIV_ID || ch == XDV_ID)) {
+      !(ch == DVI_ID || ch == DVIV_ID || ch == XDV_ID || ch == XDV_ID_OLD)) {
     MESG("DVI ID = %d\n", ch);
     ERROR(invalid_signature);
   } 
 
   post_id_byte = ch;
-  is_xdv = ch == XDV_ID;
+  is_xdv = (ch == XDV_ID || ch == XDV_ID_OLD);
   is_ptex = ch == DVIV_ID;
 
   /* Make sure post_post is really there */
@@ -388,7 +388,7 @@
     ERROR(invalid_signature);
   }
   ch = get_unsigned_byte(dvi_file);
-  if (!(ch == DVI_ID || ch == XDV_ID)) {
+  if (!(ch == DVI_ID || ch == XDV_ID || ch == XDV_ID_OLD)) {
     MESG("DVI ID = %d\n", ch);
     ERROR(invalid_signature);
   }
@@ -484,13 +484,13 @@
   
   /* An Ascii pTeX DVI file has id_byte DVI_ID in the preamble but DVIV_ID in the postamble. */
   ch = get_unsigned_byte(dvi_file);
-  if (!(ch == DVI_ID || ch == XDV_ID)) {
+  if (!(ch == DVI_ID || ch == XDV_ID || ch == XDV_ID_OLD)) {
     MESG("DVI ID = %d\n", ch);
     ERROR(invalid_signature);
   }
 
   pre_id_byte = ch;
-  is_xdv = ch == XDV_ID;
+  is_xdv = (ch == XDV_ID || ch == XDV_ID_OLD);
   is_ptex = ch == DVI_ID; /* maybe */
   
   dvi_info.unit_num = get_positive_quad(dvi_file, "DVI", "unit_num");
@@ -1648,7 +1648,7 @@
 }
 
 static void
-do_glyphs (void)
+do_glyphs (int do_actual_text)
 {
   struct loaded_font *font;
   spt_t  width, height, depth, *xloc, *yloc, glyph_width = 0;
@@ -1660,6 +1660,22 @@
 
   font  = &loaded_fonts[current_font];
 
+  if (do_actual_text) {
+    slen = (unsigned int) get_buffered_unsigned_pair();
+    if (lr_mode >= SKIMMING) {
+      for (i = 0; i < slen; i++) {
+        skip_bufferd_bytes(2);
+      }
+    } else {
+      uint16_t *unicodes = NEW(slen, uint16_t);
+      for (i = 0; i < slen; i++) {
+        unicodes[i] = (uint16_t) get_buffered_unsigned_pair();
+      }
+      pdf_dev_begin_actualtext (unicodes, slen);
+      RELEASE(unicodes);
+    }
+  }
+
   width = get_buffered_signed_quad();
 
   if (lr_mode >= SKIMMING) {
@@ -1739,6 +1755,10 @@
   RELEASE(xloc);
   RELEASE(yloc);
 
+  if (do_actual_text) {
+    pdf_dev_end_actualtext();
+  }
+
   if (lr_mode == LTYPESETTING)
     dvi_right(width);
 
@@ -1767,7 +1787,7 @@
   }
   skip_bytes(4, dvi_file);
   post_id_byte = get_unsigned_byte(dvi_file);
-  if (!(post_id_byte == DVI_ID || post_id_byte == DVIV_ID || post_id_byte == XDV_ID)) {
+  if (!(post_id_byte == DVI_ID || post_id_byte == DVIV_ID || post_id_byte == XDV_ID || post_id_byte == XDV_ID_OLD)) {
     MESG("DVI ID = %d\n", post_id_byte);
     ERROR(invalid_signature);
   }
@@ -1919,8 +1939,12 @@
     /* XeTeX extensions */
     case XDV_GLYPHS:
       need_XeTeX(opcode);
-      do_glyphs();
+      do_glyphs(0);
       break;
+    case XDV_TEXT_AND_GLYPHS:
+      need_XeTeX(opcode);
+      do_glyphs(1);
+      break;
     /* should not occur - processed during pre-scanning */
     case XDV_NATIVE_FONT_DEF:
       need_XeTeX(opcode);
@@ -2457,6 +2481,14 @@
       len = get_and_buffer_unsigned_pair(fp); /* glyph count */
       get_and_buffer_bytes(fp, len * 10);     /* 2 bytes ID + 8 bytes x,y-location per glyph */
       break;
+    case XDV_TEXT_AND_GLYPHS:
+      need_XeTeX(opcode);
+      len = get_and_buffer_unsigned_pair(fp); /* utf16 code unit count */
+      get_and_buffer_bytes(fp, len * 2);      /* 2 bytes per code unit */
+      get_and_buffer_bytes(fp, 4);            /* width */
+      len = get_and_buffer_unsigned_pair(fp); /* glyph count */
+      get_and_buffer_bytes(fp, len * 10);     /* 2 bytes ID + 8 bytes x,y-location per glyph */
+      break;
     case XDV_NATIVE_FONT_DEF:
       need_XeTeX(opcode);
       do_native_font_def(get_signed_quad(dvi_file));
Index: texk/dvipdfm-x/dvicodes.h
===================================================================
--- texk/dvipdfm-x/dvicodes.h	(revision 39834)
+++ texk/dvipdfm-x/dvicodes.h	(working copy)
@@ -100,7 +100,8 @@
                               one byte unsigned comment length followed by comment. */
 #define DVI_ID     2    /* ID Byte for current DVI file */
 #define DVIV_ID    3    /* with Ascii pTeX VW mode extension */
-#define XDV_ID     6    /* XeTeX ".xdv" output that uses XDV opcodes below */
+#define XDV_ID_OLD 6    /* older XeTeX ".xdv" output that does not have XDV_TEXT_AND_GLYPHS */
+#define XDV_ID     7    /* XeTeX ".xdv" output that uses XDV opcodes below */
 #define POST       248  /* Postamble- -- similar to preamble
                               four byte pointer to final bop
                               four byte numerator
@@ -122,9 +123,8 @@
                     /* XeTeX ".xdv" codes */
 #define XDV_NATIVE_FONT_DEF 252 /* fontdef for native platform font */
 #define XDV_GLYPHS          253 /* string of glyph IDs with X and Y positions */
+#define XDV_TEXT_AND_GLYPHS 254 /* like XDV_GLYPHS plus original Unicode text */
 
-#define NOT_USED            254 /* not used */
-
 #define PTEXDIR             255 /* Ascii pTeX DIR command */
 
 #endif /* _DVICODES_H_ */
Index: texk/dvipdfm-x/pdfdev.c
===================================================================
--- texk/dvipdfm-x/pdfdev.c	(revision 39834)
+++ texk/dvipdfm-x/pdfdev.c	(working copy)
@@ -1949,3 +1949,56 @@
 
   info->flags    = 0;
 }
+/* Open a /Span marked-content sequence (BDC) whose ActualText is the `count` UTF-16 code units in `unicodes`. */
+void
+pdf_dev_begin_actualtext (uint16_t *unicodes, int count)
+{
+  int len, i, pdf_doc_enc = 1;
+
+  /* Use PDFDocEncoding only if every unit is 0x00..0x7F or 0xA1..0xFF (we punt on 0x80..0xA0) and the
+     string does not start with FE FF, which a reader would take for the UTF-16BE marker; else UTF-16BE. */
+  for (i = 0; i < count; i++) {
+    if (unicodes[i] > 0xff || (unicodes[i] > 0x7f && unicodes[i] < 0xa1) || (i == 1 && unicodes[0] == 0xfe && unicodes[1] == 0xff)) {
+      pdf_doc_enc = 0;
+      break;
+    }
+  }
+
+  graphics_mode();
+
+  len = sprintf(work_buffer, "\n/Span<</ActualText(");
+  if (!pdf_doc_enc) {
+    len += sprintf(work_buffer + len, "\xFE\xFF"); /* UTF-16BE byte-order marker */
+  }
+  pdf_doc_add_page_content(work_buffer, len);
+
+  while (count-- > 0) {
+    unsigned char s[2] = { *unicodes >> 8, *unicodes & 0xff }; /* high byte first (big-endian) */
+    i = pdf_doc_enc; /* if using PDFDocEncoding, we only care about the low 8 bits,
+                        so start with the second byte of our pair */
+    len = 0;
+    for (; i < 2; i++) {
+      unsigned char c = s[i];
+      if (c == '(' || c == ')' || c == '\\') { /* string delimiters and backslash need a backslash escape */
+        len += sprintf(work_buffer + len, "\\%c", c);
+      } else if (c < ' ') { /* bytes below space are written as 3-digit octal escapes */
+        len += sprintf(work_buffer + len, "\\%03o", c);
+      } else {
+        len += sprintf(work_buffer + len, "%c", c);
+      }
+    }
+    pdf_doc_add_page_content(work_buffer, len); /* flush one code unit (at most 8 bytes) at a time */
+    ++unicodes;
+  }
+
+  len = sprintf(work_buffer, ")>>BDC"); /* close the string and dictionary, begin marked content */
+  pdf_doc_add_page_content(work_buffer, len);
+}
+
+void
+pdf_dev_end_actualtext (void) /* closes the marked-content span opened by pdf_dev_begin_actualtext */
+{
+  graphics_mode();
+
+  pdf_doc_add_page_content(" EMC", 4); /* EMC ends the marked-content sequence; 4 = length of the literal */
+}
Index: texk/dvipdfm-x/pdfdev.h
===================================================================
--- texk/dvipdfm-x/pdfdev.h	(revision 39834)
+++ texk/dvipdfm-x/pdfdev.h	(working copy)
@@ -221,4 +221,7 @@
 extern void   pdf_dev_push_coord(double xpos, double ypos);
 extern void   pdf_dev_pop_coord(void);
 
+extern void   pdf_dev_begin_actualtext (uint16_t *unicodes, int len); /* len = number of UTF-16 code units */
+extern void   pdf_dev_end_actualtext (void); /* (void): a real prototype, so stray arguments are diagnosed */
+
 #endif /* _PDFDEV_H_ */

--------------------------------------------------
Subscriptions, Archive, and List information, etc.:
  http://tug.org/mailman/listinfo/xetex

Reply via email to