Re: Displaying characters in user's locale

Gavin Smith Fri, 31 Jan 2014 13:34:31 -0800

On Fri, Jan 31, 2014 at 8:01 AM, Eli Zaretskii <e...@gnu.org> wrote:
>
> It is true that characters which cannot be encoded in the terminal's
> encoding should be replaced with something that still leaves the text
> legible.  However, many characters _can_ be encoded, and Info should
> use libiconv for those.  Moreover, the mere fact that a character
> cannot be represented should be taken from libiconv's output, rather
> than hard-coded in advance in Info's sources.  That way, we won't need
> any changes when/if there are terminals or encodings that don't exist
> today.
>
> IMO your UTF-8 related patch would be much more complete if it used
> libiconv as described above.
I've attached a patch which uses iconv as you suggested. I've tested
it with the two files attached under both utf-8 and iso8859-1 locales.
(I did this by, e.g. running "LANG=en_US.UTF8" to get a UTF-8
terminal.) I haven't been able to figure out how to get an ASCII-only
terminal yet.

diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c	2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c	2014-01-31 21:23:52.000000000 +0000
@@ -27,6 +27,9 @@
 #include "info-utils.h"
 #include "tag.h"
 
+#include <nl_types.h>
+#include <langinfo.h>
+#include <iconv.h>
 
 #if defined (HANDLE_MAN_PAGES)
 #  include "man.h"
@@ -42,6 +45,8 @@
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void set_file_lc_ctype (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +331,258 @@
   return file_buffer;
 }
 
+char *encoding_names[] = { "US-ASCII", "UTF-8", "ISO-8859-1", "ISO-8859-2",
+                       "ISO-8859-15", "koi8-r", "koi8-u", 0 };
+
+/* Look for local variables section in FB and set encoding */
+static void
+set_file_lc_ctype (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  char **encoding_name;
+
+  /* See if there is a local variables section in this info file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  fb->lc_ctype = ENC_UNKNOWN;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward ("coding:", &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += 7; /* Skip to after "coding:" */
+  enc_start += skip_whitespace(fb->contents + enc_start);
+  binding.start = enc_start;
+
+  search_forward ("\n", &binding, &enc_end);
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  strncpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+
+  for (encoding_name = encoding_names; *encoding_name != 0; encoding_name++)
+    if (!strcasecmp(enc_string, *encoding_name))
+      fb->lc_ctype = encoding_name - encoding_names;
+}
+
+/* The degrade functions read one character at *FROM and write out at
+   *TO a sequence of bytes representing that character in ASCII. *FROM
+   and *TO are both advanced past the read/written bytes 
+   Calling code assumes that replacement strings are no more than
+   4 characters. */
+
+struct encoding_replacement
+{
+  char *from_string;
+  char *to_string;
+};
+
+static void
+degrade_dummy (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  /* FIXME: Check if **to is in range 0x00 to 0x7F? */
+  **to = **from;
+  (*from)++; (*to)++;
+  (*from_left)--; (*to_left)--;
+}
+
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  struct encoding_replacement er[] = {
+  {"\xe2\x80\x98","'"}, /* Opening quote */
+  {"\xe2\x80\x99","'"}, /* Closing quote */
+  {0, 0}};
+
+  struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      if (!strncmp (*from, erp->from_string, strlen (erp->from_string)))
+        {
+          strncpy(*to, erp->to_string, strlen(erp->to_string));
+          *from      += strlen (erp->from_string);
+          *from_left -= strlen (erp->from_string);
+          *to      += strlen (erp->to_string);
+          *to_left -= strlen (erp->to_string);
+          return;
+        }
+    }
+
+  /* Failing this, just copy a byte across */
+  /* FIXME: Use SUB instead (^Z)? */
+  **to = **from;
+  (*from)++; (*to)++;
+  (*from_left)--; (*to_left)--;
+}
+
+/* Convert characters in the nodes for FB to the current locale */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *to_locale;
+
+  iconv_t iconv_state;
+  int iconv_available = 0;
+
+  void (*degrade_funcs[5])(char **, size_t *,
+                           char **, size_t *) = {
+    degrade_dummy, degrade_utf8, degrade_dummy,
+    degrade_dummy, degrade_dummy };
+
+  /* Function to use to convert file locale to ASCII */
+  void (*degrade)(char **, size_t *, char **, size_t *);
+
+  if (fb->lc_ctype == ENC_UNKNOWN) return;
+
+  /* Read environment locale */
+  to_locale = nl_langinfo(CODESET);
+
+  /* Don't degrade the contents if we are in fact
+   * in the right locale for the file */
+  if (!strcasecmp(to_locale, encoding_names[fb->lc_ctype]))
+    return;
+
+  degrade = degrade_funcs [fb->lc_ctype];
+
+  /* Check if an iconv conversion from file locale to system
+   * locale exists - if so we will try to use it. */
+  iconv_state = iconv_open (to_locale, encoding_names[fb->lc_ctype]);
+  if (iconv_state != (iconv_t) -1)
+    iconv_available = 1;
+
+  /* Return if no conversion function implemented */
+  if (!iconv_available && degrade == degrade_dummy) return;
+
+  /* Allocate space for the converted file buffer (including
+     terminating NULL). */
+  char *new_contents = xcalloc (1, fb->filesize + 1);
+  size_t new_contents_allocated = fb->filesize;
+  char *outbuf = new_contents;
+  size_t out_bytes_left = fb->filesize;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+
+  /* Convert sections of the file separated by node separators. These
+   * will be preambles, nodes, tag tables, or local variable sections.
+   * We convert all of them, although probably only the nodes need to
+   * be converted. 
+   * The second part of the condition makes us operate on the last
+   * section, which does not end with a node separator. */
+  while ((nextnode = find_node_separator (&binding)) != -1
+    || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      char *inbuf;
+      size_t inbytesleft;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      /* Convert characters from node to nextnode */
+      inbuf = binding.buffer + node;
+      inbytesleft = nextnode - node;
+
+      while (inbuf < binding.buffer + nextnode)
+        {
+          int out_offset; /* Only used when reallocating */
+
+          if (iconv_available)
+            {
+              while (1)
+                {
+                  size_t iconv_ret;
+
+                  iconv_ret = iconv (iconv_state, &inbuf, &inbytesleft,
+                           &outbuf, &out_bytes_left);
+
+                  if (iconv_ret != (size_t) -1)
+                    {
+                      /* Success */
+                      /* iconv_close (iconv_state); */
+                      goto continue_node_loop;
+                    }
+                
+                  /* There's been an error while converting. */
+                  switch (errno)
+                    {
+                    case E2BIG:
+                      /* Ran out of space in output buffer. Reallocate and
+                       * try again. */
+                      out_offset = outbuf - new_contents;
+                      new_contents_allocated *= 2;
+                      new_contents = xrealloc(new_contents,
+                                        new_contents_allocated);
+
+                      /* Update outbuf */
+                      outbuf = new_contents + out_offset;
+                      out_bytes_left = new_contents_allocated - out_offset;
+
+                      continue;
+                    case EILSEQ:
+                      /* Byte sequence in input buffer not recognized. Degrade
+                       * to ASCII instead.
+                       * (FIXME: Check that output encoding
+                       * is backwards compatible with ASCII). */
+                      goto degrade_to_ascii;
+                    case EINVAL:
+                      /* Incomplete byte sequence at end of input buffer */
+                      goto degrade_to_ascii;
+                    default: /* Unknown error - abort */
+                      return;
+                    }
+                }
+            }
+        degrade_to_ascii:
+          /* Make sure that there is enough space to write
+           * replacement string. 4 bytes should be enough for one
+           * character */
+          if (out_bytes_left <= 4)
+            {
+              out_offset = outbuf - new_contents;
+              new_contents_allocated *= 2;
+              new_contents = xrealloc(new_contents,
+                                new_contents_allocated);
+
+              /* Update outbuf */
+              outbuf = new_contents + out_offset;
+              out_bytes_left = new_contents_allocated - out_offset;
+            }
+
+          degrade(&inbuf, &inbytesleft, &outbuf, &out_bytes_left);
+        }
+    continue_node_loop:
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  if (iconv_available)
+    iconv_close (iconv_state);
+  free(fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outbuf - new_contents;
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the
    default behaviour when info_load_file () is called, but it is not
@@ -397,7 +654,14 @@
   file_buffer->contents = contents;
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
+
+  /* Find encoding of file, if set */
+  set_file_lc_ctype(file_buffer);
   
+  /* Convert characters in file buffer to current locale as much
+   * as possible. */
+  convert_characters (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h	2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h	2014-01-31 21:28:26.000000000 +0000
@@ -72,6 +72,7 @@
 #define TAGS_TABLE_BEG_LABEL            "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL       "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL    "(Indirect)"
+#define LOCAL_VARIABLES_LABEL		"Local Variables"
 
 /* Character constants. */
 #define INFO_COOKIE '\037'
@@ -112,7 +113,16 @@
   TAG **tags;                   /* If non-null, the indirect tags table. */
   size_t tags_slots;            /* Number of slots allocated for TAGS. */
   int flags;                    /* Various flags.  Mimics of N_* flags. */
+  int lc_ctype;                 /* Encoding - index into encoding_names */
 } FILE_BUFFER;
+
+/* Null-terminated array of strings naming character encodings that Info
+   files could be encoded in. */
+extern char *encoding_names[];
+
+/* Value of FILE_BUFFER.lc_ctype if encoding is unknown */
+#define ENC_UNKNOWN     -1
+
 
 /* Externally visible functions.  */

iso8859_1.info
Description: Binary data

utf8.info
Description: Binary data

Re: Displaying characters in user's locale

Reply via email to