On Fri, Jan 31, 2014 at 8:01 AM, Eli Zaretskii <[email protected]> wrote:
>
> It is true that characters which cannot be encoded in the terminal's
> encoding should be replaced with something that still leaves the text
> legible. However, many characters _can_ be encoded, and Info should
> use libiconv for those. Moreover, the mere fact that a character
> cannot be represented should be taken from libiconv's output, rather
> than hard-coded in advance in Info's sources. That way, we won't need
> any changes when/if there are terminals or encodings that don't exist
> today.
>
> IMO your UTF-8 related patch would be much more complete if it used
> libiconv as described above.

I've attached a patch which uses iconv as you suggested. I've tested it with the two files attached under both utf-8 and iso8859-1 locales. (I did this by, e.g., running "LANG=en_US.UTF8" to get a UTF-8 terminal.) I haven't been able to figure out how to get an ASCII-only terminal yet.
diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c 2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c 2014-01-31 21:23:52.000000000 +0000
@@ -27,6 +27,9 @@
#include "info-utils.h"
#include "tag.h"
+#include <nl_types.h>
+#include <langinfo.h>
+#include <iconv.h>
#if defined (HANDLE_MAN_PAGES)
# include "man.h"
@@ -42,6 +45,8 @@
SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
static char *adjust_nodestart (NODE *node, int min, int max);
+static void set_file_lc_ctype (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +331,258 @@
return file_buffer;
}
+/* Names of the character encodings recognized in an Info file's
+   "coding:" entry.  FILE_BUFFER.lc_ctype holds an index into this
+   array, and the selected name is handed to iconv_open.  The list is
+   terminated by a null pointer.  */
+char *encoding_names[] = {
+  "US-ASCII",
+  "UTF-8",
+  "ISO-8859-1",
+  "ISO-8859-2",
+  "ISO-8859-15",
+  "koi8-r",
+  "koi8-u",
+  0
+};
+
+/* Look for a local variables section near the end of FB and record the
+   encoding it declares.
+
+   FB->lc_ctype is set to the index in encoding_names of the name that
+   follows "coding:" (compared case-insensitively), or to ENC_UNKNOWN
+   when there is no local variables section, no "coding:" entry, or
+   the name is not listed in encoding_names.  */
+static void
+set_file_lc_ctype (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+  long int enc_start, enc_end;
+  size_t enc_len;
+  char *enc_string;
+  char **encoding_name;
+
+  fb->lc_ctype = ENC_UNKNOWN;
+
+  /* See if there is a local variables section in this info file.
+     Search backwards from the end of the file, over at most the last
+     1000 bytes.  */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  /* From the label onwards, look for the "coding:" entry.  */
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward ("coding:", &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += 7; /* Skip to after "coding:" */
+  enc_start += skip_whitespace (fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The name runs to the end of the line.  If the file ends without a
+     newline, take everything up to the end of the file instead of
+     using an unset ENC_END.  */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+  if (enc_end < enc_start)
+    return;
+
+  /* Drop trailing blanks and a carriage return so that they do not
+     prevent an otherwise valid name from matching.  */
+  enc_len = enc_end - enc_start;
+  while (enc_len > 0
+         && (fb->contents[enc_start + enc_len - 1] == ' '
+             || fb->contents[enc_start + enc_len - 1] == '\t'
+             || fb->contents[enc_start + enc_len - 1] == '\r'))
+    enc_len--;
+
+  enc_string = xmalloc (enc_len + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_len);
+  enc_string[enc_len] = '\0';
+
+  for (encoding_name = encoding_names; *encoding_name != 0; encoding_name++)
+    if (!strcasecmp (enc_string, *encoding_name))
+      {
+        fb->lc_ctype = encoding_name - encoding_names;
+        break;
+      }
+
+  free (enc_string);
+}
+
+/* The degrade functions read one character at *FROM and write out at
+ *TO a sequence of bytes representing that character in ASCII. *FROM
+ and *TO are both advanced past the read/written bytes, and the
+ remaining-byte counts *FROM_LEFT and *TO_LEFT are reduced by the
+ same amounts.
+ Calling code assumes that replacement strings are no more than
+ 4 characters. */
+
+/* One entry of a degrade function's replacement table.  A table is an
+ array of these, ended by an entry whose FROM_STRING is null. */
+struct encoding_replacement
+{
+ char *from_string; /* Byte sequence to look for in the input */
+ char *to_string; /* Bytes written to the output in its place */
+};
+
+/* Degrade function that performs no replacement: the byte at *FROM is
+   copied unchanged to *TO.  Both pointers are stepped forward by one
+   and both remaining-byte counts are reduced by one.  */
+static void
+degrade_dummy (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  /* FIXME: Check if **to is in range 0x00 to 0x7F? */
+  char byte = **from;
+
+  **to = byte;
+
+  ++*from;
+  ++*to;
+
+  --*from_left;
+  --*to_left;
+}
+
+/* Degrade function for UTF-8 input.
+
+   If the bytes at *FROM start with one of the sequences in the
+   replacement table below, write the ASCII replacement at *TO and
+   step past the whole sequence.  Otherwise copy a single byte across
+   unchanged.  *FROM, *TO, *FROM_LEFT and *TO_LEFT are updated to
+   reflect the bytes read and written.  The caller must leave room at
+   *TO for the longest replacement string.  */
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  /* Built once rather than on every call.  */
+  static const struct encoding_replacement er[] = {
+    {"\xe2\x80\x98", "'"}, /* Opening quote (U+2018) */
+    {"\xe2\x80\x99", "'"}, /* Closing quote (U+2019) */
+    {0, 0}
+  };
+
+  const struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      size_t from_len = strlen (erp->from_string);
+      size_t to_len = strlen (erp->to_string);
+
+      /* Do not examine bytes beyond the *FROM_LEFT bytes of input
+         that remain.  */
+      if (from_len > *from_left)
+        continue;
+
+      if (memcmp (*from, erp->from_string, from_len) == 0)
+        {
+          memcpy (*to, erp->to_string, to_len);
+          *from += from_len;
+          *from_left -= from_len;
+          *to += to_len;
+          *to_left -= to_len;
+          return;
+        }
+    }
+
+  /* Failing this, just copy a byte across */
+  /* FIXME: Use SUB instead (^Z)? */
+  **to = **from;
+  (*from)++;
+  (*to)++;
+  (*from_left)--;
+  (*to_left)--;
+}
+
+/* Double the size of the conversion output buffer *BUF, which currently
+   has *ALLOCATED bytes.  *OUT is moved so that it keeps its offset in
+   the reallocated buffer, and *LEFT is set to the number of bytes
+   free after *OUT, holding one byte back for a terminating NUL.  */
+static void
+grow_conversion_buffer (char **buf, size_t *allocated,
+                        char **out, size_t *left)
+{
+  size_t used = *out - *buf;
+
+  *allocated *= 2;
+  *buf = xrealloc (*buf, *allocated);
+  *out = *buf + used;
+  *left = *allocated - used - 1;
+}
+
+/* Convert characters in the nodes for FB to the current locale.
+
+   The source encoding is encoding_names[FB->lc_ctype]; the target is
+   the codeset reported by nl_langinfo.  iconv is used when it offers
+   a conversion between the two.  Input that iconv rejects, or all
+   input when iconv has no such conversion, goes through the degrade
+   function for the source encoding one character at a time.
+
+   On success FB->contents is freed and replaced by a newly allocated,
+   NUL-terminated buffer and FB->filesize is set to the converted
+   length.  FB is left untouched when its encoding is unknown, when it
+   already matches the locale, when no conversion is implemented, or
+   when iconv fails with an unexpected error.  */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+  /* Degrade function for each entry of encoding_names, in the same
+     order.  */
+  static void (*const degrade_funcs[]) (char **, size_t *,
+                                        char **, size_t *) = {
+    degrade_dummy,  /* US-ASCII */
+    degrade_utf8,   /* UTF-8 */
+    degrade_dummy,  /* ISO-8859-1 */
+    degrade_dummy,  /* ISO-8859-2 */
+    degrade_dummy,  /* ISO-8859-15 */
+    degrade_dummy,  /* koi8-r */
+    degrade_dummy   /* koi8-u */
+  };
+  const size_t n_degrade_funcs
+    = sizeof degrade_funcs / sizeof degrade_funcs[0];
+
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *to_locale;
+
+  iconv_t iconv_state;
+  int iconv_available = 0;
+
+  /* Function to use to convert file locale to ASCII */
+  void (*degrade) (char **, size_t *, char **, size_t *);
+
+  char *new_contents;
+  size_t new_contents_allocated;
+  char *outbuf;
+  size_t out_bytes_left;
+
+  if (fb->lc_ctype == ENC_UNKNOWN)
+    return;
+
+  /* Read environment locale */
+  to_locale = nl_langinfo (CODESET);
+
+  /* Don't degrade the contents if we are in fact
+     in the right locale for the file */
+  if (!strcasecmp (to_locale, encoding_names[fb->lc_ctype]))
+    return;
+
+  /* Never index past the end of the table, even if encoding_names
+     gains entries that have no degrade function yet.  */
+  if (fb->lc_ctype < 0 || (size_t) fb->lc_ctype >= n_degrade_funcs)
+    degrade = degrade_dummy;
+  else
+    degrade = degrade_funcs[fb->lc_ctype];
+
+  /* Check if an iconv conversion from file locale to system
+     locale exists - if so we will try to use it. */
+  iconv_state = iconv_open (to_locale, encoding_names[fb->lc_ctype]);
+  if (iconv_state != (iconv_t) -1)
+    iconv_available = 1;
+
+  /* Return if no conversion function implemented */
+  if (!iconv_available && degrade == degrade_dummy)
+    return;
+
+  /* Allocate space for the converted file buffer.  One byte is always
+     kept in reserve for the terminating NUL.  */
+  new_contents_allocated = fb->filesize + 1;
+  new_contents = xcalloc (1, new_contents_allocated);
+  outbuf = new_contents;
+  out_bytes_left = new_contents_allocated - 1;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+
+  /* Convert sections of the file separated by node separators.  These
+     will be preambles, nodes, tag tables, or local variable sections.
+     We convert all of them, although probably only the nodes need to
+     be converted.
+     The second part of the condition makes us operate on the last
+     section, which does not end with a node separator. */
+  while ((nextnode = find_node_separator (&binding)) != -1
+         || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      /* Convert characters from node to nextnode */
+      char *inbuf = binding.buffer + node;
+      size_t inbytesleft = nextnode - node;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      while (inbuf < binding.buffer + nextnode)
+        {
+          if (iconv_available)
+            {
+              size_t iconv_ret;
+
+              iconv_ret = iconv (iconv_state, &inbuf, &inbytesleft,
+                                 &outbuf, &out_bytes_left);
+
+              if (iconv_ret != (size_t) -1)
+                break; /* Success: the whole section was converted */
+
+              /* There's been an error while converting. */
+              if (errno == E2BIG)
+                {
+                  /* Ran out of space in output buffer.  Reallocate
+                     and try again. */
+                  grow_conversion_buffer (&new_contents,
+                                          &new_contents_allocated,
+                                          &outbuf, &out_bytes_left);
+                  continue;
+                }
+
+              if (errno != EILSEQ && errno != EINVAL)
+                {
+                  /* Unknown error - abort, leaving FB as it was and
+                     releasing what was allocated here.  */
+                  iconv_close (iconv_state);
+                  free (new_contents);
+                  return;
+                }
+
+              /* EILSEQ: byte sequence in input buffer not recognized.
+                 EINVAL: incomplete byte sequence at end of input.
+                 Either way, degrade one character to ASCII below and
+                 then let iconv carry on with the rest.
+                 (FIXME: Check that output encoding
+                 is backwards compatible with ASCII). */
+            }
+
+          /* Make sure that there is enough space to write the
+             replacement string.  4 bytes should be enough for one
+             character.  Loop, since doubling a very small buffer
+             once may not be enough.  */
+          while (out_bytes_left <= 4)
+            grow_conversion_buffer (&new_contents, &new_contents_allocated,
+                                    &outbuf, &out_bytes_left);
+
+          degrade (&inbuf, &inbytesleft, &outbuf, &out_bytes_left);
+        }
+
+      /* NOTE(review): any bytes skipped by skip_whitespace here are not
+         copied to the output, and on the last section this reads at
+         buffer + filesize - presumably the contents are
+         NUL-terminated; confirm against the file-reading code.  */
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  if (iconv_available)
+    iconv_close (iconv_state);
+
+  /* A byte was held in reserve for this throughout.  */
+  *outbuf = '\0';
+
+  free (fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outbuf - new_contents;
+}
+
/* The workhorse function for info_load_file (). Non-zero second argument
says to build a list of tags (or nodes) for this file. This is the
default behaviour when info_load_file () is called, but it is not
@@ -397,7 +654,14 @@
file_buffer->contents = contents;
if (compressed)
file_buffer->flags |= N_IsCompressed;
+
+ /* Find encoding of file, if set */
+ set_file_lc_ctype(file_buffer);
+ /* Convert characters in file buffer to current locale as much
+ * as possible. */
+ convert_characters (file_buffer);
+
/* If requested, build the tags and nodes for this file buffer. */
if (get_tags)
build_tags_and_nodes (file_buffer);
diff -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h 2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h 2014-01-31 21:28:26.000000000 +0000
@@ -72,6 +72,7 @@
#define TAGS_TABLE_BEG_LABEL "Tag Table:\n"
#define INDIRECT_TAGS_TABLE_LABEL "Indirect:\n"
#define TAGS_TABLE_IS_INDIRECT_LABEL "(Indirect)"
+#define LOCAL_VARIABLES_LABEL "Local Variables"
/* Character constants. */
#define INFO_COOKIE '\037'
@@ -112,7 +113,16 @@
TAG **tags; /* If non-null, the indirect tags table. */
size_t tags_slots; /* Number of slots allocated for TAGS. */
int flags; /* Various flags. Mimics of N_* flags. */
+ int lc_ctype; /* Encoding - index into encoding_names */
} FILE_BUFFER;
+
+/* Null-terminated array of strings naming character encodings that Info
+ files could be encoded in. */
+extern char *encoding_names[];
+
+/* Value of FILE_BUFFER.lc_ctype if encoding is unknown.  The negative
+   value is parenthesized so the macro is safe in any expression.  */
+#define ENC_UNKNOWN (-1)
+
/* Externally visible functions. */
iso8859_1.info
Description: Binary data
utf8.info
Description: Binary data
